Spaces:

Noumida
/

indic-lid_trans2

Sleeping

App Files Files Community

Noumida committed on Aug 31

Commit

b7caabd

verified ·

1 Parent(s): 680f904

Update app.py

Browse files

Files changed (1) hide show

app.py +194 -169

app.py CHANGED Viewed

@@ -3,31 +3,43 @@ import torch
 import sys
 import os
 from pathlib import Path
-# Run setup if models don't exist
 if not os.path.exists("ai4bharat/IndicLID.py"):
-    print("Setting up models...")
-    exec(open("setup_models.py").read())
-# Now import everything
 from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
 from IndicTransToolkit.processor import IndicProcessor
-# Add current directory to path
-sys.path.append(os.getcwd())
-# Import IndicLID
 try:
     from ai4bharat.IndicLID import IndicLID
     INDICLID_AVAILABLE = True
 except ImportError as e:
-    print(f"IndicLID import failed: {e}")
     INDICLID_AVAILABLE = False
 # Device setup
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-# Language mapping
 LID_TO_TRANS2_MAPPING = {
     'hindi': 'hin_Deva',
     'bengali': 'ben_Beng',
@@ -44,39 +56,28 @@ LID_TO_TRANS2_MAPPING = {
     'assamese': 'asm_Beng',
     'kashmiri': 'kas_Arab',
     'sindhi': 'snd_Arab',
-    'sanskrit': 'san_Deva'
-}
-# Manual language options for fallback
-MANUAL_LANGUAGES = {
-    "Auto-detect": None,
-    "Hindi": "hin_Deva",
-    "Bengali": "ben_Beng",
-    "Tamil": "tam_Taml",
-    "Telugu": "tel_Telu",
-    "Gujarati": "guj_Gujr",
-    "Kannada": "kan_Knda",
-    "Malayalam": "mal_Mlym",
-    "Marathi": "mar_Deva",
-    "Punjabi": "pan_Guru",
-    "Urdu": "urd_Arab"
 }
-# Global variables
 lid_model = None
 translation_model = None
 tokenizer = None
 ip = None
-model_loading_status = "Not loaded"
 def load_models():
-    global lid_model, translation_model, tokenizer, ip, model_loading_status
     try:
-        model_loading_status = "Loading IndicTrans2..."
-        # Load IndicTrans2 first (more reliable)
-        print("Loading IndicTrans2...")
         model_name = "ai4bharat/indictrans2-indic-en-1B"
         tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
         translation_model = AutoModelForSeq2SeqLM.from_pretrained(
@@ -86,197 +87,221 @@ def load_models():
         ).to(device)
         ip = IndicProcessor(inference=True)
-        print("✅ IndicTrans2 loaded successfully")
-        # Try to load IndicLID
-        if INDICLID_AVAILABLE:
-            model_loading_status = "Loading IndicLID..."
-            print("Loading IndicLID...")
-            lid_model = IndicLID(input_threshold=0.5, roman_lid_threshold=0.6)
-            print("✅ IndicLID loaded successfully")
-            model_loading_status = "✅ All models loaded!"
-        else:
-            model_loading_status = "✅ IndicTrans2 loaded (manual language selection)"
-        return model_loading_status
     except Exception as e:
-        model_loading_status = f"❌ Error: {str(e)}"
-        return model_loading_status
-def translate_text(input_text, source_language="Auto-detect"):
-    global lid_model, translation_model, tokenizer, ip
-    if not translation_model:
-        return "❌ Translation model not loaded. Please wait...", "", 0.0
     if not input_text.strip():
-        return "Please enter text to translate.", "", 0.0
     try:
-        detected_lang = "unknown"
-        confidence = 0.0
-        src_lang_code = None
-        # Language identification
-        if source_language == "Auto-detect" and lid_model:
-            # Use IndicLID
-            lid_result = lid_model.batch_predict([input_text])
-            detected_lang = lid_result[0]['langinfo']['text_lang']
-            confidence = lid_result[0]['langinfo']['text_lang_score']
             if detected_lang in LID_TO_TRANS2_MAPPING:
                 src_lang_code = LID_TO_TRANS2_MAPPING[detected_lang]
-            else:
-                return f"Detected language '{detected_lang}' not supported", detected_lang.title(), confidence
-        elif source_language != "Auto-detect":
-            # Manual language selection
-            src_lang_code = MANUAL_LANGUAGES[source_language]
-            detected_lang = source_language.lower()
-            confidence = 1.0
-        else:
-            return "❌ Please select a source language (IndicLID not available)", "", 0.0
-        if not src_lang_code:
-            return "❌ Could not determine source language", detected_lang, confidence
-        # Skip if already English
-        if detected_lang == 'english':
-            return input_text, "English", confidence
-        # Translation
-        target_lang_code = "eng_Latn"
-        # Preprocess
-        batch = ip.preprocess_batch(
-            [input_text],
-            src_lang=src_lang_code,
-            tgt_lang=target_lang_code
-        )
-        # Tokenize
-        inputs = tokenizer(
-            batch,
-            truncation=True,
-            padding="longest",
-            return_tensors="pt",
-            return_attention_mask=True
-        ).to(device)
-        # Generate
-        with torch.no_grad():
-            generated_tokens = translation_model.generate(
-                **inputs,
-                use_cache=True,
-                min_length=0,
-                max_length=256,
-                num_beams=5,
-                num_return_sequences=1
-            )
-        # Decode
-        decoded = tokenizer.batch_decode(
-            generated_tokens,
-            skip_special_tokens=True,
-            clean_up_tokenization_spaces=True
-        )
-        # Postprocess
-        translations = ip.postprocess_batch(decoded, lang=target_lang_code)
-        translation = translations[0]
         return translation, detected_lang.title(), confidence
     except Exception as e:
-        return f"❌ Error: {str(e)}", "", 0.0
-# Create interface
 def create_interface():
-    with gr.Blocks(title="Indic Language Translator", theme=gr.themes.Soft()) as demo:
         gr.Markdown("""
-        # 🌍 Indic Language Translator
-        **Powered by IndicLID + IndicTrans2**
-        Translate Indian languages to English with automatic language detection.
         """)
         # Status display
         status_display = gr.Textbox(
-            value="Loading models...",
-            label="Status",
-            interactive=False
         )
         with gr.Row():
-            with gr.Column(scale=2):
                 input_text = gr.Textbox(
-                    label="Input Text",
-                    placeholder="Enter text in any Indian language...",
-                    lines=5
                 )
-                source_lang = gr.Dropdown(
-                    choices=list(MANUAL_LANGUAGES.keys()),
-                    value="Auto-detect",
-                    label="Source Language"
                 )
-                translate_btn = gr.Button("🔄 Translate", variant="primary", size="lg")
-            with gr.Column(scale=2):
-                output_text = gr.Textbox(
-                    label="English Translation",
-                    lines=5,
-                    interactive=False
                 )
                 with gr.Row():
-                    detected_lang = gr.Textbox(
-                        label="Detected Language",
                         interactive=False,
-                        scale=2
                     )
-                    confidence = gr.Number(
-                        label="Confidence",
                         interactive=False,
-                        scale=1
                     )
-        # Examples
         gr.Examples(
             examples=[
-                ["मैं आज बाजार जा रहा हूं।", "Auto-detect"],
-                ["আমি আজ বাজারে যাচ্ছি।", "Bengali"],
-                ["நான் இன்று சந்தைக்கு போகிறேன்।", "Tamil"],
             ],
-            inputs=[input_text, source_lang],
         )
-        # Event handlers
         translate_btn.click(
-            fn=translate_text,
-            inputs=[input_text, source_lang],
-            outputs=[output_text, detected_lang, confidence]
         )
         input_text.submit(
-            fn=translate_text,
-            inputs=[input_text, source_lang],
-            outputs=[output_text, detected_lang, confidence]
         )
-        # Load models and update status
-        def update_status():
-            status = load_models()
-            return status
-        demo.load(update_status, outputs=[status_display])
     return demo
 if __name__ == "__main__":
     demo = create_interface()
-    demo.launch()

 import sys
 import os
 from pathlib import Path
+import warnings
+warnings.filterwarnings("ignore")
+# Setup IndicLID if not already done
 if not os.path.exists("ai4bharat/IndicLID.py"):
+    print("🚀 Setting up IndicLID for the first time...")
+    exec(open("setup_indiclid.py").read())
+# Import torch safe globals first
+try:
+    exec(open("torch_safe_globals.py").read())
+    print("✅ Torch safe globals loaded")
+except:
+    print("⚠️ Could not load torch safe globals")
+# Add current directory to Python path
+sys.path.insert(0, os.getcwd())
+# Import required libraries
 from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
 from IndicTransToolkit.processor import IndicProcessor
+# Import IndicLID - This is crucial for automatic language detection
 try:
     from ai4bharat.IndicLID import IndicLID
     INDICLID_AVAILABLE = True
+    print("✅ IndicLID imported successfully - Automatic language detection enabled")
 except ImportError as e:
+    print(f"❌ IndicLID import failed: {e}")
     INDICLID_AVAILABLE = False
+    raise Exception("IndicLID is required for automatic language detection!")
 # Device setup
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+print(f"🔧 Using device: {device}")
+# Language mapping from IndicLID output to IndicTrans2 codes
 LID_TO_TRANS2_MAPPING = {
     'hindi': 'hin_Deva',
     'bengali': 'ben_Beng',
     'assamese': 'asm_Beng',
     'kashmiri': 'kas_Arab',
     'sindhi': 'snd_Arab',
+    'sanskrit': 'san_Deva',
+    'english': 'eng_Latn'
 }
+# Global model variables
 lid_model = None
 translation_model = None
 tokenizer = None
 ip = None
 def load_models():
+    """Load both IndicLID (for detection) and IndicTrans2 (for translation)"""
+    global lid_model, translation_model, tokenizer, ip
     try:
+        # Step 1: Load IndicLID for automatic language detection
+        print("🔍 Loading IndicLID for automatic language detection...")
+        lid_model = IndicLID(input_threshold=0.5, roman_lid_threshold=0.6)
+        print("✅ IndicLID loaded successfully - Ready for automatic language detection!")
+        # Step 2: Load IndicTrans2 for translation
+        print("🔄 Loading IndicTrans2 for translation...")
         model_name = "ai4bharat/indictrans2-indic-en-1B"
         tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
         translation_model = AutoModelForSeq2SeqLM.from_pretrained(
         ).to(device)
         ip = IndicProcessor(inference=True)
+        print("✅ IndicTrans2 loaded successfully - Ready for translation!")
+        return "✅ Both models loaded successfully!\n🔍 IndicLID: Automatic language detection\n🔄 IndicTrans2: Translation to English"
     except Exception as e:
+        error_msg = f"❌ Error loading models: {str(e)}"
+        print(error_msg)
+        return error_msg
+def automatic_detect_and_translate(input_text):
+    """
+    Main function: Automatic language detection using IndicLID + Translation using IndicTrans2
+    This is the core pipeline you requested
+    """
+    if not all([lid_model, translation_model, tokenizer, ip]):
+        return "❌ Models not loaded. Please wait for initialization.", "", 0.0
     if not input_text.strip():
+        return "Please enter text for automatic detection and translation.", "", 0.0
     try:
+        # STEP 1: AUTOMATIC LANGUAGE DETECTION USING INDICLID
+        print(f"🔍 Detecting language for: {input_text[:50]}...")
+        lid_result = lid_model.batch_predict([input_text])
+        # Extract language detection results
+        detected_lang = lid_result[0]['langinfo']['text_lang']
+        confidence = lid_result[0]['langinfo']['text_lang_score']
+        print(f"✅ IndicLID detected: {detected_lang} (confidence: {confidence:.3f})")
+        # STEP 2: TRANSLATION USING INDICTRANS2 (if not English)
+        if detected_lang.lower() == 'english':
+            translation = input_text
+            print("ℹ️ Text is already in English, no translation needed")
+        else:
+            # Check if detected language is supported by IndicTrans2
             if detected_lang in LID_TO_TRANS2_MAPPING:
                 src_lang_code = LID_TO_TRANS2_MAPPING[detected_lang]
+                target_lang_code = "eng_Latn"
+                print(f"🔄 Translating from {src_lang_code} to {target_lang_code}...")
+                # Preprocess for IndicTrans2
+                batch = ip.preprocess_batch(
+                    [input_text],
+                    src_lang=src_lang_code,
+                    tgt_lang=target_lang_code
+                )
+                # Tokenize
+                inputs = tokenizer(
+                    batch,
+                    truncation=True,
+                    padding="longest",
+                    return_tensors="pt",
+                    return_attention_mask=True
+                ).to(device)
+                # Generate translation
+                with torch.no_grad():
+                    generated_tokens = translation_model.generate(
+                        **inputs,
+                        use_cache=True,
+                        min_length=0,
+                        max_length=256,
+                        num_beams=5,
+                        num_return_sequences=1
+                    )
+                # Decode translation
+                decoded = tokenizer.batch_decode(
+                    generated_tokens,
+                    skip_special_tokens=True,
+                    clean_up_tokenization_spaces=True
+                )
+                # Postprocess
+                translations = ip.postprocess_batch(decoded, lang=target_lang_code)
+                translation = translations[0]
+                print(f"✅ Translation completed: {translation}")
+            else:
+                translation = f"❌ Language '{detected_lang}' not supported for translation"
+                print(f"⚠️ {translation}")
         return translation, detected_lang.title(), confidence
     except Exception as e:
+        error_msg = f"❌ Error in detection/translation pipeline: {str(e)}"
+        print(error_msg)
+        return error_msg, "", 0.0
 def create_interface():
+    """Create Gradio interface focused on automatic IndicLID detection + IndicTrans2 translation"""
+    with gr.Blocks(
+        title="IndicLID → IndicTrans2 Pipeline",
+        theme=gr.themes.Soft()
+    ) as demo:
         gr.Markdown("""
+        # 🔍➡️🔄 Automatic Language Detection + Translation
+        **Complete Pipeline: IndicLID → IndicTrans2**
+        1. **🔍 IndicLID**: Automatically detects your input language
+        2. **🔄 IndicTrans2**: Translates to English based on detected language
+        **No manual language selection needed!** Just paste your text and get automatic detection + translation.
         """)
         # Status display
         status_display = gr.Textbox(
+            value="🚀 Loading IndicLID and IndicTrans2 models...",
+            label="🔧 Pipeline Status",
+            interactive=False,
+            lines=3
         )
         with gr.Row():
+            with gr.Column(scale=1):
                 input_text = gr.Textbox(
+                    label="📝 Input Text (Any Indian Language)",
+                    placeholder="Enter text in Hindi, Bengali, Tamil, Telugu, Gujarati, Kannada, Malayalam, Marathi, Punjabi, Urdu, etc...\n\nIndicLID will automatically detect the language!",
+                    lines=6,
+                    max_lines=10
                 )
+                translate_btn = gr.Button(
+                    "🔍➡️🔄 Auto-Detect & Translate",
+                    variant="primary",
+                    size="lg"
                 )
+            with gr.Column(scale=1):
+                translation_output = gr.Textbox(
+                    label="🇬🇧 English Translation",
+                    lines=6,
+                    max_lines=10,
+                    interactive=False,
+                    placeholder="Automatic translation will appear here..."
                 )
                 with gr.Row():
+                    detected_language = gr.Textbox(
+                        label="🌐 Auto-Detected Language",
                         interactive=False,
+                        scale=2,
+                        placeholder="Language will be detected automatically"
                     )
+                    confidence_score = gr.Number(
+                        label="📊 Detection Confidence",
                         interactive=False,
+                        scale=1,
+                        precision=3
                     )
+        # Examples showcasing automatic detection
+        gr.Markdown("### 📖 Try These Examples (Automatic Detection!):")
         gr.Examples(
             examples=[
+                ["मैं आज बाजार जा रहा हूं।"],  # Hindi
+                ["আমি আজ বাজারে যাচ্ছি।"],         # Bengali
+                ["நான் இன்று சந்தைக்கு போகிறேன்।"],  # Tamil
+                ["ನಾನು ಇಂದು ಮಾರುಕಟ್ಟೆಗೆ ಹೋಗುತ್ತಿದ್ದೇನೆ।"], # Kannada
+                ["હું આજે બજારમાં જાઉં છું।"],        # Gujarati
+                ["मी आज बाजारात जात आहे।"],         # Marathi
+                ["میں آج بازار جا رہا ہوں۔"],        # Urdu
+                ["ਮੈਂ ਅੱਜ ਬਾਜ਼ਾਰ ਜਾ ਰਿਹਾ ਹਾਂ।"],      # Punjabi
+                ["నేను ఈరోజు మార్కెట్‌కి వెళ్తున్నాను।"], # Telugu
+                ["ഞാൻ ഇന്ന് മാർക്കറ്റിൽ പോകുന്നു।"]   # Malayalam
             ],
+            inputs=[input_text],
+            label="Click any example to test automatic detection!"
         )
+        # Information about supported languages
+        gr.Markdown("""
+        ### 🌐 Supported Languages for Auto-Detection:
+        **IndicLID can automatically detect:** Hindi, Bengali, Tamil, Telugu, Gujarati, Kannada, Malayalam, Marathi,
+        Punjabi, Urdu, Odia, Assamese, Nepali, Kashmiri, Sindhi, Sanskrit, and English.
+        ### ✨ How it works:
+        1. You paste text in **any** supported Indian language
+        2. **IndicLID** automatically identifies the language (no manual selection!)
+        3. **IndicTrans2** translates it to English based on the detected language
+        """)
+        # Event handlers for automatic detection + translation
         translate_btn.click(
+            fn=automatic_detect_and_translate,
+            inputs=[input_text],
+            outputs=[translation_output, detected_language, confidence_score]
         )
+        # Auto-submit on Enter key
         input_text.submit(
+            fn=automatic_detect_and_translate,
+            inputs=[input_text],
+            outputs=[translation_output, detected_language, confidence_score]
         )
+        # Load models on startup
+        demo.load(load_models, outputs=[status_display])
     return demo
 if __name__ == "__main__":
+    print("🚀 Starting IndicLID → IndicTrans2 Automatic Pipeline")
+    print("🔍 IndicLID will handle automatic language detection")
+    print("🔄 IndicTrans2 will handle translation to English")
     demo = create_interface()
+    demo.launch(
+        server_name="0.0.0.0",
+        server_port=7860,
+        share=True
+    )