"""Gradio app: automatic language detection (IndicLID) + translation (IndicTrans2).

The text entered by the user is passed to IndicLID to identify its language,
then translated to English with the IndicTrans2 Indic→English model.
"""

import os
import sys
import warnings
from pathlib import Path

import gradio as gr
import torch

warnings.filterwarnings("ignore")


def _run_script(path):
    """Execute a local helper script in this module's global namespace.

    Equivalent to a module-level ``exec(open(path).read())`` but closes the
    file handle and attaches the filename to tracebacks.

    NOTE(review): ``exec`` runs arbitrary code; these scripts must be trusted
    files shipped alongside this app, never user-supplied.
    """
    source = Path(path).read_text(encoding="utf-8")
    exec(compile(source, path, "exec"), globals())


# Setup IndicLID if not already done
if not os.path.exists("ai4bharat/IndicLID.py"):
    print("🚀 Setting up IndicLID for the first time...")
    _run_script("setup_indiclid.py")

# Import torch safe globals first (best-effort: the app continues without them)
try:
    _run_script("torch_safe_globals.py")
    print("✅ Torch safe globals loaded")
except Exception as e:  # narrowed from a bare except so Ctrl-C / SystemExit propagate
    print(f"⚠️ Could not load torch safe globals: {e}")

# Add current directory to Python path so the local `ai4bharat` package resolves
sys.path.insert(0, os.getcwd())

# These imports intentionally come after the setup steps above.
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from IndicTransToolkit.processor import IndicProcessor

# Import IndicLID - This is crucial for automatic language detection
try:
    from ai4bharat.IndicLID import IndicLID
    INDICLID_AVAILABLE = True
    print("✅ IndicLID imported successfully - Automatic language detection enabled")
except ImportError as e:
    print(f"❌ IndicLID import failed: {e}")
    INDICLID_AVAILABLE = False
    raise Exception("IndicLID is required for automatic language detection!") from e

# Device setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"🔧 Using device: {device}")

# Language mapping from IndicLID output to IndicTrans2 codes
LID_TO_TRANS2_MAPPING = {
    'hindi': 'hin_Deva',
    'bengali': 'ben_Beng',
    'gujarati': 'guj_Gujr',
    'kannada': 'kan_Knda',
    'malayalam': 'mal_Mlym',
    'marathi': 'mar_Deva',
    'nepali': 'npi_Deva',
    'odia': 'ory_Orya',
    'punjabi': 'pan_Guru',
    'tamil': 'tam_Taml',
    'telugu': 'tel_Telu',
    'urdu': 'urd_Arab',
    'assamese': 'asm_Beng',
    'kashmiri': 'kas_Arab',
    'sindhi': 'snd_Arab',
    'sanskrit': 'san_Deva',
    'english': 'eng_Latn',
}

# Global model variables (populated by load_models)
lid_model = None
translation_model = None
tokenizer = None
ip = None

_MODELS_READY_MSG = (
    "✅ Both models loaded successfully!\n"
    "🔍 IndicLID: Automatic language detection\n"
    "🔄 IndicTrans2: Translation to English"
)


def _models_loaded():
    """Return True when every global model handle has been populated."""
    # Explicit None checks: truthiness of a tokenizer/model object is not a
    # reliable "is loaded" signal.
    handles = (lid_model, translation_model, tokenizer, ip)
    return all(handle is not None for handle in handles)


def load_models():
    """Load both IndicLID (for detection) and IndicTrans2 (for translation).

    The function is idempotent: it is wired to ``demo.load``, which fires on
    every page load, so an already-initialised pipeline is reused instead of
    being re-instantiated for each visitor.

    Returns:
        A human-readable status string (success or error message) shown in
        the "Pipeline Status" textbox.
    """
    global lid_model, translation_model, tokenizer, ip

    if _models_loaded():
        return _MODELS_READY_MSG

    try:
        # Step 1: Load IndicLID for automatic language detection
        print("🔍 Loading IndicLID for automatic language detection...")
        new_lid_model = IndicLID(input_threshold=0.5, roman_lid_threshold=0.6)
        print("✅ IndicLID loaded successfully - Ready for automatic language detection!")

        # Step 2: Load IndicTrans2 for translation
        print("🔄 Loading IndicTrans2 for translation...")
        model_name = "ai4bharat/indictrans2-indic-en-1B"
        new_tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
        new_translation_model = AutoModelForSeq2SeqLM.from_pretrained(
            model_name,
            trust_remote_code=True,
            # Half precision only on GPU; CPU inference stays in float32.
            torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
        ).to(device)
        new_ip = IndicProcessor(inference=True)
        print("✅ IndicTrans2 loaded successfully - Ready for translation!")
    except Exception as e:
        error_msg = f"❌ Error loading models: {str(e)}"
        print(error_msg)
        return error_msg

    # Publish the handles only once everything succeeded, so a failure midway
    # never leaves a half-initialised pipeline behind.
    lid_model = new_lid_model
    tokenizer = new_tokenizer
    translation_model = new_translation_model
    ip = new_ip
    return _MODELS_READY_MSG


def _detect_language(text):
    """Run IndicLID on ``text`` and return ``(language_label, confidence)``."""
    lid_result = lid_model.batch_predict([text])
    lang_info = lid_result[0]['langinfo']
    # float() so the value is a plain Python number for gr.Number.
    return lang_info['text_lang'], float(lang_info['text_lang_score'])


def _translate_to_english(text, src_lang_code):
    """Translate ``text`` from ``src_lang_code`` to English with IndicTrans2."""
    target_lang_code = "eng_Latn"
    print(f"🔄 Translating from {src_lang_code} to {target_lang_code}...")

    # Preprocess for IndicTrans2
    batch = ip.preprocess_batch(
        [text],
        src_lang=src_lang_code,
        tgt_lang=target_lang_code,
    )

    # Tokenize
    inputs = tokenizer(
        batch,
        truncation=True,
        padding="longest",
        return_tensors="pt",
        return_attention_mask=True,
    ).to(device)

    # Generate translation (inference only, so no gradients are tracked)
    with torch.no_grad():
        generated_tokens = translation_model.generate(
            **inputs,
            use_cache=True,
            min_length=0,
            max_length=256,
            num_beams=5,
            num_return_sequences=1,
        )

    # Decode translation
    decoded = tokenizer.batch_decode(
        generated_tokens,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True,
    )

    # Postprocess
    translations = ip.postprocess_batch(decoded, lang=target_lang_code)
    return translations[0]


def automatic_detect_and_translate(input_text):
    """Detect the language of ``input_text`` and translate it to English.

    Pipeline: IndicLID identifies the language; unless it is already English,
    IndicTrans2 translates the text to English.

    Args:
        input_text: Text entered by the user. ``None`` and blank strings are
            treated as empty input.

    Returns:
        A ``(translation, detected_language, confidence)`` tuple. On any
        failure the first element carries a message and the other two are
        ``""`` and ``0.0``; errors are reported, not raised.
    """
    if not _models_loaded():
        return "❌ Models not loaded. Please wait for initialization.", "", 0.0

    if input_text is None or not input_text.strip():
        return "Please enter text for automatic detection and translation.", "", 0.0

    try:
        # STEP 1: AUTOMATIC LANGUAGE DETECTION USING INDICLID
        print(f"🔍 Detecting language for: {input_text[:50]}...")
        detected_lang, confidence = _detect_language(input_text)
        print(f"✅ IndicLID detected: {detected_lang} (confidence: {confidence:.3f})")

        # Normalise once so the English check and the mapping lookup agree
        # on casing (mapping keys are lower-case).
        lang_key = detected_lang.lower()

        # STEP 2: TRANSLATION USING INDICTRANS2 (if not English)
        if lang_key == 'english':
            translation = input_text
            print("ℹ️ Text is already in English, no translation needed")
        elif lang_key in LID_TO_TRANS2_MAPPING:
            translation = _translate_to_english(
                input_text, LID_TO_TRANS2_MAPPING[lang_key]
            )
            print(f"✅ Translation completed: {translation}")
        else:
            translation = f"❌ Language '{detected_lang}' not supported for translation"
            print(f"⚠️ {translation}")

        return translation, detected_lang.title(), confidence

    except Exception as e:
        # UI boundary: surface the failure in the output box instead of crashing.
        error_msg = f"❌ Error in detection/translation pipeline: {str(e)}"
        print(error_msg)
        return error_msg, "", 0.0


def create_interface():
    """Create the Gradio interface for IndicLID detection + IndicTrans2 translation.

    Returns:
        The configured ``gr.Blocks`` app (not yet launched).
    """
    with gr.Blocks(
        title="IndicLID → IndicTrans2 Pipeline",
        theme=gr.themes.Soft(),
    ) as demo:

        gr.Markdown("""
        # 🔍➡️🔄 Automatic Language Detection + Translation

        **Complete Pipeline: IndicLID → IndicTrans2**

        1. **🔍 IndicLID**: Automatically detects your input language
        2. **🔄 IndicTrans2**: Translates to English based on detected language

        **No manual language selection needed!** Just paste your text and get automatic detection + translation.
        """)

        # Status display
        status_display = gr.Textbox(
            value="🚀 Loading IndicLID and IndicTrans2 models...",
            label="🔧 Pipeline Status",
            interactive=False,
            lines=3,
        )

        with gr.Row():
            with gr.Column(scale=1):
                input_text = gr.Textbox(
                    label="📝 Input Text (Any Indian Language)",
                    placeholder="Enter text in Hindi, Bengali, Tamil, Telugu, Gujarati, Kannada, Malayalam, Marathi, Punjabi, Urdu, etc...\n\nIndicLID will automatically detect the language!",
                    lines=6,
                    max_lines=10,
                )

                translate_btn = gr.Button(
                    "🔍➡️🔄 Auto-Detect & Translate",
                    variant="primary",
                    size="lg",
                )

            with gr.Column(scale=1):
                translation_output = gr.Textbox(
                    label="🇬🇧 English Translation",
                    lines=6,
                    max_lines=10,
                    interactive=False,
                    placeholder="Automatic translation will appear here...",
                )

                with gr.Row():
                    detected_language = gr.Textbox(
                        label="🌐 Auto-Detected Language",
                        interactive=False,
                        scale=2,
                        placeholder="Language will be detected automatically",
                    )
                    confidence_score = gr.Number(
                        label="📊 Detection Confidence",
                        interactive=False,
                        scale=1,
                        precision=3,
                    )

        # Examples showcasing automatic detection
        gr.Markdown("### 📖 Try These Examples (Automatic Detection!):")
        gr.Examples(
            examples=[
                ["मैं आज बाजार जा रहा हूं।"],  # Hindi
                ["আমি আজ বাজারে যাচ্ছি।"],  # Bengali
                ["நான் இன்று சந்தைக்கு போகிறேன்।"],  # Tamil
                ["ನಾನು ಇಂದು ಮಾರುಕಟ್ಟೆಗೆ ಹೋಗುತ್ತಿದ್ದೇನೆ।"],  # Kannada
                ["હું આજે બજારમાં જાઉં છું।"],  # Gujarati
                ["मी आज बाजारात जात आहे।"],  # Marathi
                ["میں آج بازار جا رہا ہوں۔"],  # Urdu
                ["ਮੈਂ ਅੱਜ ਬਾਜ਼ਾਰ ਜਾ ਰਿਹਾ ਹਾਂ।"],  # Punjabi
                ["నేను ఈరోజు మార్కెట్‌కి వెళ్తున్నాను।"],  # Telugu
                ["ഞാൻ ഇന്ന് മാർക്കറ്റിൽ പോകുന്നു।"],  # Malayalam
            ],
            inputs=[input_text],
            label="Click any example to test automatic detection!",
        )

        # Information about supported languages
        gr.Markdown("""
        ### 🌐 Supported Languages for Auto-Detection:

        **IndicLID can automatically detect:**
        Hindi, Bengali, Tamil, Telugu, Gujarati, Kannada, Malayalam, Marathi, Punjabi, Urdu, Odia, Assamese, Nepali, Kashmiri, Sindhi, Sanskrit, and English.

        ### ✨ How it works:
        1. You paste text in **any** supported Indian language
        2. **IndicLID** automatically identifies the language (no manual selection!)
        3. **IndicTrans2** translates it to English based on the detected language
        """)

        # The button click and the Enter key share the same handler and outputs.
        pipeline_outputs = [translation_output, detected_language, confidence_score]

        # Event handlers for automatic detection + translation
        translate_btn.click(
            fn=automatic_detect_and_translate,
            inputs=[input_text],
            outputs=pipeline_outputs,
        )

        # Auto-submit on Enter key
        input_text.submit(
            fn=automatic_detect_and_translate,
            inputs=[input_text],
            outputs=pipeline_outputs,
        )

        # Load models on startup. This event fires on every page load;
        # load_models() is idempotent, so later loads only refresh the status.
        demo.load(load_models, outputs=[status_display])

    return demo


if __name__ == "__main__":
    print("🚀 Starting IndicLID → IndicTrans2 Automatic Pipeline")
    print("🔍 IndicLID will handle automatic language detection")
    print("🔄 IndicTrans2 will handle translation to English")

    demo = create_interface()
    # NOTE(review): binds to all interfaces and opens a public share link;
    # confirm this exposure is intended for the deployment environment.
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=True,
    )