import gradio as gr
import torch
import sys
import os
from pathlib import Path
import warnings

warnings.filterwarnings("ignore")
# Setup IndicLID if not already done
if not os.path.exists("ai4bharat/IndicLID.py"):
    print("🚀 Setting up IndicLID for the first time...")
    exec(open("setup_indiclid.py").read())
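# Note: setup_indiclid.py is not shown here; it is assumed to fetch the IndicLID
# source and model files into ./ai4bharat/ so that the import further below succeeds.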
# Import torch safe globals first
try:
    exec(open("torch_safe_globals.py").read())
    print("✅ Torch safe globals loaded")
except Exception:
    print("⚠️ Could not load torch safe globals")
# Add current directory to Python path
sys.path.insert(0, os.getcwd())

# Import required libraries
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from IndicTransToolkit.processor import IndicProcessor
# Import IndicLID - crucial for automatic language detection
try:
    from ai4bharat.IndicLID import IndicLID
    INDICLID_AVAILABLE = True
    print("✅ IndicLID imported successfully - automatic language detection enabled")
except ImportError as e:
    print(f"❌ IndicLID import failed: {e}")
    INDICLID_AVAILABLE = False
    raise Exception("IndicLID is required for automatic language detection!")
# Device setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"🔧 Using device: {device}")
# Language mapping from IndicLID output to IndicTrans2 codes
LID_TO_TRANS2_MAPPING = {
    'hindi': 'hin_Deva',
    'bengali': 'ben_Beng',
    'gujarati': 'guj_Gujr',
    'kannada': 'kan_Knda',
    'malayalam': 'mal_Mlym',
    'marathi': 'mar_Deva',
    'nepali': 'npi_Deva',
    'odia': 'ory_Orya',
    'punjabi': 'pan_Guru',
    'tamil': 'tam_Taml',
    'telugu': 'tel_Telu',
    'urdu': 'urd_Arab',
    'assamese': 'asm_Beng',
    'kashmiri': 'kas_Arab',
    'sindhi': 'snd_Arab',
    'sanskrit': 'san_Deva',
    'english': 'eng_Latn',
}
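
# The IndicTrans2 codes follow the FLORES-200 convention: a language tag plus an
# ISO 15924 script tag (e.g. hin_Deva = Hindi in Devanagari, urd_Arab = Urdu in
# Perso-Arabic script). "eng_Latn" is always the translation target below.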

# Global model variables
lid_model = None
translation_model = None
tokenizer = None
ip = None


def load_models():
    """Load both IndicLID (for detection) and IndicTrans2 (for translation)"""
    global lid_model, translation_model, tokenizer, ip

    try:
        # Step 1: Load IndicLID for automatic language detection
        print("🔍 Loading IndicLID for automatic language detection...")
        lid_model = IndicLID(input_threshold=0.5, roman_lid_threshold=0.6)
        print("✅ IndicLID loaded successfully - Ready for automatic language detection!")

        # Step 2: Load IndicTrans2 for translation
        print("🔄 Loading IndicTrans2 for translation...")
        model_name = "ai4bharat/indictrans2-indic-en-1B"
        tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
        translation_model = AutoModelForSeq2SeqLM.from_pretrained(
            model_name,
            trust_remote_code=True,
            torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
        ).to(device)
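        # float16 roughly halves GPU memory for the ~1B-parameter checkpoint; float32
        # is kept on CPU, where half-precision inference is generally slow or unsupported.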
        ip = IndicProcessor(inference=True)
        print("✅ IndicTrans2 loaded successfully - Ready for translation!")

        return "✅ Both models loaded successfully!\n🔍 IndicLID: Automatic language detection\n🔄 IndicTrans2: Translation to English"

    except Exception as e:
        error_msg = f"❌ Error loading models: {str(e)}"
        print(error_msg)
        return error_msg


def automatic_detect_and_translate(input_text):
    """
    Core pipeline: automatic language detection with IndicLID followed by
    translation to English with IndicTrans2.
    """
    if not all([lid_model, translation_model, tokenizer, ip]):
        return "❌ Models not loaded. Please wait for initialization.", "", 0.0

    if not input_text.strip():
        return "Please enter text for automatic detection and translation.", "", 0.0

    try:
        # STEP 1: Automatic language detection using IndicLID
        print(f"🔍 Detecting language for: {input_text[:50]}...")
        lid_result = lid_model.batch_predict([input_text])

        # Extract language detection results
        detected_lang = lid_result[0]['langinfo']['text_lang']
        confidence = lid_result[0]['langinfo']['text_lang_score']
        print(f"✅ IndicLID detected: {detected_lang} (confidence: {confidence:.3f})")

        # STEP 2: Translation using IndicTrans2 (if not English)
        if detected_lang.lower() == 'english':
            translation = input_text
            print("ℹ️ Text is already in English, no translation needed")
        else:
            # Check if detected language is supported by IndicTrans2
            if detected_lang in LID_TO_TRANS2_MAPPING:
                src_lang_code = LID_TO_TRANS2_MAPPING[detected_lang]
                target_lang_code = "eng_Latn"
                print(f"🔄 Translating from {src_lang_code} to {target_lang_code}...")

                # Preprocess for IndicTrans2
                batch = ip.preprocess_batch(
                    [input_text],
                    src_lang=src_lang_code,
                    tgt_lang=target_lang_code
                )
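                # IndicProcessor.preprocess_batch normalizes the text and prepends the
                # "src_lang tgt_lang" tag pair that IndicTrans2 expects in its input;
                # postprocess_batch below cleans up the decoded English output.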

                # Tokenize
                inputs = tokenizer(
                    batch,
                    truncation=True,
                    padding="longest",
                    return_tensors="pt",
                    return_attention_mask=True
                ).to(device)

                # Generate translation
                with torch.no_grad():
                    generated_tokens = translation_model.generate(
                        **inputs,
                        use_cache=True,
                        min_length=0,
                        max_length=256,
                        num_beams=5,
                        num_return_sequences=1
                    )
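                # Beam search with 5 beams trades some speed for translation quality;
                # max_length=256 caps the generated English output at 256 tokens.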

                # Decode translation
                decoded = tokenizer.batch_decode(
                    generated_tokens,
                    skip_special_tokens=True,
                    clean_up_tokenization_spaces=True
                )

                # Postprocess
                translations = ip.postprocess_batch(decoded, lang=target_lang_code)
                translation = translations[0]
                print(f"✅ Translation completed: {translation}")
            else:
                translation = f"❌ Language '{detected_lang}' not supported for translation"
                print(f"⚠️ {translation}")

        return translation, detected_lang.title(), confidence

    except Exception as e:
        error_msg = f"❌ Error in detection/translation pipeline: {str(e)}"
        print(error_msg)
        return error_msg, "", 0.0
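
# Illustrative call once the models are loaded (the exact translation and confidence
# value are examples only; they depend on the models and the detected language):
#   automatic_detect_and_translate("मैं आज बाजार जा रहा हूं।")
#   -> ("I am going to the market today.", "Hindi", 0.98)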


def create_interface():
    """Create the Gradio interface for automatic IndicLID detection + IndicTrans2 translation"""
    with gr.Blocks(
        title="IndicLID → IndicTrans2 Pipeline",
        theme=gr.themes.Soft()
    ) as demo:
        gr.Markdown("""
        # 🔍➡️🔄 Automatic Language Detection + Translation

        **Complete Pipeline: IndicLID → IndicTrans2**

        1. **🔍 IndicLID**: Automatically detects your input language
        2. **🔄 IndicTrans2**: Translates to English based on the detected language

        **No manual language selection needed!** Just paste your text and get automatic detection + translation.
        """)

        # Status display
        status_display = gr.Textbox(
            value="🚀 Loading IndicLID and IndicTrans2 models...",
            label="🔧 Pipeline Status",
            interactive=False,
            lines=3
        )

        with gr.Row():
            with gr.Column(scale=1):
                input_text = gr.Textbox(
                    label="📝 Input Text (Any Indian Language)",
                    placeholder="Enter text in Hindi, Bengali, Tamil, Telugu, Gujarati, Kannada, Malayalam, Marathi, Punjabi, Urdu, etc...\n\nIndicLID will automatically detect the language!",
                    lines=6,
                    max_lines=10
                )
                translate_btn = gr.Button(
                    "🔍➡️🔄 Auto-Detect & Translate",
                    variant="primary",
                    size="lg"
                )

            with gr.Column(scale=1):
                translation_output = gr.Textbox(
                    label="🇬🇧 English Translation",
                    lines=6,
                    max_lines=10,
                    interactive=False,
                    placeholder="Automatic translation will appear here..."
                )

        with gr.Row():
            detected_language = gr.Textbox(
                label="🌐 Auto-Detected Language",
                interactive=False,
                scale=2,
                placeholder="Language will be detected automatically"
            )
            confidence_score = gr.Number(
                label="📊 Detection Confidence",
                interactive=False,
                scale=1,
                precision=3
            )

        # Examples showcasing automatic detection
        gr.Markdown("### 📖 Try These Examples (Automatic Detection!):")
        gr.Examples(
            examples=[
                ["मैं आज बाजार जा रहा हूं।"],  # Hindi
                ["আমি আজ বাজারে যাচ্ছি।"],  # Bengali
                ["நான் இன்று சந்தைக்கு போகிறேன்।"],  # Tamil
                ["ನಾನು ಇಂದು ಮಾರುಕಟ್ಟೆಗೆ ಹೋಗುತ್ತಿದ್ದೇನೆ।"],  # Kannada
                ["હું આજે બજારમાં જાઉં છું।"],  # Gujarati
                ["मी आज बाजारात जात आहे।"],  # Marathi
                ["میں آج بازار جا رہا ہوں۔"],  # Urdu
                ["ਮੈਂ ਅੱਜ ਬਾਜ਼ਾਰ ਜਾ ਰਿਹਾ ਹਾਂ।"],  # Punjabi
                ["నేను ఈరోజు మార్కెట్కి వెళ్తున్నాను।"],  # Telugu
                ["ഞാൻ ഇന്ന് മാർക്കറ്റിൽ പോകുന്നു।"],  # Malayalam
            ],
            inputs=[input_text],
            label="Click any example to test automatic detection!"
        )

        # Information about supported languages
        gr.Markdown("""
        ### 🌐 Supported Languages for Auto-Detection:

        **IndicLID can automatically detect:** Hindi, Bengali, Tamil, Telugu, Gujarati, Kannada, Malayalam,
        Marathi, Punjabi, Urdu, Odia, Assamese, Nepali, Kashmiri, Sindhi, Sanskrit, and English.

        ### ✨ How it works:

        1. You paste text in **any** supported Indian language
        2. **IndicLID** automatically identifies the language (no manual selection!)
        3. **IndicTrans2** translates it to English based on the detected language
        """)

        # Event handlers for automatic detection + translation
        translate_btn.click(
            fn=automatic_detect_and_translate,
            inputs=[input_text],
            outputs=[translation_output, detected_language, confidence_score]
        )

        # Auto-submit on Enter key
        input_text.submit(
            fn=automatic_detect_and_translate,
            inputs=[input_text],
            outputs=[translation_output, detected_language, confidence_score]
        )

        # Load models on startup
        demo.load(load_models, outputs=[status_display])
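        # Blocks.load registers a listener that runs when the app first loads in the
        # browser, so load_models() populates status_display once the models are ready.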

    return demo


if __name__ == "__main__":
    print("🚀 Starting IndicLID → IndicTrans2 Automatic Pipeline")
    print("🔍 IndicLID will handle automatic language detection")
    print("🔄 IndicTrans2 will handle translation to English")

    demo = create_interface()
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=True
    )
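    # Note: 0.0.0.0:7860 is the host/port a Hugging Face Space serves on by default;
    # share=True creates a temporary public gradio.live link for local runs and is
    # unnecessary inside a Space, which already exposes a public URL.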