# app.py — copied from a Hugging Face Space file view.
# Last change: "Update app.py" by Noumida (commit b7caabd, verified); file size 12.3 kB.
import gradio as gr
import torch
import sys
import os
from pathlib import Path
import warnings
warnings.filterwarnings("ignore")
# Setup IndicLID if not already done.
# The helper scripts are executed in this module's namespace via exec(), so
# any names they define become module globals here.
# NOTE(review): exec() of on-disk scripts is only acceptable because the files
# ship with the app; never point these paths at untrusted content.
if not os.path.exists("ai4bharat/IndicLID.py"):
    print("🚀 Setting up IndicLID for the first time...")
    # A missing or failing setup script is deliberately fatal: nothing below
    # can work without the ai4bharat package it is expected to create.
    with open("setup_indiclid.py", encoding="utf-8") as setup_script:
        exec(setup_script.read())

# Import torch safe globals first (best effort: the app continues without them).
try:
    with open("torch_safe_globals.py", encoding="utf-8") as safe_globals_script:
        exec(safe_globals_script.read())
    print("✅ Torch safe globals loaded")
except Exception as exc:
    # Catch Exception rather than using a bare except so Ctrl-C / SystemExit
    # still propagate, and report why the load failed.
    print(f"⚠️ Could not load torch safe globals: {exc}")

# Add current directory to Python path so the local ai4bharat package is importable.
sys.path.insert(0, os.getcwd())
# Import required libraries
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from IndicTransToolkit.processor import IndicProcessor
# Import IndicLID - This is crucial for automatic language detection.
# The app cannot run without it, so a failed import aborts start-up.
try:
    from ai4bharat.IndicLID import IndicLID
except ImportError as e:
    print(f"❌ IndicLID import failed: {e}")
    INDICLID_AVAILABLE = False
    # Chain the original error so the traceback shows the real root cause.
    raise ImportError("IndicLID is required for automatic language detection!") from e
else:
    INDICLID_AVAILABLE = True
    print("✅ IndicLID imported successfully - Automatic language detection enabled")
# Pick the compute device once at import time: CUDA when torch reports it as
# available, otherwise the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"🔧 Using device: {device}")
# Maps the language label reported by the detector to the language code
# passed to IndicTrans2 as ``src_lang``.
# NOTE(review): keys are lower-case English language names — confirm this is
# the label format the bundled IndicLID wrapper actually emits.
LID_TO_TRANS2_MAPPING = dict(
    hindi='hin_Deva',
    bengali='ben_Beng',
    gujarati='guj_Gujr',
    kannada='kan_Knda',
    malayalam='mal_Mlym',
    marathi='mar_Deva',
    nepali='npi_Deva',
    odia='ory_Orya',
    punjabi='pan_Guru',
    tamil='tam_Taml',
    telugu='tel_Telu',
    urdu='urd_Arab',
    assamese='asm_Beng',
    kashmiri='kas_Arab',
    sindhi='snd_Arab',
    sanskrit='san_Deva',
    english='eng_Latn',
)
# Module-level handles for the detector, translation model, tokenizer and
# pre/post-processor. They stay None until load_models() assigns them.
lid_model = translation_model = tokenizer = ip = None
def load_models():
    """Load IndicLID (detection) and IndicTrans2 (translation) into the module globals.

    The function is idempotent: when all four globals (``lid_model``,
    ``translation_model``, ``tokenizer``, ``ip``) are already populated it
    returns the success message immediately instead of loading again. This
    matters because the UI registers it on ``demo.load``, which Gradio
    triggers for each browser page load.

    The globals are only assigned after every component has loaded, so a
    failure part-way through leaves them untouched.

    Returns:
        str: A human-readable status message (success or error text) for the
        status textbox. Errors are reported in the message, never raised.
    """
    global lid_model, translation_model, tokenizer, ip

    success_msg = "✅ Both models loaded successfully!\n🔍 IndicLID: Automatic language detection\n🔄 IndicTrans2: Translation to English"

    # Already initialised by an earlier call: skip the expensive reload.
    if all(
        component is not None
        for component in (lid_model, translation_model, tokenizer, ip)
    ):
        return success_msg

    try:
        # Step 1: Load IndicLID for automatic language detection
        print("🔍 Loading IndicLID for automatic language detection...")
        detector = IndicLID(input_threshold=0.5, roman_lid_threshold=0.6)
        print("✅ IndicLID loaded successfully - Ready for automatic language detection!")

        # Step 2: Load IndicTrans2 for translation
        print("🔄 Loading IndicTrans2 for translation...")
        model_name = "ai4bharat/indictrans2-indic-en-1B"
        loaded_tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
        # Half precision only when CUDA is available; full precision otherwise.
        model = AutoModelForSeq2SeqLM.from_pretrained(
            model_name,
            trust_remote_code=True,
            torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
        ).to(device)
        processor = IndicProcessor(inference=True)
        print("✅ IndicTrans2 loaded successfully - Ready for translation!")
    except Exception as e:
        # UI boundary: surface the failure as a status message instead of crashing.
        error_msg = f"❌ Error loading models: {str(e)}"
        print(error_msg)
        return error_msg

    # Publish all components together now that every load has succeeded.
    lid_model, translation_model, tokenizer, ip = (
        detector,
        model,
        loaded_tokenizer,
        processor,
    )
    return success_msg
def automatic_detect_and_translate(input_text):
    """Detect the language of ``input_text`` and translate it to English.

    Pipeline:
      1. ``lid_model.batch_predict`` is called on the text; the first result
         is read as ``result['langinfo']['text_lang']`` (language label) and
         ``result['langinfo']['text_lang_score']`` (confidence).
      2. If the label is English the text is returned unchanged. Otherwise
         the label is looked up (case-insensitively) in
         ``LID_TO_TRANS2_MAPPING`` and the text is translated to ``eng_Latn``
         with the IndicTrans2 model, tokenizer and processor.

    Args:
        input_text: The text to process. ``None`` and blank strings are
            treated as empty input.

    Returns:
        tuple: ``(translation, detected_language, confidence)``. On empty
        input, missing models, an unsupported language or any runtime error
        the first element carries a user-facing message instead of a
        translation; for empty input, missing models and errors the other
        two elements are ``""`` and ``0.0``.
    """
    # Test against None explicitly: object truthiness can be driven by
    # __len__/__bool__, which says nothing about whether loading finished.
    if any(
        component is None
        for component in (lid_model, translation_model, tokenizer, ip)
    ):
        return "❌ Models not loaded. Please wait for initialization.", "", 0.0

    if not input_text or not input_text.strip():
        return "Please enter text for automatic detection and translation.", "", 0.0

    try:
        # STEP 1: AUTOMATIC LANGUAGE DETECTION USING INDICLID
        print(f"🔍 Detecting language for: {input_text[:50]}...")
        lid_result = lid_model.batch_predict([input_text])

        # Extract language detection results
        detected_lang = lid_result[0]['langinfo']['text_lang']
        confidence = lid_result[0]['langinfo']['text_lang_score']
        print(f"✅ IndicLID detected: {detected_lang} (confidence: {confidence:.3f})")

        # Normalise once so the English check and the mapping lookup agree on
        # casing (the mapping keys are lower-case).
        lang_key = detected_lang.lower()

        # STEP 2: TRANSLATION USING INDICTRANS2 (if not English)
        if lang_key == 'english':
            translation = input_text
            print("ℹ️ Text is already in English, no translation needed")
        elif lang_key in LID_TO_TRANS2_MAPPING:
            src_lang_code = LID_TO_TRANS2_MAPPING[lang_key]
            target_lang_code = "eng_Latn"
            print(f"🔄 Translating from {src_lang_code} to {target_lang_code}...")

            # Preprocess for IndicTrans2
            batch = ip.preprocess_batch(
                [input_text],
                src_lang=src_lang_code,
                tgt_lang=target_lang_code,
            )

            # Tokenize
            inputs = tokenizer(
                batch,
                truncation=True,
                padding="longest",
                return_tensors="pt",
                return_attention_mask=True,
            ).to(device)

            # Generate translation (inference only, so no gradients needed)
            with torch.no_grad():
                generated_tokens = translation_model.generate(
                    **inputs,
                    use_cache=True,
                    min_length=0,
                    max_length=256,
                    num_beams=5,
                    num_return_sequences=1,
                )

            # Decode translation
            decoded = tokenizer.batch_decode(
                generated_tokens,
                skip_special_tokens=True,
                clean_up_tokenization_spaces=True,
            )

            # Postprocess
            translations = ip.postprocess_batch(decoded, lang=target_lang_code)
            translation = translations[0]
            print(f"✅ Translation completed: {translation}")
        else:
            translation = f"❌ Language '{detected_lang}' not supported for translation"
            print(f"⚠️ {translation}")

        return translation, detected_lang.title(), confidence
    except Exception as e:
        # UI boundary: report the failure in the output box instead of raising.
        error_msg = f"❌ Error in detection/translation pipeline: {str(e)}"
        print(error_msg)
        return error_msg, "", 0.0
def create_interface():
    """Build and return the Gradio Blocks app for the detect-then-translate pipeline.

    Layout, top to bottom: intro markdown, a read-only status box, an input
    column (text box + button) beside an output column (translation), a row
    with the detected language and confidence, clickable examples, and a
    help section. The button click and the text box submit event both call
    ``automatic_detect_and_translate``; ``load_models`` is registered on the
    app's load event and writes its status message into the status box.

    Returns:
        The configured ``gr.Blocks`` instance (not yet launched).
    """
    intro_markdown = (
        "\n"
        "# 🔍➡️🔄 Automatic Language Detection + Translation\n"
        "**Complete Pipeline: IndicLID → IndicTrans2**\n"
        "1. **🔍 IndicLID**: Automatically detects your input language\n"
        "2. **🔄 IndicTrans2**: Translates to English based on detected language\n"
        "**No manual language selection needed!** Just paste your text and get automatic detection + translation.\n"
    )
    help_markdown = (
        "\n"
        "### 🌐 Supported Languages for Auto-Detection:\n"
        "**IndicLID can automatically detect:** Hindi, Bengali, Tamil, Telugu, Gujarati, Kannada, Malayalam, Marathi,\n"
        "Punjabi, Urdu, Odia, Assamese, Nepali, Kashmiri, Sindhi, Sanskrit, and English.\n"
        "### ✨ How it works:\n"
        "1. You paste text in **any** supported Indian language\n"
        "2. **IndicLID** automatically identifies the language (no manual selection!)\n"
        "3. **IndicTrans2** translates it to English based on the detected language\n"
    )
    # One sample sentence per language, in display order.
    sample_sentences = [
        "मैं आज बाजार जा रहा हूं।",  # Hindi
        "আমি আজ বাজারে যাচ্ছি।",  # Bengali
        "நான் இன்று சந்தைக்கு போகிறேன்।",  # Tamil
        "ನಾನು ಇಂದು ಮಾರುಕಟ್ಟೆಗೆ ಹೋಗುತ್ತಿದ್ದೇನೆ।",  # Kannada
        "હું આજે બજારમાં જાઉં છું।",  # Gujarati
        "मी आज बाजारात जात आहे।",  # Marathi
        "میں آج بازار جا رہا ہوں۔",  # Urdu
        "ਮੈਂ ਅੱਜ ਬਾਜ਼ਾਰ ਜਾ ਰਿਹਾ ਹਾਂ।",  # Punjabi
        "నేను ఈరోజు మార్కెట్‌కి వెళ్తున్నాను।",  # Telugu
        "ഞാൻ ഇന്ന് മാർക്കറ്റിൽ പോകുന്നു।",  # Malayalam
    ]

    with gr.Blocks(title="IndicLID → IndicTrans2 Pipeline", theme=gr.themes.Soft()) as app:
        gr.Markdown(intro_markdown)

        # Read-only box that receives the message returned by load_models().
        status_box = gr.Textbox(
            value="🚀 Loading IndicLID and IndicTrans2 models...",
            label="🔧 Pipeline Status",
            interactive=False,
            lines=3,
        )

        with gr.Row():
            # Left column: user input and the trigger button.
            with gr.Column(scale=1):
                source_box = gr.Textbox(
                    label="📝 Input Text (Any Indian Language)",
                    placeholder="Enter text in Hindi, Bengali, Tamil, Telugu, Gujarati, Kannada, Malayalam, Marathi, Punjabi, Urdu, etc...\n\nIndicLID will automatically detect the language!",
                    lines=6,
                    max_lines=10,
                )
                run_button = gr.Button(
                    "🔍➡️🔄 Auto-Detect & Translate",
                    variant="primary",
                    size="lg",
                )
            # Right column: the English translation.
            with gr.Column(scale=1):
                english_box = gr.Textbox(
                    label="🇬🇧 English Translation",
                    lines=6,
                    max_lines=10,
                    interactive=False,
                    placeholder="Automatic translation will appear here...",
                )

        with gr.Row():
            language_box = gr.Textbox(
                label="🌐 Auto-Detected Language",
                interactive=False,
                scale=2,
                placeholder="Language will be detected automatically",
            )
            score_box = gr.Number(
                label="📊 Detection Confidence",
                interactive=False,
                scale=1,
                precision=3,
            )

        # Examples showcasing automatic detection
        gr.Markdown("### 📖 Try These Examples (Automatic Detection!):")
        gr.Examples(
            examples=[[sentence] for sentence in sample_sentences],
            inputs=[source_box],
            label="Click any example to test automatic detection!",
        )

        # Information about supported languages
        gr.Markdown(help_markdown)

        # Button click first, then Enter-key submit: both run the same pipeline
        # and fill the same three output components.
        for register_event in (run_button.click, source_box.submit):
            register_event(
                fn=automatic_detect_and_translate,
                inputs=[source_box],
                outputs=[english_box, language_box, score_box],
            )

        # Load models on startup
        app.load(load_models, outputs=[status_box])

    return app
# Script entry point: print the start-up banner, build the UI and serve it.
if __name__ == "__main__":
    print(
        "\n".join(
            (
                "🚀 Starting IndicLID → IndicTrans2 Automatic Pipeline",
                "🔍 IndicLID will handle automatic language detection",
                "🔄 IndicTrans2 will handle translation to English",
            )
        )
    )

    demo = create_interface()
    # Listen on all interfaces at port 7860 and request a public share link.
    demo.launch(server_name="0.0.0.0", server_port=7860, share=True)