Noumida committed on
Commit
b7caabd
·
verified ·
1 Parent(s): 680f904

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +194 -169
app.py CHANGED
@@ -3,31 +3,43 @@ import torch
3
  import sys
4
  import os
5
  from pathlib import Path
 
 
6
 
7
- # Run setup if models don't exist
8
  if not os.path.exists("ai4bharat/IndicLID.py"):
9
- print("Setting up models...")
10
- exec(open("setup_models.py").read())
11
 
12
- # Now import everything
 
 
 
 
 
 
 
 
 
 
13
  from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
14
  from IndicTransToolkit.processor import IndicProcessor
15
 
16
- # Add current directory to path
17
- sys.path.append(os.getcwd())
18
-
19
- # Import IndicLID
20
  try:
21
  from ai4bharat.IndicLID import IndicLID
22
  INDICLID_AVAILABLE = True
 
23
  except ImportError as e:
24
- print(f"IndicLID import failed: {e}")
25
  INDICLID_AVAILABLE = False
 
26
 
27
  # Device setup
28
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
29
 
30
- # Language mapping
31
  LID_TO_TRANS2_MAPPING = {
32
  'hindi': 'hin_Deva',
33
  'bengali': 'ben_Beng',
@@ -44,39 +56,28 @@ LID_TO_TRANS2_MAPPING = {
44
  'assamese': 'asm_Beng',
45
  'kashmiri': 'kas_Arab',
46
  'sindhi': 'snd_Arab',
47
- 'sanskrit': 'san_Deva'
48
- }
49
-
50
- # Manual language options for fallback
51
- MANUAL_LANGUAGES = {
52
- "Auto-detect": None,
53
- "Hindi": "hin_Deva",
54
- "Bengali": "ben_Beng",
55
- "Tamil": "tam_Taml",
56
- "Telugu": "tel_Telu",
57
- "Gujarati": "guj_Gujr",
58
- "Kannada": "kan_Knda",
59
- "Malayalam": "mal_Mlym",
60
- "Marathi": "mar_Deva",
61
- "Punjabi": "pan_Guru",
62
- "Urdu": "urd_Arab"
63
  }
64
 
65
- # Global variables
66
  lid_model = None
67
  translation_model = None
68
  tokenizer = None
69
  ip = None
70
- model_loading_status = "Not loaded"
71
 
72
  def load_models():
73
- global lid_model, translation_model, tokenizer, ip, model_loading_status
 
74
 
75
  try:
76
- model_loading_status = "Loading IndicTrans2..."
 
 
 
77
 
78
- # Load IndicTrans2 first (more reliable)
79
- print("Loading IndicTrans2...")
80
  model_name = "ai4bharat/indictrans2-indic-en-1B"
81
  tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
82
  translation_model = AutoModelForSeq2SeqLM.from_pretrained(
@@ -86,197 +87,221 @@ def load_models():
86
  ).to(device)
87
 
88
  ip = IndicProcessor(inference=True)
89
- print("✅ IndicTrans2 loaded successfully")
90
 
91
- # Try to load IndicLID
92
- if INDICLID_AVAILABLE:
93
- model_loading_status = "Loading IndicLID..."
94
- print("Loading IndicLID...")
95
- lid_model = IndicLID(input_threshold=0.5, roman_lid_threshold=0.6)
96
- print("✅ IndicLID loaded successfully")
97
- model_loading_status = "✅ All models loaded!"
98
- else:
99
- model_loading_status = "✅ IndicTrans2 loaded (manual language selection)"
100
-
101
- return model_loading_status
102
 
103
  except Exception as e:
104
- model_loading_status = f"❌ Error: {str(e)}"
105
- return model_loading_status
 
106
 
107
- def translate_text(input_text, source_language="Auto-detect"):
108
- global lid_model, translation_model, tokenizer, ip
109
-
110
- if not translation_model:
111
- return "❌ Translation model not loaded. Please wait...", "", 0.0
 
 
112
 
113
  if not input_text.strip():
114
- return "Please enter text to translate.", "", 0.0
115
 
116
  try:
117
- detected_lang = "unknown"
118
- confidence = 0.0
119
- src_lang_code = None
 
 
 
 
 
 
120
 
121
- # Language identification
122
- if source_language == "Auto-detect" and lid_model:
123
- # Use IndicLID
124
- lid_result = lid_model.batch_predict([input_text])
125
- detected_lang = lid_result[0]['langinfo']['text_lang']
126
- confidence = lid_result[0]['langinfo']['text_lang_score']
127
-
128
  if detected_lang in LID_TO_TRANS2_MAPPING:
129
  src_lang_code = LID_TO_TRANS2_MAPPING[detected_lang]
130
- else:
131
- return f"Detected language '{detected_lang}' not supported", detected_lang.title(), confidence
132
 
133
- elif source_language != "Auto-detect":
134
- # Manual language selection
135
- src_lang_code = MANUAL_LANGUAGES[source_language]
136
- detected_lang = source_language.lower()
137
- confidence = 1.0
138
-
139
- else:
140
- return "❌ Please select a source language (IndicLID not available)", "", 0.0
141
-
142
- if not src_lang_code:
143
- return "❌ Could not determine source language", detected_lang, confidence
144
-
145
- # Skip if already English
146
- if detected_lang == 'english':
147
- return input_text, "English", confidence
148
-
149
- # Translation
150
- target_lang_code = "eng_Latn"
151
-
152
- # Preprocess
153
- batch = ip.preprocess_batch(
154
- [input_text],
155
- src_lang=src_lang_code,
156
- tgt_lang=target_lang_code
157
- )
158
-
159
- # Tokenize
160
- inputs = tokenizer(
161
- batch,
162
- truncation=True,
163
- padding="longest",
164
- return_tensors="pt",
165
- return_attention_mask=True
166
- ).to(device)
167
-
168
- # Generate
169
- with torch.no_grad():
170
- generated_tokens = translation_model.generate(
171
- **inputs,
172
- use_cache=True,
173
- min_length=0,
174
- max_length=256,
175
- num_beams=5,
176
- num_return_sequences=1
177
- )
178
-
179
- # Decode
180
- decoded = tokenizer.batch_decode(
181
- generated_tokens,
182
- skip_special_tokens=True,
183
- clean_up_tokenization_spaces=True
184
- )
185
-
186
- # Postprocess
187
- translations = ip.postprocess_batch(decoded, lang=target_lang_code)
188
- translation = translations[0]
189
 
190
  return translation, detected_lang.title(), confidence
191
 
192
  except Exception as e:
193
- return f"❌ Error: {str(e)}", "", 0.0
 
 
194
 
195
- # Create interface
196
  def create_interface():
197
- with gr.Blocks(title="Indic Language Translator", theme=gr.themes.Soft()) as demo:
 
 
 
 
 
198
  gr.Markdown("""
199
- # 🌍 Indic Language Translator
200
 
201
- **Powered by IndicLID + IndicTrans2**
202
 
203
- Translate Indian languages to English with automatic language detection.
 
 
 
204
  """)
205
 
206
  # Status display
207
  status_display = gr.Textbox(
208
- value="Loading models...",
209
- label="Status",
210
- interactive=False
 
211
  )
212
 
213
  with gr.Row():
214
- with gr.Column(scale=2):
215
  input_text = gr.Textbox(
216
- label="Input Text",
217
- placeholder="Enter text in any Indian language...",
218
- lines=5
 
219
  )
220
 
221
- source_lang = gr.Dropdown(
222
- choices=list(MANUAL_LANGUAGES.keys()),
223
- value="Auto-detect",
224
- label="Source Language"
225
  )
226
 
227
- translate_btn = gr.Button("🔄 Translate", variant="primary", size="lg")
228
-
229
- with gr.Column(scale=2):
230
- output_text = gr.Textbox(
231
- label="English Translation",
232
- lines=5,
233
- interactive=False
234
  )
235
 
236
  with gr.Row():
237
- detected_lang = gr.Textbox(
238
- label="Detected Language",
239
  interactive=False,
240
- scale=2
 
241
  )
242
- confidence = gr.Number(
243
- label="Confidence",
244
  interactive=False,
245
- scale=1
 
246
  )
247
 
248
- # Examples
 
249
  gr.Examples(
250
  examples=[
251
- ["मैं आज बाजार जा रहा हूं।", "Auto-detect"],
252
- ["আমি আজ বাজারে যাচ্ছি।", "Bengali"],
253
- ["நான் இன்று சந்தைக்கு போகிறேன்।", "Tamil"],
 
 
 
 
 
 
 
254
  ],
255
- inputs=[input_text, source_lang],
 
256
  )
257
 
258
- # Event handlers
 
 
 
 
 
 
 
 
 
 
 
 
259
  translate_btn.click(
260
- fn=translate_text,
261
- inputs=[input_text, source_lang],
262
- outputs=[output_text, detected_lang, confidence]
263
  )
264
 
 
265
  input_text.submit(
266
- fn=translate_text,
267
- inputs=[input_text, source_lang],
268
- outputs=[output_text, detected_lang, confidence]
269
  )
270
 
271
- # Load models and update status
272
- def update_status():
273
- status = load_models()
274
- return status
275
-
276
- demo.load(update_status, outputs=[status_display])
277
 
278
  return demo
279
 
280
  if __name__ == "__main__":
 
 
 
 
281
  demo = create_interface()
282
- demo.launch()
 
 
 
 
 
3
  import sys
4
  import os
5
  from pathlib import Path
6
+ import warnings
7
+ warnings.filterwarnings("ignore")
8
 
9
+ # Setup IndicLID if not already done
10
  if not os.path.exists("ai4bharat/IndicLID.py"):
11
+ print("🚀 Setting up IndicLID for the first time...")
12
+ exec(open("setup_indiclid.py").read())
13
 
14
+ # Import torch safe globals first
15
+ try:
16
+ exec(open("torch_safe_globals.py").read())
17
+ print("✅ Torch safe globals loaded")
18
+ except:
19
+ print("⚠️ Could not load torch safe globals")
20
+
21
+ # Add current directory to Python path
22
+ sys.path.insert(0, os.getcwd())
23
+
24
+ # Import required libraries
25
  from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
26
  from IndicTransToolkit.processor import IndicProcessor
27
 
28
+ # Import IndicLID - This is crucial for automatic language detection
 
 
 
29
  try:
30
  from ai4bharat.IndicLID import IndicLID
31
  INDICLID_AVAILABLE = True
32
+ print("✅ IndicLID imported successfully - Automatic language detection enabled")
33
  except ImportError as e:
34
+ print(f"IndicLID import failed: {e}")
35
  INDICLID_AVAILABLE = False
36
+ raise Exception("IndicLID is required for automatic language detection!")
37
 
38
  # Device setup
39
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
40
+ print(f"🔧 Using device: {device}")
41
 
42
+ # Language mapping from IndicLID output to IndicTrans2 codes
43
  LID_TO_TRANS2_MAPPING = {
44
  'hindi': 'hin_Deva',
45
  'bengali': 'ben_Beng',
 
56
  'assamese': 'asm_Beng',
57
  'kashmiri': 'kas_Arab',
58
  'sindhi': 'snd_Arab',
59
+ 'sanskrit': 'san_Deva',
60
+ 'english': 'eng_Latn'
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61
  }
62
 
63
+ # Global model variables
64
  lid_model = None
65
  translation_model = None
66
  tokenizer = None
67
  ip = None
 
68
 
69
  def load_models():
70
+ """Load both IndicLID (for detection) and IndicTrans2 (for translation)"""
71
+ global lid_model, translation_model, tokenizer, ip
72
 
73
  try:
74
+ # Step 1: Load IndicLID for automatic language detection
75
+ print("🔍 Loading IndicLID for automatic language detection...")
76
+ lid_model = IndicLID(input_threshold=0.5, roman_lid_threshold=0.6)
77
+ print("✅ IndicLID loaded successfully - Ready for automatic language detection!")
78
 
79
+ # Step 2: Load IndicTrans2 for translation
80
+ print("🔄 Loading IndicTrans2 for translation...")
81
  model_name = "ai4bharat/indictrans2-indic-en-1B"
82
  tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
83
  translation_model = AutoModelForSeq2SeqLM.from_pretrained(
 
87
  ).to(device)
88
 
89
  ip = IndicProcessor(inference=True)
90
+ print("✅ IndicTrans2 loaded successfully - Ready for translation!")
91
 
92
+ return "✅ Both models loaded successfully!\n🔍 IndicLID: Automatic language detection\n🔄 IndicTrans2: Translation to English"
 
 
 
 
 
 
 
 
 
 
93
 
94
  except Exception as e:
95
+ error_msg = f"❌ Error loading models: {str(e)}"
96
+ print(error_msg)
97
+ return error_msg
98
 
99
+ def automatic_detect_and_translate(input_text):
100
+ """
101
+ Main function: Automatic language detection using IndicLID + Translation using IndicTrans2
102
+ This is the core pipeline you requested
103
+ """
104
+ if not all([lid_model, translation_model, tokenizer, ip]):
105
+ return "❌ Models not loaded. Please wait for initialization.", "", 0.0
106
 
107
  if not input_text.strip():
108
+ return "Please enter text for automatic detection and translation.", "", 0.0
109
 
110
  try:
111
+ # STEP 1: AUTOMATIC LANGUAGE DETECTION USING INDICLID
112
+ print(f"🔍 Detecting language for: {input_text[:50]}...")
113
+ lid_result = lid_model.batch_predict([input_text])
114
+
115
+ # Extract language detection results
116
+ detected_lang = lid_result[0]['langinfo']['text_lang']
117
+ confidence = lid_result[0]['langinfo']['text_lang_score']
118
+
119
+ print(f"✅ IndicLID detected: {detected_lang} (confidence: {confidence:.3f})")
120
 
121
+ # STEP 2: TRANSLATION USING INDICTRANS2 (if not English)
122
+ if detected_lang.lower() == 'english':
123
+ translation = input_text
124
+ print("ℹ️ Text is already in English, no translation needed")
125
+ else:
126
+ # Check if detected language is supported by IndicTrans2
 
127
  if detected_lang in LID_TO_TRANS2_MAPPING:
128
  src_lang_code = LID_TO_TRANS2_MAPPING[detected_lang]
129
+ target_lang_code = "eng_Latn"
 
130
 
131
+ print(f"🔄 Translating from {src_lang_code} to {target_lang_code}...")
132
+
133
+ # Preprocess for IndicTrans2
134
+ batch = ip.preprocess_batch(
135
+ [input_text],
136
+ src_lang=src_lang_code,
137
+ tgt_lang=target_lang_code
138
+ )
139
+
140
+ # Tokenize
141
+ inputs = tokenizer(
142
+ batch,
143
+ truncation=True,
144
+ padding="longest",
145
+ return_tensors="pt",
146
+ return_attention_mask=True
147
+ ).to(device)
148
+
149
+ # Generate translation
150
+ with torch.no_grad():
151
+ generated_tokens = translation_model.generate(
152
+ **inputs,
153
+ use_cache=True,
154
+ min_length=0,
155
+ max_length=256,
156
+ num_beams=5,
157
+ num_return_sequences=1
158
+ )
159
+
160
+ # Decode translation
161
+ decoded = tokenizer.batch_decode(
162
+ generated_tokens,
163
+ skip_special_tokens=True,
164
+ clean_up_tokenization_spaces=True
165
+ )
166
+
167
+ # Postprocess
168
+ translations = ip.postprocess_batch(decoded, lang=target_lang_code)
169
+ translation = translations[0]
170
+
171
+ print(f"✅ Translation completed: {translation}")
172
+ else:
173
+ translation = f"❌ Language '{detected_lang}' not supported for translation"
174
+ print(f"⚠️ {translation}")
 
 
 
 
 
 
 
 
 
 
 
 
175
 
176
  return translation, detected_lang.title(), confidence
177
 
178
  except Exception as e:
179
+ error_msg = f"❌ Error in detection/translation pipeline: {str(e)}"
180
+ print(error_msg)
181
+ return error_msg, "", 0.0
182
 
 
183
  def create_interface():
184
+ """Create Gradio interface focused on automatic IndicLID detection + IndicTrans2 translation"""
185
+ with gr.Blocks(
186
+ title="IndicLID → IndicTrans2 Pipeline",
187
+ theme=gr.themes.Soft()
188
+ ) as demo:
189
+
190
  gr.Markdown("""
191
+ # 🔍➡️🔄 Automatic Language Detection + Translation
192
 
193
+ **Complete Pipeline: IndicLID IndicTrans2**
194
 
195
+ 1. **🔍 IndicLID**: Automatically detects your input language
196
+ 2. **🔄 IndicTrans2**: Translates to English based on detected language
197
+
198
+ **No manual language selection needed!** Just paste your text and get automatic detection + translation.
199
  """)
200
 
201
  # Status display
202
  status_display = gr.Textbox(
203
+ value="🚀 Loading IndicLID and IndicTrans2 models...",
204
+ label="🔧 Pipeline Status",
205
+ interactive=False,
206
+ lines=3
207
  )
208
 
209
  with gr.Row():
210
+ with gr.Column(scale=1):
211
  input_text = gr.Textbox(
212
+ label="📝 Input Text (Any Indian Language)",
213
+ placeholder="Enter text in Hindi, Bengali, Tamil, Telugu, Gujarati, Kannada, Malayalam, Marathi, Punjabi, Urdu, etc...\n\nIndicLID will automatically detect the language!",
214
+ lines=6,
215
+ max_lines=10
216
  )
217
 
218
+ translate_btn = gr.Button(
219
+ "🔍➡️🔄 Auto-Detect & Translate",
220
+ variant="primary",
221
+ size="lg"
222
  )
223
 
224
+ with gr.Column(scale=1):
225
+ translation_output = gr.Textbox(
226
+ label="🇬🇧 English Translation",
227
+ lines=6,
228
+ max_lines=10,
229
+ interactive=False,
230
+ placeholder="Automatic translation will appear here..."
231
  )
232
 
233
  with gr.Row():
234
+ detected_language = gr.Textbox(
235
+ label="🌐 Auto-Detected Language",
236
  interactive=False,
237
+ scale=2,
238
+ placeholder="Language will be detected automatically"
239
  )
240
+ confidence_score = gr.Number(
241
+ label="📊 Detection Confidence",
242
  interactive=False,
243
+ scale=1,
244
+ precision=3
245
  )
246
 
247
+ # Examples showcasing automatic detection
248
+ gr.Markdown("### 📖 Try These Examples (Automatic Detection!):")
249
  gr.Examples(
250
  examples=[
251
+ ["मैं आज बाजार जा रहा हूं।"], # Hindi
252
+ ["আমি আজ বাজারে যাচ্ছি।"], # Bengali
253
+ ["நான் இன்று சந்தைக்கு போகிறேன்।"], # Tamil
254
+ ["ನಾನು ಇಂದು ಮಾರುಕಟ್ಟೆಗೆ ಹೋಗುತ್ತಿದ್ದೇನೆ।"], # Kannada
255
+ ["હું આજે બજારમાં જાઉં છું।"], # Gujarati
256
+ ["मी आज बाजारात जात आहे।"], # Marathi
257
+ ["میں آج بازار جا رہا ہوں۔"], # Urdu
258
+ ["ਮੈਂ ਅੱਜ ਬਾਜ਼ਾਰ ਜਾ ਰਿਹਾ ਹਾਂ।"], # Punjabi
259
+ ["నేను ఈరోజు మార్కెట్‌కి వెళ్తున్నాను।"], # Telugu
260
+ ["ഞാൻ ഇന്ന് മാർക്കറ്റിൽ പോകുന്നു।"] # Malayalam
261
  ],
262
+ inputs=[input_text],
263
+ label="Click any example to test automatic detection!"
264
  )
265
 
266
+ # Information about supported languages
267
+ gr.Markdown("""
268
+ ### 🌐 Supported Languages for Auto-Detection:
269
+ **IndicLID can automatically detect:** Hindi, Bengali, Tamil, Telugu, Gujarati, Kannada, Malayalam, Marathi,
270
+ Punjabi, Urdu, Odia, Assamese, Nepali, Kashmiri, Sindhi, Sanskrit, and English.
271
+
272
+ ### ✨ How it works:
273
+ 1. You paste text in **any** supported Indian language
274
+ 2. **IndicLID** automatically identifies the language (no manual selection!)
275
+ 3. **IndicTrans2** translates it to English based on the detected language
276
+ """)
277
+
278
+ # Event handlers for automatic detection + translation
279
  translate_btn.click(
280
+ fn=automatic_detect_and_translate,
281
+ inputs=[input_text],
282
+ outputs=[translation_output, detected_language, confidence_score]
283
  )
284
 
285
+ # Auto-submit on Enter key
286
  input_text.submit(
287
+ fn=automatic_detect_and_translate,
288
+ inputs=[input_text],
289
+ outputs=[translation_output, detected_language, confidence_score]
290
  )
291
 
292
+ # Load models on startup
293
+ demo.load(load_models, outputs=[status_display])
 
 
 
 
294
 
295
  return demo
296
 
297
  if __name__ == "__main__":
298
+ print("🚀 Starting IndicLID → IndicTrans2 Automatic Pipeline")
299
+ print("🔍 IndicLID will handle automatic language detection")
300
+ print("🔄 IndicTrans2 will handle translation to English")
301
+
302
  demo = create_interface()
303
+ demo.launch(
304
+ server_name="0.0.0.0",
305
+ server_port=7860,
306
+ share=True
307
+ )