Noumida committed on
Commit
f14831e
·
verified ·
1 Parent(s): df13707

Update ai4bharat/IndicLID.py

Browse files
Files changed (1) hide show
  1. ai4bharat/IndicLID.py +53 -32
ai4bharat/IndicLID.py CHANGED
@@ -43,16 +43,13 @@ class IndicLID():
43
  self.IndicLID_FTR_path = "models/indiclid-ftr.bin"
44
  self.IndicLID_BERT_path = "models/indiclid-bert"
45
 
46
- # Language mappings
47
  self.IndicLID_labels = [
48
- 'asm_Beng', 'asm_Latn', 'ben_Beng', 'ben_Latn', 'brx_Deva', 'brx_Latn',
49
- 'doi_Deva', 'doi_Latn', 'eng_Latn', 'guj_Gujr', 'guj_Latn', 'hin_Deva',
50
- 'hin_Latn', 'kan_Knda', 'kan_Latn', 'kas_Arab', 'kas_Deva', 'kas_Latn',
51
- 'kok_Deva', 'kok_Latn', 'mai_Deva', 'mai_Latn', 'mal_Mlym', 'mal_Latn',
52
- 'mni_Beng', 'mni_Meti', 'mni_Latn', 'mar_Deva', 'mar_Latn', 'nep_Deva',
53
- 'nep_Latn', 'ori_Orya', 'ori_Latn', 'pan_Guru', 'pan_Latn', 'san_Deva',
54
- 'san_Latn', 'sat_Olch', 'snd_Arab', 'snd_Latn', 'tam_Taml', 'tam_Latn',
55
- 'tel_Telu', 'tel_Latn', 'urd_Arab', 'urd_Latn', 'other'
56
  ]
57
 
58
  # Load models
@@ -107,7 +104,7 @@ class IndicLID():
107
  output_dict[idx] = (text, label, score, 'IndicLID-FTN')
108
  else:
109
  # Fallback - simple heuristic based on script
110
- detected_lang = self.detect_script_simple(text)
111
  output_dict[idx] = (text, detected_lang, 0.8, 'Script-based')
112
 
113
  return output_dict
@@ -170,12 +167,16 @@ class IndicLID():
170
  print(f"BERT inference error: {e}")
171
  return 'eng_Latn', 0.5
172
 
173
- def detect_script_simple(self, text):
174
- """Simple script detection based on Unicode ranges"""
175
- # Check for common Indian scripts
176
  if any(ord(char) >= 0x0900 and ord(char) <= 0x097F for char in text): # Devanagari
 
 
177
  return 'hin_Deva'
178
  elif any(ord(char) >= 0x0980 and ord(char) <= 0x09FF for char in text): # Bengali
 
 
179
  return 'ben_Beng'
180
  elif any(ord(char) >= 0x0B80 and ord(char) <= 0x0BFF for char in text): # Tamil
181
  return 'tam_Taml'
@@ -189,6 +190,15 @@ class IndicLID():
189
  return 'guj_Gujr'
190
  elif any(ord(char) >= 0x0A00 and ord(char) <= 0x0A7F for char in text): # Gurmukhi (Punjabi)
191
  return 'pan_Guru'
 
 
 
 
 
 
 
 
 
192
  else:
193
  return 'eng_Latn' # Default to English
194
 
@@ -232,7 +242,8 @@ class IndicLID():
232
  'text_lang': lang_name,
233
  'text_lang_score': confidence,
234
  'script': 'native' if not self.is_roman_text(sentence) else 'roman',
235
- 'model_used': model_used
 
236
  }
237
  }
238
  results.append(result_dict)
@@ -240,25 +251,35 @@ class IndicLID():
240
  return results
241
 
242
  def code_to_language(self, lang_code):
243
- """Convert language code to language name"""
244
  code_to_lang = {
245
- 'hin_Deva': 'hindi', 'hin_Latn': 'hindi',
246
- 'ben_Beng': 'bengali', 'ben_Latn': 'bengali',
247
- 'tam_Taml': 'tamil', 'tam_Latn': 'tamil',
248
- 'tel_Telu': 'telugu', 'tel_Latn': 'telugu',
249
- 'kan_Knda': 'kannada', 'kan_Latn': 'kannada',
250
- 'mal_Mlym': 'malayalam', 'mal_Latn': 'malayalam',
251
- 'guj_Gujr': 'gujarati', 'guj_Latn': 'gujarati',
252
- 'pan_Guru': 'punjabi', 'pan_Latn': 'punjabi',
253
- 'mar_Deva': 'marathi', 'mar_Latn': 'marathi',
254
- 'urd_Arab': 'urdu', 'urd_Latn': 'urdu',
255
- 'ori_Orya': 'odia', 'ori_Latn': 'odia',
256
- 'asm_Beng': 'assamese', 'asm_Latn': 'assamese',
257
- 'nep_Deva': 'nepali', 'nep_Latn': 'nepali',
258
- 'kas_Arab': 'kashmiri', 'kas_Deva': 'kashmiri', 'kas_Latn': 'kashmiri',
259
- 'snd_Arab': 'sindhi', 'snd_Latn': 'sindhi',
260
- 'san_Deva': 'sanskrit', 'san_Latn': 'sanskrit',
 
 
 
 
 
 
 
 
 
 
261
  'eng_Latn': 'english',
262
  'other': 'other'
263
  }
264
- return code_to_lang.get(lang_code, lang_code.split('_')[0])
 
43
  self.IndicLID_FTR_path = "models/indiclid-ftr.bin"
44
  self.IndicLID_BERT_path = "models/indiclid-bert"
45
 
46
+ # Updated language labels - Complete list as per your specification
47
  self.IndicLID_labels = [
48
+ 'asm_Beng', 'ben_Beng', 'brx_Deva', 'doi_Deva', 'gom_Deva', 'guj_Gujr',
49
+ 'hin_Deva', 'kan_Knda', 'kas_Arab', 'kas_Deva', 'mai_Deva', 'mal_Mlym',
50
+ 'mni_Beng', 'mni_Mtei', 'mar_Deva', 'npi_Deva', 'ory_Orya', 'pan_Guru',
51
+ 'san_Deva', 'sat_Olck', 'snd_Arab', 'snd_Deva', 'tam_Taml', 'tel_Telu',
52
+ 'urd_Arab', 'eng_Latn', 'other'
 
 
 
53
  ]
54
 
55
  # Load models
 
104
  output_dict[idx] = (text, label, score, 'IndicLID-FTN')
105
  else:
106
  # Fallback - simple heuristic based on script
107
+ detected_lang = self.detect_script_enhanced(text)
108
  output_dict[idx] = (text, detected_lang, 0.8, 'Script-based')
109
 
110
  return output_dict
 
167
  print(f"BERT inference error: {e}")
168
  return 'eng_Latn', 0.5
169
 
170
+ def detect_script_enhanced(self, text):
171
+ """Enhanced script detection based on Unicode ranges for all supported languages"""
172
+ # Check for various Indian scripts
173
  if any(ord(char) >= 0x0900 and ord(char) <= 0x097F for char in text): # Devanagari
174
+ # Could be Hindi, Marathi, Sanskrit, Nepali, Dogri, Maithili, Sindhi
175
+ # Simple heuristic - default to Hindi for Devanagari
176
  return 'hin_Deva'
177
  elif any(ord(char) >= 0x0980 and ord(char) <= 0x09FF for char in text): # Bengali
178
+ # Could be Bengali, Assamese, Manipuri
179
+ # Default to Bengali
180
  return 'ben_Beng'
181
  elif any(ord(char) >= 0x0B80 and ord(char) <= 0x0BFF for char in text): # Tamil
182
  return 'tam_Taml'
 
190
  return 'guj_Gujr'
191
  elif any(ord(char) >= 0x0A00 and ord(char) <= 0x0A7F for char in text): # Gurmukhi (Punjabi)
192
  return 'pan_Guru'
193
+ elif any(ord(char) >= 0x0B00 and ord(char) <= 0x0B7F for char in text): # Odia
194
+ return 'ory_Orya'
195
+ elif any(ord(char) >= 0x0600 and ord(char) <= 0x06FF for char in text): # Arabic script
196
+ # Could be Urdu, Kashmiri, Sindhi
197
+ return 'urd_Arab'
198
+ elif any(ord(char) >= 0x1C00 and ord(char) <= 0x1C4F for char in text): # Ol Chiki (Santali)
199
+ return 'sat_Olck'
200
+ elif any(ord(char) >= 0xAAE0 and ord(char) <= 0xAAFF for char in text): # Meetei Mayek (Manipuri)
201
+ return 'mni_Mtei'
202
  else:
203
  return 'eng_Latn' # Default to English
204
 
 
242
  'text_lang': lang_name,
243
  'text_lang_score': confidence,
244
  'script': 'native' if not self.is_roman_text(sentence) else 'roman',
245
+ 'model_used': model_used,
246
+ 'lang_code': lang_code
247
  }
248
  }
249
  results.append(result_dict)
 
251
  return results
252
 
253
  def code_to_language(self, lang_code):
254
+ """Convert language code to language name - Updated with all 26 languages"""
255
  code_to_lang = {
256
+ # Complete mapping for all supported languages
257
+ 'asm_Beng': 'assamese',
258
+ 'ben_Beng': 'bengali',
259
+ 'brx_Deva': 'bodo',
260
+ 'doi_Deva': 'dogri',
261
+ 'gom_Deva': 'konkani', # Goan Konkani
262
+ 'guj_Gujr': 'gujarati',
263
+ 'hin_Deva': 'hindi',
264
+ 'kan_Knda': 'kannada',
265
+ 'kas_Arab': 'kashmiri',
266
+ 'kas_Deva': 'kashmiri',
267
+ 'mai_Deva': 'maithili',
268
+ 'mal_Mlym': 'malayalam',
269
+ 'mni_Beng': 'manipuri',
270
+ 'mni_Mtei': 'manipuri',
271
+ 'mar_Deva': 'marathi',
272
+ 'npi_Deva': 'nepali',
273
+ 'ory_Orya': 'odia',
274
+ 'pan_Guru': 'punjabi',
275
+ 'san_Deva': 'sanskrit',
276
+ 'sat_Olck': 'santali',
277
+ 'snd_Arab': 'sindhi',
278
+ 'snd_Deva': 'sindhi',
279
+ 'tam_Taml': 'tamil',
280
+ 'tel_Telu': 'telugu',
281
+ 'urd_Arab': 'urdu',
282
  'eng_Latn': 'english',
283
  'other': 'other'
284
  }
285
+ return code_to_lang.get(lang_code, lang_code.split('_')[0] if '_' in lang_code else lang_code)