Spaces:
Sleeping
Sleeping
Update ai4bharat/IndicLID.py
Browse files- ai4bharat/IndicLID.py +53 -32
ai4bharat/IndicLID.py
CHANGED
|
@@ -43,16 +43,13 @@ class IndicLID():
|
|
| 43 |
self.IndicLID_FTR_path = "models/indiclid-ftr.bin"
|
| 44 |
self.IndicLID_BERT_path = "models/indiclid-bert"
|
| 45 |
|
| 46 |
-
#
|
| 47 |
self.IndicLID_labels = [
|
| 48 |
-
'asm_Beng', '
|
| 49 |
-
'
|
| 50 |
-
'
|
| 51 |
-
'
|
| 52 |
-
'
|
| 53 |
-
'nep_Latn', 'ori_Orya', 'ori_Latn', 'pan_Guru', 'pan_Latn', 'san_Deva',
|
| 54 |
-
'san_Latn', 'sat_Olch', 'snd_Arab', 'snd_Latn', 'tam_Taml', 'tam_Latn',
|
| 55 |
-
'tel_Telu', 'tel_Latn', 'urd_Arab', 'urd_Latn', 'other'
|
| 56 |
]
|
| 57 |
|
| 58 |
# Load models
|
|
@@ -107,7 +104,7 @@ class IndicLID():
|
|
| 107 |
output_dict[idx] = (text, label, score, 'IndicLID-FTN')
|
| 108 |
else:
|
| 109 |
# Fallback - simple heuristic based on script
|
| 110 |
-
detected_lang = self.
|
| 111 |
output_dict[idx] = (text, detected_lang, 0.8, 'Script-based')
|
| 112 |
|
| 113 |
return output_dict
|
|
@@ -170,12 +167,16 @@ class IndicLID():
|
|
| 170 |
print(f"BERT inference error: {e}")
|
| 171 |
return 'eng_Latn', 0.5
|
| 172 |
|
| 173 |
-
def
|
| 174 |
-
"""
|
| 175 |
-
# Check for
|
| 176 |
if any(ord(char) >= 0x0900 and ord(char) <= 0x097F for char in text): # Devanagari
|
|
|
|
|
|
|
| 177 |
return 'hin_Deva'
|
| 178 |
elif any(ord(char) >= 0x0980 and ord(char) <= 0x09FF for char in text): # Bengali
|
|
|
|
|
|
|
| 179 |
return 'ben_Beng'
|
| 180 |
elif any(ord(char) >= 0x0B80 and ord(char) <= 0x0BFF for char in text): # Tamil
|
| 181 |
return 'tam_Taml'
|
|
@@ -189,6 +190,15 @@ class IndicLID():
|
|
| 189 |
return 'guj_Gujr'
|
| 190 |
elif any(ord(char) >= 0x0A00 and ord(char) <= 0x0A7F for char in text): # Gurmukhi (Punjabi)
|
| 191 |
return 'pan_Guru'
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 192 |
else:
|
| 193 |
return 'eng_Latn' # Default to English
|
| 194 |
|
|
@@ -232,7 +242,8 @@ class IndicLID():
|
|
| 232 |
'text_lang': lang_name,
|
| 233 |
'text_lang_score': confidence,
|
| 234 |
'script': 'native' if not self.is_roman_text(sentence) else 'roman',
|
| 235 |
-
'model_used': model_used
|
|
|
|
| 236 |
}
|
| 237 |
}
|
| 238 |
results.append(result_dict)
|
|
@@ -240,25 +251,35 @@ class IndicLID():
|
|
| 240 |
return results
|
| 241 |
|
| 242 |
def code_to_language(self, lang_code):
|
| 243 |
-
"""Convert language code to language name"""
|
| 244 |
code_to_lang = {
|
| 245 |
-
|
| 246 |
-
'
|
| 247 |
-
'
|
| 248 |
-
'
|
| 249 |
-
'
|
| 250 |
-
'
|
| 251 |
-
'guj_Gujr': 'gujarati',
|
| 252 |
-
'
|
| 253 |
-
'
|
| 254 |
-
'
|
| 255 |
-
'
|
| 256 |
-
'
|
| 257 |
-
'
|
| 258 |
-
'
|
| 259 |
-
'
|
| 260 |
-
'
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 261 |
'eng_Latn': 'english',
|
| 262 |
'other': 'other'
|
| 263 |
}
|
| 264 |
-
return code_to_lang.get(lang_code, lang_code.split('_')[0])
|
|
|
|
| 43 |
self.IndicLID_FTR_path = "models/indiclid-ftr.bin"
|
| 44 |
self.IndicLID_BERT_path = "models/indiclid-bert"
|
| 45 |
|
| 46 |
+
# Updated language labels - Complete list as per your specification
|
| 47 |
self.IndicLID_labels = [
|
| 48 |
+
'asm_Beng', 'ben_Beng', 'brx_Deva', 'doi_Deva', 'gom_Deva', 'guj_Gujr',
|
| 49 |
+
'hin_Deva', 'kan_Knda', 'kas_Arab', 'kas_Deva', 'mai_Deva', 'mal_Mlym',
|
| 50 |
+
'mni_Beng', 'mni_Mtei', 'mar_Deva', 'npi_Deva', 'ory_Orya', 'pan_Guru',
|
| 51 |
+
'san_Deva', 'sat_Olck', 'snd_Arab', 'snd_Deva', 'tam_Taml', 'tel_Telu',
|
| 52 |
+
'urd_Arab', 'eng_Latn', 'other'
|
|
|
|
|
|
|
|
|
|
| 53 |
]
|
| 54 |
|
| 55 |
# Load models
|
|
|
|
| 104 |
output_dict[idx] = (text, label, score, 'IndicLID-FTN')
|
| 105 |
else:
|
| 106 |
# Fallback - simple heuristic based on script
|
| 107 |
+
detected_lang = self.detect_script_enhanced(text)
|
| 108 |
output_dict[idx] = (text, detected_lang, 0.8, 'Script-based')
|
| 109 |
|
| 110 |
return output_dict
|
|
|
|
| 167 |
print(f"BERT inference error: {e}")
|
| 168 |
return 'eng_Latn', 0.5
|
| 169 |
|
| 170 |
+
def detect_script_enhanced(self, text):
|
| 171 |
+
"""Enhanced script detection based on Unicode ranges for all supported languages"""
|
| 172 |
+
# Check for various Indian scripts
|
| 173 |
if any(ord(char) >= 0x0900 and ord(char) <= 0x097F for char in text): # Devanagari
|
| 174 |
+
# Could be Hindi, Marathi, Sanskrit, Nepali, Dogri, Maithili, Sindhi
|
| 175 |
+
# Simple heuristic - default to Hindi for Devanagari
|
| 176 |
return 'hin_Deva'
|
| 177 |
elif any(ord(char) >= 0x0980 and ord(char) <= 0x09FF for char in text): # Bengali
|
| 178 |
+
# Could be Bengali, Assamese, Manipuri
|
| 179 |
+
# Default to Bengali
|
| 180 |
return 'ben_Beng'
|
| 181 |
elif any(ord(char) >= 0x0B80 and ord(char) <= 0x0BFF for char in text): # Tamil
|
| 182 |
return 'tam_Taml'
|
|
|
|
| 190 |
return 'guj_Gujr'
|
| 191 |
elif any(ord(char) >= 0x0A00 and ord(char) <= 0x0A7F for char in text): # Gurmukhi (Punjabi)
|
| 192 |
return 'pan_Guru'
|
| 193 |
+
elif any(ord(char) >= 0x0B00 and ord(char) <= 0x0B7F for char in text): # Odia
|
| 194 |
+
return 'ory_Orya'
|
| 195 |
+
elif any(ord(char) >= 0x0600 and ord(char) <= 0x06FF for char in text): # Arabic script
|
| 196 |
+
# Could be Urdu, Kashmiri, Sindhi
|
| 197 |
+
return 'urd_Arab'
|
| 198 |
+
elif any(ord(char) >= 0x1C00 and ord(char) <= 0x1C4F for char in text): # Ol Chiki (Santali)
|
| 199 |
+
return 'sat_Olck'
|
| 200 |
+
elif any(ord(char) >= 0xAAE0 and ord(char) <= 0xAAFF for char in text): # Meetei Mayek (Manipuri)
|
| 201 |
+
return 'mni_Mtei'
|
| 202 |
else:
|
| 203 |
return 'eng_Latn' # Default to English
|
| 204 |
|
|
|
|
| 242 |
'text_lang': lang_name,
|
| 243 |
'text_lang_score': confidence,
|
| 244 |
'script': 'native' if not self.is_roman_text(sentence) else 'roman',
|
| 245 |
+
'model_used': model_used,
|
| 246 |
+
'lang_code': lang_code
|
| 247 |
}
|
| 248 |
}
|
| 249 |
results.append(result_dict)
|
|
|
|
| 251 |
return results
|
| 252 |
|
| 253 |
def code_to_language(self, lang_code):
    """Return the lowercase English language name for a language label.

    Labels have the form ``<language>_<Script>`` (for example ``'hin_Deva'``),
    plus the catch-all ``'other'``. Script variants of the same language
    (``'kas_Arab'``/``'kas_Deva'``, ``'mni_Beng'``/``'mni_Mtei'``,
    ``'snd_Arab'``/``'snd_Deva'``) map to the same name.

    Args:
        lang_code: Label string to translate.

    Returns:
        The mapped language name. For a label that is not in the table, the
        text before its first underscore is returned (the whole label when
        it contains no underscore).

    Raises:
        TypeError: If ``lang_code`` is not a string.
    """
    # Fail with an explicit message instead of the opaque
    # "argument of type ... is not iterable" the membership test used to raise.
    if not isinstance(lang_code, str):
        raise TypeError(
            f"lang_code must be a str, got {type(lang_code).__name__}"
        )

    code_to_lang = {
        'asm_Beng': 'assamese',
        'ben_Beng': 'bengali',
        'brx_Deva': 'bodo',
        'doi_Deva': 'dogri',
        'gom_Deva': 'konkani',  # Goan Konkani
        'guj_Gujr': 'gujarati',
        'hin_Deva': 'hindi',
        'kan_Knda': 'kannada',
        'kas_Arab': 'kashmiri',
        'kas_Deva': 'kashmiri',
        'mai_Deva': 'maithili',
        'mal_Mlym': 'malayalam',
        'mni_Beng': 'manipuri',
        'mni_Mtei': 'manipuri',
        'mar_Deva': 'marathi',
        'npi_Deva': 'nepali',
        'ory_Orya': 'odia',
        'pan_Guru': 'punjabi',
        'san_Deva': 'sanskrit',
        'sat_Olck': 'santali',
        'snd_Arab': 'sindhi',
        'snd_Deva': 'sindhi',
        'tam_Taml': 'tamil',
        'tel_Telu': 'telugu',
        'urd_Arab': 'urdu',
        'eng_Latn': 'english',
        'other': 'other',
    }

    known_name = code_to_lang.get(lang_code)
    if known_name is not None:
        return known_name

    # Unknown label: fall back to the language part. split() returns the
    # whole string as its first element when there is no underscore, so no
    # separate "'_' in lang_code" check is needed.
    return lang_code.split('_', 1)[0]
|