Spaces:

Noumida
/

indic-lid_trans2

Sleeping

App Files Files Community

indic-lid_trans2 / app.py

Noumida

Update app.py

b7caabd verified 4 months ago

raw

history blame

12.3 kB

	import gradio as gr
	import torch
	import sys
	import os
	from pathlib import Path
	import warnings
	warnings.filterwarnings("ignore")

	# Bootstrap step 1: run the bundled setup script when the IndicLID module file is
	# not present in the working directory yet.
	# NOTE(review): exec() runs these helper scripts inside this module's namespace.
	# They are read from the working directory, so only trusted files may live there.
	if not os.path.exists("ai4bharat/IndicLID.py"):
	    print("🚀 Setting up IndicLID for the first time...")
	    # Context manager so the script's file handle is closed deterministically.
	    with open("setup_indiclid.py", encoding="utf-8") as setup_script:
	        exec(setup_script.read())

	# Bootstrap step 2: load torch safe globals first (best effort).
	try:
	    with open("torch_safe_globals.py", encoding="utf-8") as safe_globals_script:
	        exec(safe_globals_script.read())
	    print("✅ Torch safe globals loaded")
	except Exception as e:
	    # Deliberately non-fatal, but report the cause instead of hiding it, and let
	    # KeyboardInterrupt / SystemExit propagate (a bare except would swallow them).
	    print(f"⚠️ Could not load torch safe globals: {e}")

	# Add current directory to Python path
	sys.path.insert(0, os.getcwd())

	# Import required libraries
	from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
	from IndicTransToolkit.processor import IndicProcessor

	# Import IndicLID - required for automatic language detection.
	# Only the import sits inside the try body so that nothing else can be mistaken
	# for an import failure.
	try:
	    from ai4bharat.IndicLID import IndicLID
	except ImportError as e:
	    INDICLID_AVAILABLE = False
	    print(f"❌ IndicLID import failed: {e}")
	    # Chain the original ImportError so the traceback shows the real root cause.
	    raise Exception("IndicLID is required for automatic language detection!") from e
	else:
	    INDICLID_AVAILABLE = True
	    print("✅ IndicLID imported successfully - Automatic language detection enabled")

	# Pick the compute device: CUDA when torch reports it as available, CPU otherwise.
	if torch.cuda.is_available():
	    device = torch.device("cuda")
	else:
	    device = torch.device("cpu")
	print(f"🔧 Using device: {device}")

	# Lookup table: language name (as used for the detected language in this file)
	# -> IndicTrans2 language code in "<lang>_<Script>" form.
	LID_TO_TRANS2_MAPPING = {
	    "hindi": "hin_Deva",
	    "bengali": "ben_Beng",
	    "gujarati": "guj_Gujr",
	    "kannada": "kan_Knda",
	    "malayalam": "mal_Mlym",
	    "marathi": "mar_Deva",
	    "nepali": "npi_Deva",
	    "odia": "ory_Orya",
	    "punjabi": "pan_Guru",
	    "tamil": "tam_Taml",
	    "telugu": "tel_Telu",
	    "urdu": "urd_Arab",
	    "assamese": "asm_Beng",
	    "kashmiri": "kas_Arab",
	    "sindhi": "snd_Arab",
	    "sanskrit": "san_Deva",
	    "english": "eng_Latn",
	}

	# Module-level slots for the detector, translator, tokenizer and IndicProcessor.
	# All start as None and are assigned by load_models().
	lid_model = translation_model = tokenizer = ip = None

	def load_models():
	    """Load IndicLID (detection) and IndicTrans2 (translation) into module globals.

	    Assigns the module-level ``lid_model``, ``tokenizer``, ``translation_model``
	    and ``ip``. The call is idempotent: once all four are set, later calls return
	    the success message immediately instead of loading the models again.

	    Returns:
	        str: A human-readable status message. On failure the message contains the
	        exception text; the exception itself is not re-raised.
	    """
	    global lid_model, translation_model, tokenizer, ip

	    success_msg = "✅ Both models loaded successfully!\n🔍 IndicLID: Automatic language detection\n🔄 IndicTrans2: Translation to English"

	    # This function is bound to the Blocks load event, which Gradio triggers each
	    # time the page is opened in a browser. Skip the expensive reload when
	    # everything is already in place.
	    if all(obj is not None for obj in (lid_model, translation_model, tokenizer, ip)):
	        return success_msg

	    try:
	        # Step 1: Load IndicLID for automatic language detection
	        print("🔍 Loading IndicLID for automatic language detection...")
	        lid_model = IndicLID(input_threshold=0.5, roman_lid_threshold=0.6)
	        print("✅ IndicLID loaded successfully - Ready for automatic language detection!")

	        # Step 2: Load IndicTrans2 for translation
	        print("🔄 Loading IndicTrans2 for translation...")
	        model_name = "ai4bharat/indictrans2-indic-en-1B"
	        tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
	        translation_model = AutoModelForSeq2SeqLM.from_pretrained(
	            model_name,
	            trust_remote_code=True,
	            # Half precision only when CUDA is available; full precision otherwise.
	            torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
	        ).to(device)

	        ip = IndicProcessor(inference=True)
	        print("✅ IndicTrans2 loaded successfully - Ready for translation!")

	        return success_msg

	    except Exception as e:
	        # UI boundary: surface the failure in the status box rather than crashing.
	        error_msg = f"❌ Error loading models: {e}"
	        print(error_msg)
	        return error_msg

	def _translate_text(text, src_lang_code, tgt_lang_code="eng_Latn"):
	    """Translate one string with the loaded IndicTrans2 model and return the result."""
	    # Preprocess for IndicTrans2
	    batch = ip.preprocess_batch(
	        [text],
	        src_lang=src_lang_code,
	        tgt_lang=tgt_lang_code,
	    )

	    # Tokenize and move the tensors to the same device as the model
	    inputs = tokenizer(
	        batch,
	        truncation=True,
	        padding="longest",
	        return_tensors="pt",
	        return_attention_mask=True,
	    ).to(device)

	    # Generate translation (inference only, so no gradients are tracked)
	    with torch.no_grad():
	        generated_tokens = translation_model.generate(
	            **inputs,
	            use_cache=True,
	            min_length=0,
	            max_length=256,
	            num_beams=5,
	            num_return_sequences=1,
	        )

	    # Decode and postprocess
	    decoded = tokenizer.batch_decode(
	        generated_tokens,
	        skip_special_tokens=True,
	        clean_up_tokenization_spaces=True,
	    )
	    return ip.postprocess_batch(decoded, lang=tgt_lang_code)[0]


	def automatic_detect_and_translate(input_text):
	    """Detect the language of ``input_text`` with IndicLID and translate it to English.

	    Args:
	        input_text: Text entered by the user. ``None`` and blank strings are
	            treated as empty input.

	    Returns:
	        tuple: ``(translation, detected_language, confidence)``. ``translation``
	        holds the English text, the unchanged input when English is detected, or
	        a user-facing message. On empty input, unloaded models, or any error the
	        last two items are ``""`` and ``0.0``.
	    """
	    # Explicit None checks: readiness should not depend on the objects' truthiness.
	    if any(part is None for part in (lid_model, translation_model, tokenizer, ip)):
	        return "❌ Models not loaded. Please wait for initialization.", "", 0.0

	    if not input_text or not input_text.strip():
	        return "Please enter text for automatic detection and translation.", "", 0.0

	    try:
	        # STEP 1: AUTOMATIC LANGUAGE DETECTION USING INDICLID
	        print(f"🔍 Detecting language for: {input_text[:50]}...")
	        lid_result = lid_model.batch_predict([input_text])

	        # Extract language detection results
	        lang_info = lid_result[0]['langinfo']
	        detected_lang = lang_info['text_lang']
	        # Plain float so the Number output receives a built-in numeric type.
	        confidence = float(lang_info['text_lang_score'])

	        print(f"✅ IndicLID detected: {detected_lang} (confidence: {confidence:.3f})")

	        # Normalise once so the English check and the mapping lookup agree on case
	        # (the mapping keys are all lowercase).
	        lang_key = detected_lang.lower()

	        # STEP 2: TRANSLATION USING INDICTRANS2 (if not English)
	        if lang_key == 'english':
	            translation = input_text
	            print("ℹ️ Text is already in English, no translation needed")
	        elif lang_key in LID_TO_TRANS2_MAPPING:
	            src_lang_code = LID_TO_TRANS2_MAPPING[lang_key]
	            target_lang_code = "eng_Latn"

	            print(f"🔄 Translating from {src_lang_code} to {target_lang_code}...")
	            translation = _translate_text(input_text, src_lang_code, target_lang_code)
	            print(f"✅ Translation completed: {translation}")
	        else:
	            translation = f"❌ Language '{detected_lang}' not supported for translation"
	            print(f"⚠️ {translation}")

	        return translation, detected_lang.title(), confidence

	    except Exception as e:
	        # UI boundary: report the failure in the output box instead of raising.
	        error_msg = f"❌ Error in detection/translation pipeline: {e}"
	        print(error_msg)
	        return error_msg, "", 0.0

	def create_interface():
	    """Build and return the Gradio Blocks app for the detection + translation pipeline.

	    Layout, top to bottom: intro text, a read-only status box, an input/output
	    row, a detected-language/confidence row, clickable examples and a help
	    section. Both the button click and the textbox submit event call
	    ``automatic_detect_and_translate``; ``load_models`` is bound to the Blocks
	    load event and writes its status message into the status box.

	    Returns:
	        The constructed ``gr.Blocks`` instance (not yet launched).
	    """
	    # NOTE: the multi-line Markdown literals below keep their original line
	    # layout verbatim, because their content is runtime text.
	    with gr.Blocks(
	        title="IndicLID → IndicTrans2 Pipeline",
	        theme=gr.themes.Soft(),
	    ) as demo:

	        # --- Intro -----------------------------------------------------------
	        gr.Markdown("""
	# 🔍➡️🔄 Automatic Language Detection + Translation

	Complete Pipeline: IndicLID → IndicTrans2

	1. 🔍 IndicLID: Automatically detects your input language
	2. 🔄 IndicTrans2: Translates to English based on detected language

	No manual language selection needed! Just paste your text and get automatic detection + translation.
	""")

	        # --- Read-only status box, filled in by load_models ------------------
	        status_box = gr.Textbox(
	            label="🔧 Pipeline Status",
	            value="🚀 Loading IndicLID and IndicTrans2 models...",
	            lines=3,
	            interactive=False,
	        )

	        # --- Input on the left, translation on the right ---------------------
	        with gr.Row():
	            with gr.Column(scale=1):
	                source_box = gr.Textbox(
	                    label="📝 Input Text (Any Indian Language)",
	                    placeholder="Enter text in Hindi, Bengali, Tamil, Telugu, Gujarati, Kannada, Malayalam, Marathi, Punjabi, Urdu, etc...\n\nIndicLID will automatically detect the language!",
	                    lines=6,
	                    max_lines=10,
	                )

	                run_button = gr.Button(
	                    "🔍➡️🔄 Auto-Detect & Translate",
	                    variant="primary",
	                    size="lg",
	                )

	            with gr.Column(scale=1):
	                english_box = gr.Textbox(
	                    label="🇬🇧 English Translation",
	                    placeholder="Automatic translation will appear here...",
	                    lines=6,
	                    max_lines=10,
	                    interactive=False,
	                )

	        # --- Detection details ----------------------------------------------
	        with gr.Row():
	            language_box = gr.Textbox(
	                label="🌐 Auto-Detected Language",
	                placeholder="Language will be detected automatically",
	                interactive=False,
	                scale=2,
	            )
	            confidence_box = gr.Number(
	                label="📊 Detection Confidence",
	                precision=3,
	                interactive=False,
	                scale=1,
	            )

	        # --- Clickable sample sentences, one per language --------------------
	        sample_sentences = [
	            ["मैं आज बाजार जा रहा हूं।"],  # Hindi
	            ["আমি আজ বাজারে যাচ্ছি।"],  # Bengali
	            ["நான் இன்று சந்தைக்கு போகிறேன்।"],  # Tamil
	            ["ನಾನು ಇಂದು ಮಾರುಕಟ್ಟೆಗೆ ಹೋಗುತ್ತಿದ್ದೇನೆ।"],  # Kannada
	            ["હું આજે બજારમાં જાઉં છું।"],  # Gujarati
	            ["मी आज बाजारात जात आहे।"],  # Marathi
	            ["میں آج بازار جا رہا ہوں۔"],  # Urdu
	            ["ਮੈਂ ਅੱਜ ਬਾਜ਼ਾਰ ਜਾ ਰਿਹਾ ਹਾਂ।"],  # Punjabi
	            ["నేను ఈరోజు మార్కెట్‌కి వెళ్తున్నాను।"],  # Telugu
	            ["ഞാൻ ഇന്ന് മാർക്കറ്റിൽ പോകുന്നു।"],  # Malayalam
	        ]
	        gr.Markdown("### 📖 Try These Examples (Automatic Detection!):")
	        gr.Examples(
	            examples=sample_sentences,
	            inputs=[source_box],
	            label="Click any example to test automatic detection!",
	        )

	        # --- Help text -------------------------------------------------------
	        gr.Markdown("""
	### 🌐 Supported Languages for Auto-Detection:
	IndicLID can automatically detect: Hindi, Bengali, Tamil, Telugu, Gujarati, Kannada, Malayalam, Marathi,
	Punjabi, Urdu, Odia, Assamese, Nepali, Kashmiri, Sindhi, Sanskrit, and English.

	### ✨ How it works:
	1. You paste text in any supported Indian language
	2. IndicLID automatically identifies the language (no manual selection!)
	3. IndicTrans2 translates it to English based on the detected language
	""")

	        # --- Wiring: the button click and the textbox submit event run the
	        # same pipeline with the same inputs and outputs ----------------------
	        pipeline_binding = dict(
	            fn=automatic_detect_and_translate,
	            inputs=[source_box],
	            outputs=[english_box, language_box, confidence_box],
	        )
	        run_button.click(**pipeline_binding)
	        source_box.submit(**pipeline_binding)

	        # Load the models when the app is loaded and show the resulting status
	        demo.load(load_models, outputs=[status_box])

	    return demo

	if __name__ == "__main__":
	    # Startup banner, printed before the UI is constructed.
	    for banner_line in (
	        "🚀 Starting IndicLID → IndicTrans2 Automatic Pipeline",
	        "🔍 IndicLID will handle automatic language detection",
	        "🔄 IndicTrans2 will handle translation to English",
	    ):
	        print(banner_line)

	    # Listen on all interfaces at port 7860 and request a share link.
	    launch_options = {
	        "server_name": "0.0.0.0",
	        "server_port": 7860,
	        "share": True,
	    }

	    demo = create_interface()
	    demo.launch(**launch_options)