# =========================================
# ENV FIXES
# =========================================
import os
os.environ["OMP_NUM_THREADS"] = "1" # libgomp hatası fix
# =========================================
# IMPORTS
# =========================================
import logging
from pathlib import Path
from typing import Dict, Tuple, List, Optional, Any
from collections import defaultdict
from uuid import uuid4
import numpy as np
import librosa
import soundfile as sf
import torch
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq, AutoFeatureExtractor, AutoModel
import plotly.graph_objects as go
import plotly.express as px
import pandas as pd
import gradio as gr
from elevenlabs import ElevenLabs, save
from regions_geojson import TURKEY_REGIONS_GEOJSON
# =========================================
# LOGGING SETUP
# =========================================
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
)
logger = logging.getLogger(__name__)
# =========================================
# CONSTANTS
# =========================================
MODEL_ID = "openai/whisper-large-v3"
MIN_AUDIO_DURATION = 3.0 # seconds
VOWEL_SHIFT_WEIGHT = 0.35
MARKER_WEIGHT = 0.40
PROSODY_WEIGHT = 0.25
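# These weights are combined in dialect_similarity(); they sum to 1.0, so the
# combined score stays in [0, 1] when each component score is in [0, 1].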
FAST_TEMPO_THRESHOLD = 140.0  # BPM; not referenced elsewhere in this file
SLOW_TEMPO_THRESHOLD = 80.0  # BPM; not referenced elsewhere in this file
TARGET_SAMPLE_RATE = 16000  # Whisper expects 16 kHz input
EMBED_MODEL_ID = "facebook/wav2vec2-large-xlsr-53"  # Turkish fine-tuning is detected automatically
EMBED_SAMPLE_RATE = 16000
DIALECT_REF_DIR = Path("data/dialects")
ELEVENLABS_VOICE_ID = "Q5n6GDIjpN0pLOlycRFT"
ELEVENLABS_MODEL_ID = "eleven_multilingual_v2"
# =========================================
# DEVICE CONFIGURATION
# =========================================
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
DTYPE = torch.float16 if DEVICE == "cuda" else torch.float32
logger.info(f"Using device: {DEVICE}, dtype: {DTYPE}")
# =========================================
# MODEL INITIALIZATION
# =========================================
try:
processor = AutoProcessor.from_pretrained(MODEL_ID)
model = AutoModelForSpeechSeq2Seq.from_pretrained(
MODEL_ID,
torch_dtype=DTYPE
)
model = model.to(DEVICE)
model.eval()
logger.info("Model loaded successfully")
except Exception as e:
logger.error(f"Error loading model: {e}")
raise
# =========================================
# EMBEDDING MODEL INITIALIZATION
# Note: Embedding model is disabled - we use transcription-based dialect analysis instead
# =========================================
embed_feature_extractor = None
embed_model = None
logger.debug("Embedding model disabled - using transcription-based analysis only")
DIALECT_REF_EMBEDDINGS: Dict[str, List[np.ndarray]] = defaultdict(list)
DIALECT_PROTOTYPES: Dict[str, np.ndarray] = {}
# =========================================
# ELEVENLABS CLIENT
# =========================================
ELEVENLABS_API_KEY = os.environ.get("ELEVENLABS_API_KEY")
if ELEVENLABS_API_KEY:
try:
elevenlabs_client = ElevenLabs(api_key=ELEVENLABS_API_KEY)
logger.info("ElevenLabs client initialized")
except Exception as e:
elevenlabs_client = None
logger.warning(f"Failed to initialize ElevenLabs client: {e}")
else:
elevenlabs_client = None
logger.warning("ELEVENLABS_API_KEY not found. Voice replies will be disabled.")
# =========================================
# DIALECT PROFILES
# =========================================
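# Each profile is a heuristic feature template used by the transcription-based analysis:
#   vowel_shifts: characteristic vowel changes with rough weights in [0, 1]
#   markers:      region-typical lexical items searched for in the transcript
#   prosody:      qualitative prosody label (checked for "inişli-çıkışlı" oscillation)
#   tempo_range:  expected tempo estimate in BPM (from librosa)
#   pitch_range:  qualitative F0 category used by prosody_score()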
DIALECT_PROFILES: Dict[str, Dict[str, Any]] = {
"Marmara": {
"description": "İstanbul-Trakya şehir aksanı, düz prosodi.",
"vowel_shifts": {"a→ı": 0.1, "ı→i": 0.15, "e→i": 0.15, "o→u": 0.1},
"markers": ["abi", "aynen", "bi şey dicem", "baksana"],
"prosody": "düz-orta",
"tempo_range": (100, 130),
"pitch_range": "neutral"
},
"Ege": {
"description": "Melodik, uzatmalı, 'gari', 'hee' kültürüne sahip.",
"vowel_shifts": {"e→ee": 0.85, "o→oo": 0.75, "a→aa": 0.4},
"markers": ["gari", "hee", "ebe", "söyleyiver"],
"prosody": "yavaş-uzatmalı",
"tempo_range": (60, 90),
"pitch_range": "medium"
},
"Akdeniz": {
"description": "Hızlı, enerjik, 'la' baskın aksan.",
"vowel_shifts": {"a→aa": 0.65, "ı→i": 0.35},
"markers": ["la", "naapıyon la", "hee la"],
"prosody": "enerjik-hızlı",
"tempo_range": (130, 160),
"pitch_range": "high"
},
"İç Anadolu": {
"description": "Düz ritmik, ı/i kaymaları belirgin.",
"vowel_shifts": {"ı→i": 0.7, "a→ı": 0.5, "o→u": 0.3},
"markers": ["gelisen", "gideceksen", "hele bi dur"],
"prosody": "düz-ritmik",
"tempo_range": (100, 125),
"pitch_range": "neutral"
},
"Karadeniz": {
"description": "Yüksek tonlama, hızlı, ünlü daralması.",
"vowel_shifts": {"e→i": 0.9, "ö→u": 0.8, "a→e": 0.3},
"markers": ["ha bu", "da gel daa", "nere gideysin"],
"prosody": "yüksek-inişli-çıkışlı",
"tempo_range": (120, 150),
"pitch_range": "high-oscillating"
},
"Doğu Anadolu": {
"description": "Ağır tempo, geniş ünlü uzatmaları.",
"vowel_shifts": {"ı→i": 0.75, "u→o": 0.65, "a→â": 0.4},
"markers": ["he vallah", "gardaş", "ağabey"],
"prosody": "düşük-ağır",
"tempo_range": (70, 100),
"pitch_range": "low"
},
"Güneydoğu Anadolu": {
"description": "Ê/Î uzatmaları, uzun vurgu, ağır tempo.",
"vowel_shifts": {"a→ê": 0.9, "e→ê": 0.95, "i→î": 0.6},
"markers": ["ê", "hele", "gardaş", "bacı"],
"prosody": "uzun-vurgulu-ağır",
"tempo_range": (65, 95),
"pitch_range": "low-elongated"
}
}
# =========================================
# DIALECT EMBEDDING HELPERS
# =========================================
REGION_ALIAS_MAP = {
region.lower().replace(" ", ""): region
for region in DIALECT_PROFILES.keys()
}
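# Maps normalized region names (lowercased, spaces stripped) back to their canonical
# form so reference wav filenames can be resolved in _resolve_region_from_name().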
def _resolve_region_from_name(name: str) -> Optional[str]:
key = (
name.lower()
.replace("-", "")
.replace("_", "")
.replace(" ", "")
)
if key in REGION_ALIAS_MAP:
return REGION_ALIAS_MAP[key]
for alias_key, region_name in REGION_ALIAS_MAP.items():
if alias_key in key or key in alias_key:
return region_name
return None
def embed_audio(audio_path: str) -> Optional[np.ndarray]:
"""
Convert an audio file into a fixed-length embedding vector.
"""
if embed_model is None or embed_feature_extractor is None:
logger.warning("Embedding model unavailable; cannot embed audio.")
return None
try:
audio_data, sr = sf.read(audio_path)
if audio_data.ndim > 1:
audio_data = np.mean(audio_data, axis=1)
if sr != EMBED_SAMPLE_RATE:
audio_data = librosa.resample(
audio_data,
orig_sr=sr,
target_sr=EMBED_SAMPLE_RATE
)
sr = EMBED_SAMPLE_RATE
inputs = embed_feature_extractor(
audio_data,
sampling_rate=sr,
return_tensors="pt"
)
inputs = {k: v.to(DEVICE) for k, v in inputs.items()}
with torch.no_grad():
outputs = embed_model(**inputs)
hidden_states = outputs.last_hidden_state
embedding = hidden_states.mean(dim=1).squeeze().cpu().numpy()
return embedding
except Exception as e:
logger.error(f"Audio embedding failed: {e}")
return None
def load_reference_embeddings() -> Tuple[Dict[str, List[np.ndarray]], Dict[str, np.ndarray]]:
"""
Load reference embeddings for each dialect region from local wav files.
"""
# Check if embedding model is available (globally defined)
try:
if embed_model is None or embed_feature_extractor is None:
logger.warning("Embedding model missing; reference embeddings disabled.")
return {}, {}
except NameError:
# embed_model not defined - embedding model disabled
logger.debug("Embedding model not defined; reference embeddings disabled.")
return {}, {}
if not DIALECT_REF_DIR.exists():
logger.warning(f"Dialect reference directory not found: {DIALECT_REF_DIR}")
return {}, {}
embeddings: Dict[str, List[np.ndarray]] = defaultdict(list)
for wav_path in sorted(DIALECT_REF_DIR.glob("*.wav")):
region_name = _resolve_region_from_name(wav_path.stem)
if not region_name:
logger.debug(f"Could not resolve region for reference file {wav_path.name}")
continue
emb = embed_audio(str(wav_path))
if emb is not None:
embeddings[region_name].append(emb)
prototypes: Dict[str, np.ndarray] = {}
for region_name, vectors in embeddings.items():
if vectors:
prototypes[region_name] = np.mean(vectors, axis=0)
logger.info(f"Loaded {len(vectors)} reference embeddings for {region_name}")
if not prototypes:
logger.warning("No dialect reference prototypes were built.")
return embeddings, prototypes
# Embedding model disabled - reference embeddings are not loaded, so
# DIALECT_REF_EMBEDDINGS and DIALECT_PROTOTYPES (defined above) stay empty.
logger.debug("Embedding model disabled - skipping reference embeddings loading")
# =========================================
# ZERO-SHOT DIALECT CLASSIFIER
# =========================================
def cosine_similarity(vec_a: np.ndarray, vec_b: np.ndarray) -> float:
denom = (np.linalg.norm(vec_a) * np.linalg.norm(vec_b)) + 1e-10
return float(np.dot(vec_a, vec_b) / denom)
def predict_dialect(audio_path: str) -> Tuple[str, Dict[str, float]]:
"""
Predict dialect region using cosine similarity against reference prototypes.
"""
if not DIALECT_PROTOTYPES:
logger.warning("No dialect prototypes available; returning fallback prediction.")
return "Bilinmiyor", {}
user_embedding = embed_audio(audio_path)
if user_embedding is None:
return "Bilinmiyor", {}
scores: Dict[str, float] = {}
for region_name, prototype_vec in DIALECT_PROTOTYPES.items():
similarity = cosine_similarity(user_embedding, prototype_vec)
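        # Map cosine similarity from [-1, 1] to [0, 1] so the scores are comparable
        # to the heuristic transcription-based scores.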
normalized = max(0.0, min(1.0, (similarity + 1) / 2))
scores[region_name] = round(normalized, 4)
if not scores:
return "Bilinmiyor", {}
predicted_region = max(scores, key=scores.get)
return predicted_region, scores
def generate_reply_text(region: str) -> str:
templates = {
"Karadeniz": "Aaa, sen demek Karadenizlisin! Hızlı ritim ve enerjik ton hemen belli ediyor kendini. 🌊",
"Doğu Anadolu": "Hmm, Doğu’dan bir hava aldım. Güçlü vurgular ve ağır ritim çok tanıdık. 🏔️",
"İç Anadolu": "Sende İç Anadolu’nun sakin ve net konuşması var gibi. Rahat ve dengeli. 🚜",
"Ege": "Ege rüzgarı gibi yumuşak tınlıyor sesin; huzur veren bir anlatım. 🌅",
"Akdeniz": "Akdeniz’in sıcaklığı ve enerjisi var sesinde, çok hareketli! ☀️",
"Marmara": "Oldukça dengeli ve şehirli bir ton; Marmara aksanı hissediliyor. 🌆",
"Güneydoğu Anadolu": "Güneydoğu’nun uzun vurguları ve sıcaklığı geliyor sesinden. 🔥",
}
if region in templates:
return templates[region]
if region and region != "Bilinmiyor":
return f"Sesinde {region} bölgesine benzeyen bir tını var. Çok hoş bir karışım yakalamışsın. 🙂"
return "Şive tahmin edemedim ama sesin oldukça ilgi çekici!"
def synthesize_elevenlabs(
text: str,
speaking_rate: Optional[float] = None,
pitch: Optional[float] = None
) -> Optional[str]:
"""
Convert reply text into speech using ElevenLabs.
"""
if not text:
return None
if not elevenlabs_client:
logger.warning("ElevenLabs client unavailable; cannot synthesize audio.")
return None
voice_settings: Dict[str, Any] = {
"stability": 0.4,
"similarity_boost": 0.8,
}
if speaking_rate is not None:
voice_settings["speaking_rate"] = speaking_rate
if pitch is not None:
voice_settings["pitch"] = pitch
try:
audio = elevenlabs_client.text_to_speech.convert(
voice_id=ELEVENLABS_VOICE_ID,
model_id=ELEVENLABS_MODEL_ID,
text=text,
voice_settings=voice_settings,
)
out_path = f"reply_{uuid4().hex}.wav"
save(audio, out_path)
return out_path
except Exception as e:
logger.error(f"ElevenLabs synthesis failed: {e}")
return None
# =========================================
# AUDIO PROCESSING
# =========================================
def process_audio(audio_data: np.ndarray, sample_rate: int) -> Tuple[np.ndarray, int]:
"""
Process audio: convert to mono, normalize, resample if needed.
Args:
audio_data: Audio signal as numpy array
sample_rate: Original sample rate
Returns:
Processed audio data and sample rate
"""
try:
        # Convert multichannel audio to mono if needed (expects shape (channels, samples))
        if audio_data.ndim > 1:
            audio_data = librosa.to_mono(audio_data)
            logger.info("Converted multichannel audio to mono")
# Resample to target rate if needed
if sample_rate != TARGET_SAMPLE_RATE:
audio_data = librosa.resample(
audio_data,
orig_sr=sample_rate,
target_sr=TARGET_SAMPLE_RATE
)
sample_rate = TARGET_SAMPLE_RATE
logger.info(f"Resampled to {TARGET_SAMPLE_RATE} Hz")
# Normalize audio
audio_data = librosa.util.normalize(audio_data)
return audio_data, sample_rate
except Exception as e:
logger.error(f"Error processing audio: {e}")
raise ValueError(f"Ses işleme hatası: {e}")
def validate_audio(audio_data: np.ndarray, sample_rate: int) -> None:
"""
Validate audio duration and quality.
Args:
audio_data: Audio signal
sample_rate: Sample rate
Raises:
ValueError: If audio is invalid
"""
    if len(audio_data) == 0:
        raise ValueError("Ses verisi boş.")
    duration = len(audio_data) / sample_rate
    if duration < MIN_AUDIO_DURATION:
        raise ValueError(
            f"Ses süresi en az {MIN_AUDIO_DURATION} saniye olmalı. "
            f"Mevcut süre: {duration:.2f} saniye."
        )
# =========================================
# ASR CORE
# =========================================
def run_asr(audio_data: np.ndarray, sample_rate: int) -> str:
"""
Run Whisper ASR on audio.
Args:
audio_data: Processed audio signal
sample_rate: Sample rate
Returns:
Transcription text
"""
try:
# Ensure audio is float32 (Whisper expects fp32 input)
audio_float = audio_data.astype(np.float32)
inputs = processor(
audio_float,
sampling_rate=sample_rate,
return_tensors="pt"
)
# Move to device and cast to target dtype (fp16 on GPU, fp32 on CPU)
input_features = inputs.input_features.to(device=DEVICE, dtype=DTYPE)
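        # Whisper's decoder accepts up to 448 target tokens, so max_length=400 below
        # stays within the model limit while still allowing long utterances.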
with torch.no_grad():
generated_ids = model.generate(
input_features,
max_length=400,
language="tr",
task="transcribe"
)
hypothesis = processor.batch_decode(
generated_ids,
skip_special_tokens=True
)[0]
logger.info(f"ASR output: {hypothesis}")
return hypothesis
except Exception as e:
logger.error(f"ASR error: {e}")
raise ValueError(f"Konuşma tanıma hatası: {e}")
# =========================================
# DIALECT ANALYSIS
# =========================================
def vowel_shift_score(transcription: str, profile: Dict[str, Any]) -> float:
"""
Score vowel shifts in transcription.
Enhanced scoring based on phonetic patterns.
Args:
transcription: ASR transcription
profile: Dialect profile
Returns:
Vowel shift score [0, 1]
"""
transcription_lower = transcription.lower()
shifts = profile["vowel_shifts"]
total_weight = sum(shifts.values())
if total_weight == 0:
return 0.0
score = 0.0
text_length = len(transcription_lower)
for shift_pattern, weight in shifts.items():
if "→" not in shift_pattern:
continue
source, target = shift_pattern.split("→")
# Count occurrences of target vowel/diphthong
# For elongated vowels (ee, oo, aa), look for repeated patterns
if len(target) > 1 and target[0] == target[1]:
# Look for elongated vowels
pattern = target[0] * 2
count = transcription_lower.count(pattern)
# Also check for common elongated patterns in Turkish
count += transcription_lower.count(target[0] + "ğ")
count += transcription_lower.count(target[0] + "y")
else:
count = transcription_lower.count(target)
        # Normalize to occurrences per 100 characters so clip length does not dominate
normalized_count = count / max(text_length, 1) * 100
score += normalized_count * weight
# Normalize by total weight
normalized_score = score / (total_weight * 10 + 1e-6)
return min(normalized_score, 1.0)
def marker_score(transcription: str, profile: Dict[str, Any]) -> float:
"""
Score lexical markers in transcription.
Args:
transcription: ASR transcription
profile: Dialect profile
Returns:
Marker score [0, 1]
"""
transcription_lower = transcription.lower()
markers = profile["markers"]
if not markers:
return 0.0
matches = sum(1 for marker in markers if marker in transcription_lower)
# Score based on proportion of markers found
score = matches / len(markers)
# Bonus for multiple occurrences
total_occurrences = sum(transcription_lower.count(marker) for marker in markers)
if total_occurrences > len(markers):
score = min(score * 1.2, 1.0)
return score
def prosody_score(
audio_data: np.ndarray,
sample_rate: int,
profile: Dict[str, Any]
) -> float:
"""
Analyze prosody: tempo, pitch characteristics.
Args:
audio_data: Audio signal
sample_rate: Sample rate
profile: Dialect profile
Returns:
Prosody score [0, 1]
"""
try:
# Normalize audio
audio_normalized = librosa.util.normalize(audio_data)
        # Tempo analysis (librosa.beat.tempo; newer librosa releases expose this
        # as librosa.feature.rhythm.tempo)
        tempo = float(librosa.beat.tempo(y=audio_normalized, sr=sample_rate)[0])
# Pitch analysis (fundamental frequency)
pitches, magnitudes = librosa.piptrack(
y=audio_normalized,
sr=sample_rate,
threshold=0.1
)
# Get pitch statistics
pitch_values = []
for t in range(pitches.shape[1]):
index = magnitudes[:, t].argmax()
pitch = pitches[index, t]
if pitch > 0:
pitch_values.append(pitch)
avg_pitch = np.mean(pitch_values) if pitch_values else 0.0
pitch_std = np.std(pitch_values) if len(pitch_values) > 1 else 0.0
# Score based on profile characteristics
prosody_type = profile["prosody"]
tempo_range = profile.get("tempo_range", (80, 120))
pitch_range_type = profile.get("pitch_range", "neutral")
# Tempo scoring
tempo_min, tempo_max = tempo_range
if tempo_min <= tempo <= tempo_max:
tempo_score = 1.0
else:
# Distance from range
if tempo < tempo_min:
tempo_score = max(0.0, tempo / tempo_min)
else:
tempo_score = max(0.0, 1.0 - (tempo - tempo_max) / tempo_max)
        # Pitch scoring based on profile (thresholds are rough F0 heuristics;
        # typical adult speaking F0 is roughly 85-255 Hz)
pitch_score = 0.5 # default
if pitch_range_type == "high" or pitch_range_type == "high-oscillating":
if avg_pitch > 200:
pitch_score = 1.0
elif avg_pitch > 150:
pitch_score = 0.7
elif pitch_range_type == "low" or pitch_range_type == "low-elongated":
if avg_pitch < 150:
pitch_score = 1.0
elif avg_pitch < 200:
pitch_score = 0.7
else: # neutral
if 150 <= avg_pitch <= 250:
pitch_score = 1.0
# Oscillation scoring (for Karadeniz)
oscillation_score = 0.5
if "oscillating" in pitch_range_type or "inişli-çıkışlı" in prosody_type:
if pitch_std > 50:
oscillation_score = 1.0
elif pitch_std > 30:
oscillation_score = 0.7
# Combine scores
if "oscillating" in pitch_range_type or "inişli-çıkışlı" in prosody_type:
final_score = (tempo_score * 0.4 + pitch_score * 0.3 + oscillation_score * 0.3)
else:
final_score = (tempo_score * 0.6 + pitch_score * 0.4)
return min(final_score, 1.0)
except Exception as e:
logger.warning(f"Prosody analysis error: {e}")
return 0.5 # Default neutral score
def dialect_similarity(
transcription: str,
audio_data: np.ndarray,
sample_rate: int
) -> Tuple[Dict[str, float], List[Tuple[str, float]]]:
"""
Calculate dialect similarity scores for all regions.
Args:
transcription: ASR transcription
audio_data: Audio signal
sample_rate: Sample rate
Returns:
Dictionary of scores and sorted predictions
"""
scores: Dict[str, float] = {}
for region, profile in DIALECT_PROFILES.items():
try:
vowel_score = vowel_shift_score(transcription, profile)
marker_score_val = marker_score(transcription, profile)
prosody_score_val = prosody_score(audio_data, sample_rate, profile)
# Weighted combination
combined_score = (
vowel_score * VOWEL_SHIFT_WEIGHT +
marker_score_val * MARKER_WEIGHT +
prosody_score_val * PROSODY_WEIGHT
)
scores[region] = round(combined_score, 3)
logger.info(
f"{region}: vowel={vowel_score:.3f}, "
f"marker={marker_score_val:.3f}, "
f"prosody={prosody_score_val:.3f}, "
f"combined={combined_score:.3f}"
)
except Exception as e:
logger.error(f"Error calculating score for {region}: {e}")
scores[region] = 0.0
# Sort by score
sorted_predictions = sorted(
scores.items(),
key=lambda x: x[1],
reverse=True
)
return scores, sorted_predictions
# =========================================
# VISUALIZATION
# =========================================
def plot_region_heatmap(
scores: Dict[str, float],
highlight_region: Optional[str] = None
) -> go.Figure:
"""
Create an interactive choropleth-style region heatmap for Türkiye dialect scores.
"""
try:
if not scores:
raise ValueError("Score verisi yok")
df = pd.DataFrame({
"region_name": list(scores.keys()),
"score": list(scores.values()),
})
min_score = float(df["score"].min())
max_score = float(df["score"].max())
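        # Guard against a degenerate color scale when all regions share the same score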
if min_score == max_score:
max_score = min_score + 0.01
fig = px.choropleth_mapbox(
df,
geojson=TURKEY_REGIONS_GEOJSON,
locations="region_name",
featureidkey="properties.name",
color="score",
color_continuous_scale="OrRd",
range_color=(min_score, max_score),
mapbox_style="carto-positron",
zoom=4.5,
center={"lat": 39.0, "lon": 35.0},
opacity=0.7,
labels={"score": "Benzerlik"},
)
fig.update_traces(marker_line_width=0.5, marker_line_color="white")
if highlight_region and highlight_region in df["region_name"].values:
highlight_df = df[df["region_name"] == highlight_region]
fig.add_choroplethmapbox(
geojson=TURKEY_REGIONS_GEOJSON,
locations=highlight_df["region_name"],
z=np.ones(len(highlight_df)),
featureidkey="properties.name",
colorscale=[[0, "rgba(0,0,0,0)"], [1, "rgba(0,0,0,0)"]],
showscale=False,
marker_opacity=0,
marker_line_width=3,
marker_line_color="black",
hovertext=highlight_df["region_name"],
name="Tahmin",
)
fig.add_annotation(
text=f"🗣 Tahmin: {highlight_region}",
x=0.5,
y=0.02,
xref="paper",
yref="paper",
showarrow=False,
bgcolor="white",
bordercolor="black",
borderwidth=1,
font=dict(size=14),
)
fig.update_layout(
margin=dict(l=10, r=10, t=40, b=10),
height=600,
coloraxis_colorbar=dict(title="Benzerlik"),
)
return fig
except Exception as e:
logger.error(f"Error creating heatmap: {e}")
fig = go.Figure()
fig.update_layout(
title="Harita yüklenemedi",
height=600
)
return fig
# =========================================
# MAIN PIPELINE
# =========================================
def analyze_and_reply(
audio_path: Optional[str]
) -> Tuple[str, str, str, Optional[str], go.Figure]:
"""
Full processing pipeline: audio → ASR → dialect analysis → TTS reply.
"""
def build_empty_fig(title: str = "Harita yüklenemedi") -> go.Figure:
fig = go.Figure()
fig.update_layout(title=title, height=600)
return fig
logger.info(f"Received audio_path: {audio_path}")
if audio_path is None:
logger.warning("Audio input is None.")
empty_fig = build_empty_fig()
return "Ses alınamadı. Lütfen tekrar deneyin.", "", "", None, empty_fig
# Check if file exists
if not os.path.exists(audio_path):
logger.error(f"Audio file does not exist: {audio_path}")
empty_fig = build_empty_fig()
return f"Ses dosyası bulunamadı: {audio_path}", "", "", None, empty_fig
try:
logger.info(f"Reading audio file: {audio_path}")
audio_data, sample_rate = sf.read(audio_path)
logger.info(f"Audio file read successfully. Duration: {len(audio_data)/sample_rate:.2f}s, Sample rate: {sample_rate}Hz")
if audio_data.ndim > 1:
audio_data = audio_data.T
audio_data = librosa.to_mono(audio_data)
audio_data = np.asarray(audio_data, dtype=np.float32)
except Exception as e:
logger.error(f"Error reading audio file: {e}")
empty_fig = build_empty_fig()
return f"Ses dosyası okunamadı: {e}", "", "", None, empty_fig
try:
processed_audio, processed_sr = process_audio(audio_data, sample_rate)
validate_audio(processed_audio, processed_sr)
except ValueError as e:
logger.error(f"Audio validation error: {e}")
empty_fig = build_empty_fig()
return str(e), "", "", None, empty_fig
try:
transcript = run_asr(processed_audio, processed_sr)
logger.info(f"ASR transcript: {transcript}")
except ValueError as e:
logger.error(f"ASR error: {e}")
empty_fig = build_empty_fig()
return str(e), "", "", None, empty_fig
# Use transcription-based dialect similarity analysis
similarity_scores, sorted_predictions = dialect_similarity(
transcript, processed_audio, processed_sr
)
# Also try embedding-based prediction as fallback
embedding_region, embedding_scores = predict_dialect(audio_path)
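    # With the embedding model disabled, predict_dialect() returns ("Bilinmiyor", {})
    # and the transcription-based branch below is always taken.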
# Always use transcription-based prediction if available (it should always work)
if similarity_scores and sorted_predictions and len(sorted_predictions) > 0:
# Use transcription-based prediction
predicted_region = sorted_predictions[0][0]
scores = similarity_scores
top_score = sorted_predictions[0][1]
logger.info(f"Using transcription-based prediction: {predicted_region} (score: {top_score:.4f})")
# Log top 3 predictions for debugging
if len(sorted_predictions) >= 3:
logger.info(f"Top 3 predictions: {[(r, f'{s:.4f}') for r, s in sorted_predictions[:3]]}")
elif embedding_scores and embedding_region != "Bilinmiyor" and max(embedding_scores.values()) > 0.01:
# Fallback to embedding-based
predicted_region = embedding_region
scores = embedding_scores
logger.info(f"Using embedding-based prediction: {predicted_region} (score: {max(embedding_scores.values()):.4f})")
    else:
        # Last resort: dialect_similarity() produced no scores at all
        predicted_region = next(iter(DIALECT_PROFILES), "Bilinmiyor")
        scores = {region: 0.1 for region in DIALECT_PROFILES}
        logger.error(f"All prediction methods failed, using fallback: {predicted_region}")
reply_text = generate_reply_text(predicted_region)
    reply_audio_path = synthesize_elevenlabs(reply_text)  # may be None if TTS is unavailable
heatmap_fig = plot_region_heatmap(scores, highlight_region=predicted_region if scores else None)
return (
transcript,
predicted_region,
reply_text,
reply_audio_path,
heatmap_fig
)
# =========================================
# UI — Ultra Modern Apple Glassmorphism Design
# =========================================
CSS = """
* {
box-sizing: border-box;
margin: 0;
padding: 0;
}
@keyframes float {
0%, 100% { transform: translateY(0px); }
50% { transform: translateY(-10px); }
}
@keyframes shimmer {
0% { background-position: -1000px 0; }
100% { background-position: 1000px 0; }
}
@keyframes pulse {
0%, 100% { opacity: 1; }
50% { opacity: 0.7; }
}
body {
background:
radial-gradient(circle at 20% 50%, rgba(120, 119, 198, 0.15) 0%, transparent 50%),
radial-gradient(circle at 80% 80%, rgba(255, 119, 198, 0.1) 0%, transparent 50%),
radial-gradient(circle at 40% 20%, rgba(99, 102, 241, 0.1) 0%, transparent 50%),
linear-gradient(135deg, #F8F9FA 0%, #E9ECEF 50%, #F1F3F5 100%) !important;
font-family: -apple-system, BlinkMacSystemFont, "SF Pro Display", "SF Pro Text", "Segoe UI", sans-serif;
-webkit-font-smoothing: antialiased;
-moz-osx-font-smoothing: grayscale;
min-height: 100vh;
position: relative;
overflow-x: hidden;
}
body::before {
content: '';
position: fixed;
top: 0;
left: 0;
right: 0;
bottom: 0;
background:
radial-gradient(circle at 20% 30%, rgba(99, 102, 241, 0.08) 0%, transparent 50%),
radial-gradient(circle at 80% 70%, rgba(168, 85, 247, 0.06) 0%, transparent 50%);
pointer-events: none;
z-index: 0;
}
.gradio-container {
background: transparent !important;
max-width: 1500px !important;
margin: 0 auto !important;
padding: 60px 30px !important;
position: relative;
z-index: 1;
}
h1 {
font-weight: 800 !important;
letter-spacing: -2.5px !important;
color: #1D1D1F !important;
margin: 0 !important;
background: linear-gradient(135deg, #1D1D1F 0%, #4A5568 50%, #1D1D1F 100%) !important;
background-size: 200% auto !important;
-webkit-background-clip: text !important;
-webkit-text-fill-color: #1D1D1F !important;
background-clip: text !important;
animation: shimmer 3s linear infinite !important;
opacity: 1 !important;
z-index: 10 !important;
position: relative !important;
visibility: visible !important;
}
.card {
background: rgba(255, 255, 255, 0.85) !important;
backdrop-filter: blur(30px) saturate(180%) !important;
-webkit-backdrop-filter: blur(30px) saturate(180%) !important;
padding: 28px !important;
border-radius: 20px !important;
border: 1px solid rgba(0, 0, 0, 0.08) !important;
margin-bottom: 20px !important;
box-shadow:
0 8px 32px rgba(0, 0, 0, 0.06),
0 4px 16px rgba(0, 0, 0, 0.04),
0 2px 8px rgba(0, 0, 0, 0.03),
inset 0 1px 0 rgba(255, 255, 255, 0.9) !important;
transition: all 0.3s cubic-bezier(0.4, 0, 0.2, 1) !important;
position: relative;
overflow: hidden;
}
.card::before {
content: '';
position: absolute;
top: 0;
left: -100%;
width: 100%;
height: 100%;
background: linear-gradient(90deg, transparent, rgba(255, 255, 255, 0.4), transparent);
transition: left 0.5s;
}
.card:hover::before {
left: 100%;
}
.card:hover {
transform: translateY(-4px) scale(1.01) !important;
box-shadow:
0 28px 80px rgba(0, 0, 0, 0.12),
0 12px 32px rgba(0, 0, 0, 0.08),
0 4px 12px rgba(0, 0, 0, 0.06),
inset 0 1px 0 rgba(255, 255, 255, 1),
inset 0 -1px 0 rgba(255, 255, 255, 0.6) !important;
border-color: rgba(255, 255, 255, 1) !important;
}
.label {
    font-weight: 700 !important;
    color: #1D1D1F !important;
    margin-bottom: 14px !important;
    font-size: 15px !important;
    letter-spacing: -0.3px !important;
    text-transform: uppercase;
    opacity: 0.8;
}
.textbox textarea,
.textbox input,
.dropdown select {
background: rgba(255, 255, 255, 0.95) !important;
backdrop-filter: blur(20px) saturate(180%) !important;
-webkit-backdrop-filter: blur(20px) saturate(180%) !important;
border: 1.5px solid rgba(0, 0, 0, 0.06) !important;
border-radius: 16px !important;
color: #1D1D1F !important;
padding: 16px 20px !important;
font-size: 15px !important;
font-weight: 500 !important;
transition: all 0.3s cubic-bezier(0.4, 0, 0.2, 1) !important;
box-shadow:
0 4px 12px rgba(0, 0, 0, 0.04),
inset 0 1px 2px rgba(0, 0, 0, 0.02) !important;
}
.textbox:focus-within,
.dropdown:focus-within {
border-color: #007AFF !important;
box-shadow:
0 8px 24px rgba(0, 122, 255, 0.2),
0 4px 12px rgba(0, 122, 255, 0.15),
inset 0 1px 2px rgba(0, 122, 255, 0.1) !important;
transform: translateY(-1px);
}
button.primary {
background: linear-gradient(135deg, #007AFF 0%, #0051D5 50%, #007AFF 100%) !important;
background-size: 200% auto !important;
border: none !important;
border-radius: 18px !important;
font-weight: 700 !important;
padding: 18px 40px !important;
font-size: 17px !important;
color: white !important;
letter-spacing: -0.2px !important;
box-shadow:
0 8px 24px rgba(0, 122, 255, 0.4),
0 4px 12px rgba(0, 122, 255, 0.3),
inset 0 1px 0 rgba(255, 255, 255, 0.3),
inset 0 -1px 0 rgba(0, 0, 0, 0.1) !important;
transition: all 0.3s cubic-bezier(0.34, 1.56, 0.64, 1) !important;
cursor: pointer !important;
position: relative;
overflow: hidden;
text-transform: none !important;
}
button.primary::before {
content: '';
position: absolute;
top: 0;
left: -100%;
width: 100%;
height: 100%;
background: linear-gradient(90deg, transparent, rgba(255, 255, 255, 0.3), transparent);
transition: left 0.5s;
}
button.primary:hover::before {
left: 100%;
}
button.primary:hover {
transform: translateY(-3px) scale(1.02) !important;
box-shadow:
0 12px 32px rgba(0, 122, 255, 0.5),
0 6px 16px rgba(0, 122, 255, 0.4),
inset 0 1px 0 rgba(255, 255, 255, 0.4),
inset 0 -1px 0 rgba(0, 0, 0, 0.15) !important;
background-position: right center !important;
}
button.primary:active {
transform: translateY(-1px) scale(1.01) !important;
box-shadow:
0 4px 16px rgba(0, 122, 255, 0.4),
inset 0 1px 0 rgba(255, 255, 255, 0.2) !important;
}
.json {
font-family: "SF Mono", "Monaco", "Menlo", "Courier New", monospace !important;
font-size: 13px !important;
background: rgba(248, 249, 250, 0.9) !important;
backdrop-filter: blur(20px) saturate(180%) !important;
-webkit-backdrop-filter: blur(20px) saturate(180%) !important;
border: 1px solid rgba(0, 0, 0, 0.05) !important;
border-radius: 16px !important;
padding: 24px !important;
color: #1D1D1F !important;
line-height: 1.7 !important;
box-shadow:
inset 0 2px 8px rgba(0, 0, 0, 0.03),
inset 0 1px 2px rgba(0, 0, 0, 0.02) !important;
}
.markdown {
color: #1D1D1F !important;
}
.markdown * {
visibility: visible !important;
opacity: 1 !important;
display: block !important;
}
.markdown div {
display: block !important;
visibility: visible !important;
opacity: 1 !important;
color: inherit !important;
}
.markdown h1, .header-markdown h1, .main-title {
color: #1D1D1F !important;
margin-bottom: 16px !important;
margin-top: 50px !important;
font-size: 3.5rem !important;
font-weight: 800 !important;
letter-spacing: -2px !important;
line-height: 1.2 !important;
text-shadow: 0 2px 8px rgba(0,0,0,0.1) !important;
-webkit-text-fill-color: #1D1D1F !important;
background: none !important;
background-image: none !important;
opacity: 1 !important;
z-index: 10 !important;
position: relative !important;
visibility: visible !important;
display: block !important;
text-align: center !important;
}
.header-markdown {
text-align: center !important;
}
.header-markdown p {
color: #6E6E73 !important;
font-size: 1.15rem !important;
margin-top: 8px !important;
opacity: 0.9 !important;
}
.markdown h1 span {
color: #1D1D1F !important;
-webkit-text-fill-color: #1D1D1F !important;
background: none !important;
display: inline-block !important;
}
.markdown p {
display: block !important;
visibility: visible !important;
opacity: 1 !important;
color: #6E6E73 !important;
margin: 0 !important;
}
.instruction-text {
display: block !important;
visibility: visible !important;
text-align: center !important;
margin-top: -20px !important;
margin-bottom: 40px !important;
color: #6E6E73 !important;
font-size: 1.1rem !important;
opacity: 0.9 !important;
padding: 0 20px !important;
}
.instruction-text p {
display: block !important;
visibility: visible !important;
opacity: 1 !important;
color: #6E6E73 !important;
margin: 0 !important;
}
.header-container {
display: block !important;
visibility: visible !important;
opacity: 1 !important;
}
.header-container h1 {
display: block !important;
visibility: visible !important;
opacity: 1 !important;
}
.header-container p {
display: block !important;
visibility: visible !important;
opacity: 1 !important;
}
/* HTML component styles */
.html-component, .html-component * {
display: block !important;
visibility: visible !important;
opacity: 1 !important;
}
.html-component h1, .html-component .main-title {
color: #1D1D1F !important;
-webkit-text-fill-color: #1D1D1F !important;
background: none !important;
background-image: none !important;
display: block !important;
}
.html-component p {
display: block !important;
visibility: visible !important;
color: #6E6E73 !important;
}
.markdown p {
color: #6E6E73;
opacity: 0.95;
font-size: 1.15rem;
font-weight: 400;
line-height: 1.6;
letter-spacing: -0.2px;
}
.audio-component {
background: rgba(255, 255, 255, 0.95) !important;
backdrop-filter: blur(30px) saturate(200%) !important;
-webkit-backdrop-filter: blur(30px) saturate(200%) !important;
border-radius: 20px !important;
border: 1.5px solid rgba(255, 255, 255, 0.8) !important;
padding: 20px !important;
box-shadow:
0 8px 24px rgba(0, 0, 0, 0.06),
0 4px 12px rgba(0, 0, 0, 0.04),
inset 0 1px 0 rgba(255, 255, 255, 0.9) !important;
transition: all 0.3s ease !important;
}
.audio-component:hover {
box-shadow:
0 12px 32px rgba(0, 0, 0, 0.08),
0 6px 16px rgba(0, 0, 0, 0.06),
inset 0 1px 0 rgba(255, 255, 255, 1) !important;
}
/* Ultra smooth scrollbar */
::-webkit-scrollbar {
width: 10px;
height: 10px;
}
::-webkit-scrollbar-track {
background: rgba(0, 0, 0, 0.02);
border-radius: 10px;
}
::-webkit-scrollbar-thumb {
background: linear-gradient(135deg, rgba(0, 122, 255, 0.3), rgba(0, 81, 213, 0.4));
border-radius: 10px;
border: 2px solid transparent;
background-clip: padding-box;
}
::-webkit-scrollbar-thumb:hover {
background: linear-gradient(135deg, rgba(0, 122, 255, 0.5), rgba(0, 81, 213, 0.6));
background-clip: padding-box;
}
/* Loading animation */
@keyframes spin {
from { transform: rotate(0deg); }
to { transform: rotate(360deg); }
}
/* Enhanced focus states */
*:focus-visible {
outline: 2px solid #007AFF;
outline-offset: 2px;
border-radius: 4px;
}
"""
def build_ui() -> gr.Blocks:
"""
Build Gradio UI with Apple minimal white + smooth glass design.
Returns:
Gradio Blocks interface
"""
with gr.Blocks(
css=CSS,
fill_height=True,
theme=gr.themes.Soft()
) as demo:
gr.Markdown(
"""
# 🇹🇷 Dialect Intelligence Engine
Powered by Meta Omnilingual ASR & Whisper Large-v3
""",
elem_classes="header-markdown"
)
gr.Markdown(
"""
Mikrofona bas, doğal bir şekilde konuş. Sistem şiveni analiz edip seni haritada işaretlesin ve AI sesiyle cevap versin.
""",
elem_classes="instruction-text"
)
with gr.Row(equal_height=False):
with gr.Column(scale=1, min_width=400):
audio_input = gr.Audio(
sources=["microphone", "upload"],
type="filepath",
label="🎤 Mikrofona basın, konuşun, kaydı durdurun",
show_label=True,
interactive=True,
elem_classes="card"
)
analyze_button = gr.Button(
"🔍 Analiz Et ve Şive Tahmini Yap",
variant="primary",
elem_classes="primary",
visible=True,
scale=1
)
gr.Markdown(
"📝 Ses kaydını tamamladıktan sonra butona tıklayın",
elem_classes="instruction-text"
)
with gr.Column(scale=2, min_width=600):
transcript_output = gr.Textbox(
label="Transcript",
lines=4,
interactive=False,
placeholder="Konuşmanı bekliyorum...",
elem_classes="card"
)
with gr.Row():
predicted_dialect = gr.Textbox(
label="Tahmin Edilen Bölge",
interactive=False,
lines=2,
elem_classes="card"
)
reply_text_output = gr.Textbox(
label="Model Cevabı (Metin)",
interactive=False,
lines=2,
elem_classes="card"
)
reply_audio_output = gr.Audio(
label="Model Cevabı (Ses)",
type="filepath",
interactive=False,
autoplay=True,
elem_classes="card"
)
region_map = gr.Plot(
label="Bölgesel Harita Isı Dağılımı",
elem_classes="card"
)
def build_empty_fig_ui():
"""Build empty figure for UI"""
fig = go.Figure()
fig.update_layout(title="Harita yüklenemedi", height=600)
return fig
def analyze_and_reply_with_autoplay(audio_path):
"""Wrapper to ensure audio autoplays after generation"""
logger.info(f"analyze_and_reply_with_autoplay called with audio_path: {audio_path}")
if audio_path is None:
logger.warning("audio_path is None in wrapper")
empty_fig = build_empty_fig_ui()
return "", "", "", None, empty_fig
result = analyze_and_reply(audio_path)
# Return result - Gradio will handle autoplay if autoplay=True is set
return result
# Both button click and audio change trigger analysis
analyze_button.click(
fn=analyze_and_reply_with_autoplay,
inputs=audio_input,
outputs=[
transcript_output,
predicted_dialect,
reply_text_output,
reply_audio_output,
region_map
]
)
# Also trigger on change (for file uploads and when recording stops)
audio_input.change(
fn=analyze_and_reply_with_autoplay,
inputs=audio_input,
outputs=[
transcript_output,
predicted_dialect,
reply_text_output,
reply_audio_output,
region_map
]
)
# Add JavaScript for autoplay
demo.load(
fn=None,
js="""
function() {
// Auto-play audio when it's updated
const observer = new MutationObserver(function(mutations) {
mutations.forEach(function(mutation) {
mutation.addedNodes.forEach(function(node) {
if (node.nodeType === 1) {
const audio = node.querySelector('audio');
if (audio && audio.src && !audio.hasAttribute('data-autoplayed')) {
audio.setAttribute('data-autoplayed', 'true');
audio.play().catch(e => console.log('Autoplay prevented:', e));
}
}
});
});
});
observer.observe(document.body, {
childList: true,
subtree: true
});
}
"""
)
# Auto-play audio when it's generated using JavaScript callback
reply_audio_output.change(
fn=None,
inputs=None,
outputs=None,
js="""
function() {
setTimeout(function() {
// Find the audio element by looking for the reply audio component
const labels = Array.from(document.querySelectorAll('label'));
const replyLabel = labels.find(label =>
label.textContent && label.textContent.includes('Model Cevabı (Ses)')
);
if (replyLabel) {
const audioContainer = replyLabel.closest('.card') || replyLabel.parentElement;
const audioElement = audioContainer ? audioContainer.querySelector('audio') : null;
if (audioElement && audioElement.src) {
// Reset and play
audioElement.currentTime = 0;
const playPromise = audioElement.play();
if (playPromise !== undefined) {
playPromise.catch(function(error) {
console.log('Autoplay prevented by browser:', error);
});
}
}
}
}, 800); // Wait for audio to be fully loaded
return [];
}
"""
)
return demo
# =========================================
# MAIN
# =========================================
demo = build_ui()
if __name__ == "__main__":
demo.launch(
server_name="0.0.0.0",
server_port=7860,
share=False,
ssr_mode=False # Fix for HF Spaces microphone bug
)