Spaces:

nisacayir
/

dialect-map-turkiye

Paused

File size: 50,262 Bytes

# =========================================
# ENV FIXES
# =========================================
import os

os.environ["OMP_NUM_THREADS"] = "1"  # libgomp error fix; set before numpy/torch/librosa are imported below

# =========================================
# IMPORTS
# =========================================
import logging
from pathlib import Path
from typing import Dict, Tuple, List, Optional, Any
from collections import defaultdict
from uuid import uuid4

import numpy as np
import librosa
import soundfile as sf
import torch
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq, AutoFeatureExtractor, AutoModel
import plotly.graph_objects as go
import plotly.express as px
import pandas as pd

import gradio as gr
from elevenlabs import ElevenLabs, save

from regions_geojson import TURKEY_REGIONS_GEOJSON

# =========================================
# LOGGING SETUP
# =========================================
# Configure the root logger once at import time: INFO level, timestamped records.
# NOTE: at INFO level the logger.debug(...) calls in this file are not emitted.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
)
# Module-level logger shared by the functions in this file.
logger = logging.getLogger(__name__)

# =========================================
# CONSTANTS
# =========================================
# Checkpoint passed to AutoProcessor / AutoModelForSpeechSeq2Seq at import time.
MODEL_ID = "openai/whisper-large-v3"
# Shortest clip accepted by validate_audio.
MIN_AUDIO_DURATION = 3.0  # seconds
# Weights of the three partial scores combined in dialect_similarity (they sum to 1.0).
VOWEL_SHIFT_WEIGHT = 0.35
MARKER_WEIGHT = 0.40
PROSODY_WEIGHT = 0.25
# NOTE(review): the two tempo thresholds are not referenced in the visible part of this file.
FAST_TEMPO_THRESHOLD = 140.0
SLOW_TEMPO_THRESHOLD = 80.0
# Sample rate (Hz) that process_audio resamples to before ASR.
TARGET_SAMPLE_RATE = 16000
# Checkpoint intended for the (currently disabled) embedding model.
EMBED_MODEL_ID = "facebook/wav2vec2-large-xlsr-53"  # original note: Turkish fine-tuning is detected automatically (unverified)
# Sample rate (Hz) that embed_audio resamples to.
EMBED_SAMPLE_RATE = 16000
# Directory scanned for "*.wav" reference recordings by load_reference_embeddings.
DIALECT_REF_DIR = Path("data/dialects")
# Voice and model identifiers passed to ElevenLabs in synthesize_elevenlabs.
ELEVENLABS_VOICE_ID = "Q5n6GDIjpN0pLOlycRFT"
ELEVENLABS_MODEL_ID = "eleven_multilingual_v2"

# =========================================
# DEVICE CONFIGURATION
# =========================================
# Use the GPU when torch reports one as available; otherwise run on the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# Half precision on the GPU, full precision on the CPU.
DTYPE = torch.float16 if DEVICE == "cuda" else torch.float32

logger.info(f"Using device: {DEVICE}, dtype: {DTYPE}")

# =========================================
# MODEL INITIALIZATION
# =========================================
# Load the Whisper processor and model once at import time. Any failure is
# logged and re-raised, so the module cannot be imported without the model.
try:
    processor = AutoProcessor.from_pretrained(MODEL_ID)

    model = AutoModelForSpeechSeq2Seq.from_pretrained(
        MODEL_ID,
        torch_dtype=DTYPE
    )

    # Move the weights to the selected device and switch to inference mode.
    model = model.to(DEVICE)
    model.eval()

    logger.info("Model loaded successfully")
except Exception as e:
    logger.error(f"Error loading model: {e}")
    raise

# =========================================
# EMBEDDING MODEL INITIALIZATION
# Note: Embedding model is disabled - we use transcription-based dialect analysis instead
# =========================================
# Both handles stay None, which makes embed_audio and
# load_reference_embeddings return early without doing any work.
embed_feature_extractor = None
embed_model = None
logger.debug("Embedding model disabled - using transcription-based analysis only")

# Per-region reference embeddings and their mean "prototype" vectors.
# NOTE: both names are rebound to empty dicts again further down in this file.
DIALECT_REF_EMBEDDINGS: Dict[str, List[np.ndarray]] = defaultdict(list)
DIALECT_PROTOTYPES: Dict[str, np.ndarray] = {}

# =========================================
# ELEVENLABS CLIENT
# =========================================
# Build the ElevenLabs client from the environment. The client stays None
# (voice replies disabled) when the key is missing or construction fails.
ELEVENLABS_API_KEY = os.environ.get("ELEVENLABS_API_KEY")
elevenlabs_client = None
if not ELEVENLABS_API_KEY:
    logger.warning("ELEVENLABS_API_KEY not found. Voice replies will be disabled.")
else:
    try:
        elevenlabs_client = ElevenLabs(api_key=ELEVENLABS_API_KEY)
    except Exception as e:
        elevenlabs_client = None
        logger.warning(f"Failed to initialize ElevenLabs client: {e}")
    else:
        logger.info("ElevenLabs client initialized")
# =========================================
# DIALECT PROFILES
# =========================================
# Hand-written profile for each dialect region, keyed by region name.
# Fields of every profile:
#   description  - free-text summary (not read by the scoring code in view)
#   vowel_shifts - "source→target" pattern -> weight; read by vowel_shift_score
#   markers      - lowercase words/phrases searched for by marker_score
#   prosody      - label read by prosody_score (checked for "inişli-çıkışlı")
#   tempo_range  - (min, max) compared with the tempo estimate in prosody_score
#   pitch_range  - label selecting the pitch rule in prosody_score
DIALECT_PROFILES: Dict[str, Dict[str, Any]] = {
    "Marmara": {
        "description": "İstanbul-Trakya şehir aksanı, düz prosodi.",
        "vowel_shifts": {"a→ı": 0.1, "ı→i": 0.15, "e→i": 0.15, "o→u": 0.1},
        "markers": ["abi", "aynen", "bi şey dicem", "baksana"],
        "prosody": "düz-orta",
        "tempo_range": (100, 130),
        "pitch_range": "neutral"
    },
    "Ege": {
        "description": "Melodik, uzatmalı, 'gari', 'hee' kültürüne sahip.",
        "vowel_shifts": {"e→ee": 0.85, "o→oo": 0.75, "a→aa": 0.4},
        "markers": ["gari", "hee", "ebe", "söyleyiver"],
        "prosody": "yavaş-uzatmalı",
        "tempo_range": (60, 90),
        "pitch_range": "medium"
    },
    "Akdeniz": {
        "description": "Hızlı, enerjik, 'la' baskın aksan.",
        "vowel_shifts": {"a→aa": 0.65, "ı→i": 0.35},
        "markers": ["la", "naapıyon la", "hee la"],
        "prosody": "enerjik-hızlı",
        "tempo_range": (130, 160),
        "pitch_range": "high"
    },
    "İç Anadolu": {
        "description": "Düz ritmik, ı/i kaymaları belirgin.",
        "vowel_shifts": {"ı→i": 0.7, "a→ı": 0.5, "o→u": 0.3},
        "markers": ["gelisen", "gideceksen", "hele bi dur"],
        "prosody": "düz-ritmik",
        "tempo_range": (100, 125),
        "pitch_range": "neutral"
    },
    "Karadeniz": {
        "description": "Yüksek tonlama, hızlı, ünlü daralması.",
        "vowel_shifts": {"e→i": 0.9, "ö→u": 0.8, "a→e": 0.3},
        "markers": ["ha bu", "da gel daa", "nere gideysin"],
        "prosody": "yüksek-inişli-çıkışlı",
        "tempo_range": (120, 150),
        "pitch_range": "high-oscillating"
    },
    "Doğu Anadolu": {
        "description": "Ağır tempo, geniş ünlü uzatmaları.",
        "vowel_shifts": {"ı→i": 0.75, "u→o": 0.65, "a→â": 0.4},
        "markers": ["he vallah", "gardaş", "ağabey"],
        "prosody": "düşük-ağır",
        "tempo_range": (70, 100),
        "pitch_range": "low"
    },
    "Güneydoğu Anadolu": {
        "description": "Ê/Î uzatmaları, uzun vurgu, ağır tempo.",
        "vowel_shifts": {"a→ê": 0.9, "e→ê": 0.95, "i→î": 0.6},
        "markers": ["ê", "hele", "gardaş", "bacı"],
        "prosody": "uzun-vurgulu-ağır",
        "tempo_range": (65, 95),
        "pitch_range": "low-elongated"
    }
}

# =========================================
# DIALECT EMBEDDING HELPERS
# =========================================
# Lookup table: lower-cased region name with spaces removed -> canonical region name.
REGION_ALIAS_MAP = {
    canonical.lower().replace(" ", ""): canonical
    for canonical in DIALECT_PROFILES
}


def _resolve_region_from_name(name: str) -> Optional[str]:
    """
    Map a free-form name (e.g. a reference file stem) to a canonical region.

    The name is lower-cased and stripped of "-", "_" and spaces, then matched
    against REGION_ALIAS_MAP in three steps:

    1. exact alias match;
    2. an alias contained in the name - the longest such alias wins, so that
       "güneydoğuanadolu01" resolves to "Güneydoğu Anadolu" rather than to
       "Doğu Anadolu", whose alias is a substring of the former;
    3. the name contained in an alias (abbreviation) - first alias in
       REGION_ALIAS_MAP order wins.

    Args:
        name: Name to resolve.

    Returns:
        Canonical region name, or None when the name is empty after
        normalisation or nothing matches.
    """
    key = (
        name.lower()
        .replace("-", "")
        .replace("_", "")
        .replace(" ", "")
    )
    # An empty key is a substring of every alias; never treat it as a match.
    if not key:
        return None

    exact = REGION_ALIAS_MAP.get(key)
    if exact is not None:
        return exact

    # Aliases embedded in the name; prefer the most specific (longest) one.
    contained = [alias for alias in REGION_ALIAS_MAP if alias in key]
    if contained:
        return REGION_ALIAS_MAP[max(contained, key=len)]

    # Name is a fragment of an alias; keep the map's own order for ties.
    for alias_key, region_name in REGION_ALIAS_MAP.items():
        if key in alias_key:
            return region_name
    return None


def embed_audio(audio_path: str) -> Optional[np.ndarray]:
    """
    Turn an audio file into a single embedding vector.

    The file is read with soundfile, averaged to one channel, resampled to
    EMBED_SAMPLE_RATE when necessary, run through the embedding model, and
    the last hidden state is mean-pooled over dim 1.

    Args:
        audio_path: Path of the audio file to embed.

    Returns:
        The pooled embedding as a numpy array, or None when the embedding
        model is not loaded or any step fails (the error is logged).
    """
    if embed_model is None or embed_feature_extractor is None:
        logger.warning("Embedding model unavailable; cannot embed audio.")
        return None

    try:
        waveform, rate = sf.read(audio_path)

        # Multi-channel audio: average across the channel axis.
        if waveform.ndim > 1:
            waveform = np.mean(waveform, axis=1)

        if rate != EMBED_SAMPLE_RATE:
            waveform = librosa.resample(
                waveform,
                orig_sr=rate,
                target_sr=EMBED_SAMPLE_RATE
            )
            rate = EMBED_SAMPLE_RATE

        features = embed_feature_extractor(
            waveform,
            sampling_rate=rate,
            return_tensors="pt"
        )
        model_inputs = {name: tensor.to(DEVICE) for name, tensor in features.items()}

        with torch.no_grad():
            last_hidden = embed_model(**model_inputs).last_hidden_state
            pooled = last_hidden.mean(dim=1).squeeze()
            embedding = pooled.cpu().numpy()
        return embedding
    except Exception as e:
        logger.error(f"Audio embedding failed: {e}")
        return None


def load_reference_embeddings() -> Tuple[Dict[str, List[np.ndarray]], Dict[str, np.ndarray]]:
    """
    Build per-region reference embeddings from the wav files in DIALECT_REF_DIR.

    Each "*.wav" file is assigned to a region through its file stem, embedded
    with embed_audio, and the vectors of a region are averaged into one
    prototype.

    Returns:
        (embeddings, prototypes): region -> list of vectors, and
        region -> mean vector. Both are empty when the embedding model is
        unavailable or the reference directory does not exist.
    """
    # The embedding handles are module globals; tolerate them being undefined.
    try:
        model_ready = embed_model is not None and embed_feature_extractor is not None
    except NameError:
        logger.debug("Embedding model not defined; reference embeddings disabled.")
        return {}, {}
    if not model_ready:
        logger.warning("Embedding model missing; reference embeddings disabled.")
        return {}, {}

    if not DIALECT_REF_DIR.exists():
        logger.warning(f"Dialect reference directory not found: {DIALECT_REF_DIR}")
        return {}, {}

    embeddings: Dict[str, List[np.ndarray]] = defaultdict(list)
    for ref_file in sorted(DIALECT_REF_DIR.glob("*.wav")):
        region = _resolve_region_from_name(ref_file.stem)
        if not region:
            logger.debug(f"Could not resolve region for reference file {ref_file.name}")
            continue
        vector = embed_audio(str(ref_file))
        if vector is None:
            continue
        embeddings[region].append(vector)

    prototypes: Dict[str, np.ndarray] = {}
    for region, region_vectors in embeddings.items():
        if not region_vectors:
            continue
        prototypes[region] = np.mean(region_vectors, axis=0)
        logger.info(f"Loaded {len(region_vectors)} reference embeddings for {region}")

    if not prototypes:
        logger.warning("No dialect reference prototypes were built.")

    return embeddings, prototypes


# Embedding model disabled - reference embeddings not needed
# Rebinds the two module-level names initialised earlier in this file to
# empty dicts; load_reference_embeddings() is deliberately not called.
# With DIALECT_PROTOTYPES empty, predict_dialect returns its fallback result.
DIALECT_REF_EMBEDDINGS: Dict[str, List[np.ndarray]] = {}
DIALECT_PROTOTYPES: Dict[str, np.ndarray] = {}
logger.debug("Embedding model disabled - skipping reference embeddings loading")


# =========================================
# ZERO-SHOT DIALECT CLASSIFIER
# =========================================
def cosine_similarity(vec_a: np.ndarray, vec_b: np.ndarray) -> float:
    """
    Cosine of the angle between two vectors, as a Python float.

    A small epsilon is added to the denominator, so a zero vector yields 0.0
    instead of a division error.
    """
    norm_product = np.linalg.norm(vec_a) * np.linalg.norm(vec_b)
    dot_product = np.dot(vec_a, vec_b)
    return float(dot_product / (norm_product + 1e-10))


def predict_dialect(audio_path: str) -> Tuple[str, Dict[str, float]]:
    """
    Guess the dialect region by comparing the clip's embedding with every
    prototype in DIALECT_PROTOTYPES.

    Cosine similarity is mapped from [-1, 1] to [0, 1], clamped, and rounded
    to four decimals.

    Args:
        audio_path: Path of the audio file to classify.

    Returns:
        (region, scores). ("Bilinmiyor", {}) when no prototypes exist or the
        clip cannot be embedded.
    """
    if not DIALECT_PROTOTYPES:
        logger.warning("No dialect prototypes available; returning fallback prediction.")
        return "Bilinmiyor", {}

    user_embedding = embed_audio(audio_path)
    if user_embedding is None:
        return "Bilinmiyor", {}

    def _unit_interval(raw_similarity: float) -> float:
        # Shift/scale [-1, 1] -> [0, 1] and clamp.
        return max(0.0, min(1.0, (raw_similarity + 1) / 2))

    scores: Dict[str, float] = {
        region_name: round(_unit_interval(cosine_similarity(user_embedding, prototype_vec)), 4)
        for region_name, prototype_vec in DIALECT_PROTOTYPES.items()
    }

    if not scores:
        return "Bilinmiyor", {}

    best_region = max(scores, key=scores.get)
    return best_region, scores


def generate_reply_text(region: str) -> str:
    """
    Return the reply sentence for a predicted region.

    Known regions get a dedicated sentence. Any other non-empty region except
    "Bilinmiyor" gets a generic sentence naming the region. Everything else
    gets the "could not guess" fallback.
    """
    region_replies = {
        "Karadeniz": "Aaa, sen demek Karadenizlisin! Hızlı ritim ve enerjik ton hemen belli ediyor kendini. 🌊",
        "Doğu Anadolu": "Hmm, Doğu’dan bir hava aldım. Güçlü vurgular ve ağır ritim çok tanıdık. 🏔️",
        "İç Anadolu": "Sende İç Anadolu’nun sakin ve net konuşması var gibi. Rahat ve dengeli. 🚜",
        "Ege": "Ege rüzgarı gibi yumuşak tınlıyor sesin; huzur veren bir anlatım. 🌅",
        "Akdeniz": "Akdeniz’in sıcaklığı ve enerjisi var sesinde, çok hareketli! ☀️",
        "Marmara": "Oldukça dengeli ve şehirli bir ton; Marmara aksanı hissediliyor. 🌆",
        "Güneydoğu Anadolu": "Güneydoğu’nun uzun vurguları ve sıcaklığı geliyor sesinden. 🔥",
    }

    dedicated = region_replies.get(region)
    if dedicated is not None:
        return dedicated

    if not region or region == "Bilinmiyor":
        return "Şive tahmin edemedim ama sesin oldukça ilgi çekici!"

    return f"Sesinde {region} bölgesine benzeyen bir tını var. Çok hoş bir karışım yakalamışsın. 🙂"


def synthesize_elevenlabs(
    text: str,
    speaking_rate: Optional[float] = None,
    pitch: Optional[float] = None
) -> Optional[str]:
    """
    Convert reply text into speech using ElevenLabs and save it to a file.

    Args:
        text: Text to speak. Empty text produces no audio.
        speaking_rate: Optional value forwarded as voice_settings["speaking_rate"].
        pitch: Optional value forwarded as voice_settings["pitch"].

    Returns:
        Path of the written audio file (in the current working directory),
        or None when the text is empty, the client is unavailable, or the
        request fails (the error is logged).
    """
    if not text:
        return None
    if not elevenlabs_client:
        logger.warning("ElevenLabs client unavailable; cannot synthesize audio.")
        return None

    voice_settings: Dict[str, Any] = {
        "stability": 0.4,
        "similarity_boost": 0.8,
    }
    # NOTE(review): confirm that the installed ElevenLabs SDK accepts
    # "speaking_rate" and "pitch" as voice settings; no caller in view sets them.
    if speaking_rate is not None:
        voice_settings["speaking_rate"] = speaking_rate
    if pitch is not None:
        voice_settings["pitch"] = pitch

    try:
        audio = elevenlabs_client.text_to_speech.convert(
            voice_id=ELEVENLABS_VOICE_ID,
            model_id=ELEVENLABS_MODEL_ID,
            text=text,
            voice_settings=voice_settings,
        )
        # No output_format is requested, so the API returns its default
        # format (MP3). save() writes the bytes unchanged, so the extension
        # must say ".mp3"; ".wav" would mislabel the container.
        out_path = f"reply_{uuid4().hex}.mp3"
        save(audio, out_path)
        return out_path
    except Exception as e:
        logger.error(f"ElevenLabs synthesis failed: {e}")
        return None


# =========================================
# AUDIO PROCESSING
# =========================================
def process_audio(audio_data: np.ndarray, sample_rate: int) -> Tuple[np.ndarray, int]:
    """
    Process audio: convert to mono, resample if needed, then peak-normalize.

    Two-dimensional input is accepted in either layout. librosa.to_mono
    averages over the leading axis, i.e. it expects (channels, samples),
    while soundfile.read (used elsewhere in this file) yields
    (samples, channels). An array whose first axis is the longer one is
    therefore transposed before down-mixing.

    Args:
        audio_data: Audio signal as numpy array (1-D mono or 2-D multi-channel)
        sample_rate: Original sample rate

    Returns:
        Processed audio data and sample rate (TARGET_SAMPLE_RATE after resampling)

    Raises:
        ValueError: If any processing step fails; the original exception is
            attached as the cause.
    """
    try:
        # Convert stereo to mono if needed
        if audio_data.ndim > 1:
            # (samples, channels) -> (channels, samples); without this the
            # mean would be taken across time and leave one value per channel.
            if audio_data.ndim == 2 and audio_data.shape[0] > audio_data.shape[1]:
                audio_data = audio_data.T
            audio_data = librosa.to_mono(audio_data)
            logger.info("Converted stereo to mono")

        # Resample to target rate if needed
        if sample_rate != TARGET_SAMPLE_RATE:
            audio_data = librosa.resample(
                audio_data,
                orig_sr=sample_rate,
                target_sr=TARGET_SAMPLE_RATE
            )
            sample_rate = TARGET_SAMPLE_RATE
            logger.info(f"Resampled to {TARGET_SAMPLE_RATE} Hz")

        # Normalize audio
        audio_data = librosa.util.normalize(audio_data)

        return audio_data, sample_rate
    except Exception as e:
        logger.error(f"Error processing audio: {e}")
        raise ValueError(f"Ses işleme hatası: {e}") from e


def validate_audio(
    audio_data: np.ndarray,
    sample_rate: int,
    min_duration: Optional[float] = None
) -> None:
    """
    Validate that the audio is non-empty and long enough.

    Args:
        audio_data: Audio signal
        sample_rate: Sample rate in Hz; must be positive
        min_duration: Minimum accepted duration in seconds. Defaults to
            MIN_AUDIO_DURATION when omitted.

    Raises:
        ValueError: If the audio is empty, the sample rate is not positive,
            or the clip is shorter than the minimum duration.
    """
    if min_duration is None:
        min_duration = MIN_AUDIO_DURATION

    # Check emptiness first: an empty clip also has duration 0 and would
    # otherwise always be reported as "too short", never as empty.
    if len(audio_data) == 0:
        raise ValueError("Ses verisi boş.")

    # Guard the division below so callers only ever see ValueError.
    if sample_rate <= 0:
        raise ValueError(f"Geçersiz örnekleme hızı: {sample_rate}.")

    duration = len(audio_data) / sample_rate

    if duration < min_duration:
        raise ValueError(
            f"Ses süresi en az {min_duration} saniye olmalı. "
            f"Mevcut süre: {duration:.2f} saniye."
        )


# =========================================
# ASR CORE
# =========================================
def run_asr(audio_data: np.ndarray, sample_rate: int) -> str:
    """
    Transcribe audio with the module-level Whisper model.

    Args:
        audio_data: Processed audio signal
        sample_rate: Sample rate of audio_data

    Returns:
        Transcription text (first decoded sequence)

    Raises:
        ValueError: If feature extraction, generation, or decoding fails.
    """
    try:
        # The processor is fed float32 samples regardless of the model dtype.
        features = processor(
            audio_data.astype(np.float32),
            sampling_rate=sample_rate,
            return_tensors="pt"
        ).input_features

        # Match the model's device and dtype (fp16 on GPU, fp32 on CPU).
        features = features.to(device=DEVICE, dtype=DTYPE)

        with torch.no_grad():
            token_ids = model.generate(
                features,
                max_length=400,
                language="tr",
                task="transcribe"
            )

        decoded = processor.batch_decode(token_ids, skip_special_tokens=True)
        hypothesis = decoded[0]

        logger.info(f"ASR output: {hypothesis}")
        return hypothesis
    except Exception as e:
        logger.error(f"ASR error: {e}")
        raise ValueError(f"Konuşma tanıma hatası: {e}")


# =========================================
# DIALECT ANALYSIS
# =========================================
def vowel_shift_score(transcription: str, profile: Dict[str, Any]) -> float:
    """
    Score how strongly the transcription shows a profile's vowel shifts.

    For every "source→target" pattern the occurrences of the target are
    counted in the lower-cased text. A doubled target such as "ee" also
    counts the vowel followed by "ğ" or "y". Counts are expressed per 100
    characters, weighted, and scaled by the total weight.

    Args:
        transcription: ASR transcription
        profile: Dialect profile containing a "vowel_shifts" mapping

    Returns:
        Vowel shift score in [0, 1]
    """
    text = transcription.lower()
    shift_weights = profile["vowel_shifts"]

    weight_sum = sum(shift_weights.values())
    if weight_sum == 0:
        return 0.0

    char_count = max(len(text), 1)
    weighted_total = 0.0

    for pattern, weight in shift_weights.items():
        if "→" not in pattern:
            continue

        _, target = pattern.split("→")

        is_elongation = len(target) > 1 and target[0] == target[1]
        if is_elongation:
            vowel = target[0]
            # Doubled vowel, plus spellings treated as elongation in Turkish.
            hits = sum(text.count(vowel + suffix) for suffix in (vowel, "ğ", "y"))
        else:
            hits = text.count(target)

        # Occurrences per 100 characters, weighted by the pattern weight.
        per_hundred = hits / char_count * 100
        weighted_total += per_hundred * weight

    scaled = weighted_total / (weight_sum * 10 + 1e-6)
    return min(scaled, 1.0)


def marker_score(transcription: str, profile: Dict[str, Any]) -> float:
    """
    Score lexical markers in transcription.

    Markers are matched as whole words/phrases: a marker only counts when it
    is not directly preceded or followed by a word character. Plain substring
    matching would make short markers such as "la" fire inside ordinary words
    ("selam", "olarak", "plan") and inflate that region's score.

    Args:
        transcription: ASR transcription
        profile: Dialect profile containing a "markers" list

    Returns:
        Marker score [0, 1]: the share of distinct markers found, boosted by
        20% (capped at 1.0) when the total number of occurrences exceeds the
        number of markers.
    """
    import re  # function-scope import: this module does not import `re` at top level

    markers = profile["markers"]
    if not markers:
        return 0.0

    text = transcription.lower()

    # Non-overlapping whole-word occurrences per marker. \w is Unicode-aware
    # for str patterns, so Turkish letters count as word characters.
    occurrences = [
        len(re.findall(rf"(?<!\w){re.escape(marker)}(?!\w)", text))
        for marker in markers
    ]

    # Score based on proportion of markers found
    matches = sum(1 for count in occurrences if count)
    score = matches / len(markers)

    # Bonus for multiple occurrences
    if sum(occurrences) > len(markers):
        score = min(score * 1.2, 1.0)

    return score


def _extract_prosody_features(
    audio_data: np.ndarray,
    sample_rate: int
) -> Optional[Tuple[float, float, float]]:
    """
    Measure the profile-independent prosody features of a clip.

    Returns:
        (tempo, avg_pitch, pitch_std), or None when the analysis fails
        (the error is logged).
    """
    try:
        audio_normalized = librosa.util.normalize(audio_data)

        # Tempo analysis
        tempo = float(librosa.beat.tempo(y=audio_normalized, sr=sample_rate)[0])

        # Pitch analysis (fundamental frequency)
        pitches, magnitudes = librosa.piptrack(
            y=audio_normalized,
            sr=sample_rate,
            threshold=0.1
        )

        # Per frame, take the pitch of the strongest bin; keep positive values only.
        strongest_bins = magnitudes.argmax(axis=0)
        frame_pitches = pitches[strongest_bins, np.arange(pitches.shape[1])]
        voiced = frame_pitches[frame_pitches > 0]

        avg_pitch = float(np.mean(voiced)) if voiced.size else 0.0
        pitch_std = float(np.std(voiced)) if voiced.size > 1 else 0.0
        return tempo, avg_pitch, pitch_std
    except Exception as e:
        logger.warning(f"Prosody analysis error: {e}")
        return None


def _score_prosody_features(
    features: Optional[Tuple[float, float, float]],
    profile: Dict[str, Any]
) -> float:
    """
    Score measured prosody features against one dialect profile.

    Returns:
        Prosody score [0, 1]; 0.5 (neutral) when features is None or the
        profile cannot be scored.
    """
    if features is None:
        return 0.5  # Default neutral score

    try:
        tempo, avg_pitch, pitch_std = features

        prosody_type = profile["prosody"]
        tempo_min, tempo_max = profile.get("tempo_range", (80, 120))
        pitch_range_type = profile.get("pitch_range", "neutral")

        # Tempo scoring: full marks inside the range, falling off with distance.
        if tempo_min <= tempo <= tempo_max:
            tempo_score = 1.0
        elif tempo < tempo_min:
            tempo_score = max(0.0, tempo / tempo_min)
        else:
            tempo_score = max(0.0, 1.0 - (tempo - tempo_max) / tempo_max)

        # Pitch scoring based on profile
        pitch_score = 0.5  # default
        if pitch_range_type in ("high", "high-oscillating"):
            if avg_pitch > 200:
                pitch_score = 1.0
            elif avg_pitch > 150:
                pitch_score = 0.7
        elif pitch_range_type in ("low", "low-elongated"):
            if avg_pitch < 150:
                pitch_score = 1.0
            elif avg_pitch < 200:
                pitch_score = 0.7
        else:  # neutral
            if 150 <= avg_pitch <= 250:
                pitch_score = 1.0

        # Oscillation only contributes for profiles flagged as oscillating.
        oscillating = "oscillating" in pitch_range_type or "inişli-çıkışlı" in prosody_type
        if not oscillating:
            final_score = tempo_score * 0.6 + pitch_score * 0.4
        else:
            oscillation_score = 0.5
            if pitch_std > 50:
                oscillation_score = 1.0
            elif pitch_std > 30:
                oscillation_score = 0.7
            final_score = (tempo_score * 0.4 + pitch_score * 0.3 + oscillation_score * 0.3)

        return min(final_score, 1.0)
    except Exception as e:
        logger.warning(f"Prosody analysis error: {e}")
        return 0.5  # Default neutral score


def prosody_score(
    audio_data: np.ndarray,
    sample_rate: int,
    profile: Dict[str, Any]
) -> float:
    """
    Analyze prosody: tempo, pitch characteristics.

    Args:
        audio_data: Audio signal
        sample_rate: Sample rate
        profile: Dialect profile

    Returns:
        Prosody score [0, 1]; 0.5 when the analysis fails
    """
    features = _extract_prosody_features(audio_data, sample_rate)
    return _score_prosody_features(features, profile)


def dialect_similarity(
    transcription: str,
    audio_data: np.ndarray,
    sample_rate: int
) -> Tuple[Dict[str, float], List[Tuple[str, float]]]:
    """
    Calculate dialect similarity scores for all regions.

    Each region's score is the weighted sum of its vowel-shift, marker and
    prosody scores. A region whose scoring raises gets 0.0.

    Args:
        transcription: ASR transcription
        audio_data: Audio signal
        sample_rate: Sample rate

    Returns:
        Dictionary of scores and predictions sorted by descending score
    """
    scores: Dict[str, float] = {}

    # Tempo and pitch depend only on the audio, so measure them once instead
    # of repeating the analysis for every region.
    prosody_features = _extract_prosody_features(audio_data, sample_rate)

    for region, profile in DIALECT_PROFILES.items():
        try:
            vowel_score = vowel_shift_score(transcription, profile)
            marker_score_val = marker_score(transcription, profile)
            prosody_score_val = _score_prosody_features(prosody_features, profile)

            # Weighted combination
            combined_score = (
                vowel_score * VOWEL_SHIFT_WEIGHT +
                marker_score_val * MARKER_WEIGHT +
                prosody_score_val * PROSODY_WEIGHT
            )

            scores[region] = round(combined_score, 3)

            logger.info(
                f"{region}: vowel={vowel_score:.3f}, "
                f"marker={marker_score_val:.3f}, "
                f"prosody={prosody_score_val:.3f}, "
                f"combined={combined_score:.3f}"
            )
        except Exception as e:
            logger.error(f"Error calculating score for {region}: {e}")
            scores[region] = 0.0

    # Sort by score
    sorted_predictions = sorted(
        scores.items(),
        key=lambda x: x[1],
        reverse=True
    )

    return scores, sorted_predictions


# =========================================
# VISUALIZATION
# =========================================
def plot_region_heatmap(
    scores: Dict[str, float],
    highlight_region: Optional[str] = None
) -> go.Figure:
    """
    Draw the per-region dialect scores as a choropleth map of Türkiye.

    Args:
        scores: Region name -> similarity score. Names are matched against
            "properties.name" in TURKEY_REGIONS_GEOJSON.
        highlight_region: Region to outline in black and name in an
            annotation; ignored when it is not a key of scores.

    Returns:
        The map figure, or an empty figure titled "Harita yüklenemedi" when
        scores is empty or plotting fails (the error is logged).
    """
    try:
        if not scores:
            raise ValueError("Score verisi yok")

        score_frame = pd.DataFrame({
            "region_name": list(scores.keys()),
            "score": list(scores.values()),
        })

        lowest = float(score_frame["score"].min())
        highest = float(score_frame["score"].max())
        # Avoid a zero-width colour range when every score is equal.
        if lowest == highest:
            highest = lowest + 0.01

        fig = px.choropleth_mapbox(
            score_frame,
            geojson=TURKEY_REGIONS_GEOJSON,
            locations="region_name",
            featureidkey="properties.name",
            color="score",
            color_continuous_scale="OrRd",
            range_color=(lowest, highest),
            mapbox_style="carto-positron",
            zoom=4.5,
            center={"lat": 39.0, "lon": 35.0},
            opacity=0.7,
            labels={"score": "Benzerlik"},
        )

        fig.update_traces(marker_line_width=0.5, marker_line_color="white")

        should_highlight = bool(highlight_region) and highlight_region in score_frame["region_name"].values
        if should_highlight:
            selected = score_frame[score_frame["region_name"] == highlight_region]
            transparent = "rgba(0,0,0,0)"
            # Transparent overlay whose only visible part is the thick outline.
            fig.add_choroplethmapbox(
                geojson=TURKEY_REGIONS_GEOJSON,
                locations=selected["region_name"],
                z=np.ones(len(selected)),
                featureidkey="properties.name",
                colorscale=[[0, transparent], [1, transparent]],
                showscale=False,
                marker_opacity=0,
                marker_line_width=3,
                marker_line_color="black",
                hovertext=selected["region_name"],
                name="Tahmin",
            )

            fig.add_annotation(
                text=f"🗣 Tahmin: {highlight_region}",
                x=0.5,
                y=0.02,
                xref="paper",
                yref="paper",
                showarrow=False,
                bgcolor="white",
                bordercolor="black",
                borderwidth=1,
                font=dict(size=14),
            )

        fig.update_layout(
            margin=dict(l=10, r=10, t=40, b=10),
            height=600,
            coloraxis_colorbar=dict(title="Benzerlik"),
        )

        return fig
    except Exception as e:
        logger.error(f"Error creating heatmap: {e}")
        placeholder = go.Figure()
        placeholder.update_layout(
            title="Harita yüklenemedi",
            height=600
        )
        return placeholder


# =========================================
# MAIN PIPELINE
# =========================================
def analyze_and_reply(
    audio_path: Optional[str]
) -> Tuple[str, str, str, Optional[str], go.Figure]:
    """
    Full processing pipeline: audio → ASR → dialect analysis → TTS reply.

    Args:
        audio_path: Path of the recorded/uploaded audio file, or None.

    Returns:
        (transcript, predicted_region, reply_text, reply_audio_path, heatmap).
        On any input, validation or ASR failure the first element carries the
        error message instead, the next two are empty strings, the audio path
        is None and the figure is an empty placeholder.
    """
    def build_empty_fig(title: str = "Harita yüklenemedi") -> go.Figure:
        fig = go.Figure()
        fig.update_layout(title=title, height=600)
        return fig

    def failure(message: str) -> Tuple[str, str, str, Optional[str], go.Figure]:
        # Single place that builds the error-shaped result tuple.
        return message, "", "", None, build_empty_fig()

    logger.info(f"Received audio_path: {audio_path}")

    if audio_path is None:
        logger.warning("Audio input is None.")
        return failure("Ses alınamadı. Lütfen tekrar deneyin.")

    # Check if file exists
    if not os.path.exists(audio_path):
        logger.error(f"Audio file does not exist: {audio_path}")
        return failure(f"Ses dosyası bulunamadı: {audio_path}")

    try:
        logger.info(f"Reading audio file: {audio_path}")
        audio_data, sample_rate = sf.read(audio_path)
        logger.info(f"Audio file read successfully. Duration: {len(audio_data)/sample_rate:.2f}s, Sample rate: {sample_rate}Hz")
        if audio_data.ndim > 1:
            # soundfile yields (samples, channels); librosa.to_mono wants (channels, samples).
            audio_data = librosa.to_mono(audio_data.T)
        audio_data = np.asarray(audio_data, dtype=np.float32)
    except Exception as e:
        logger.error(f"Error reading audio file: {e}")
        return failure(f"Ses dosyası okunamadı: {e}")

    try:
        processed_audio, processed_sr = process_audio(audio_data, sample_rate)
        validate_audio(processed_audio, processed_sr)
    except ValueError as e:
        logger.error(f"Audio validation error: {e}")
        return failure(str(e))

    try:
        transcript = run_asr(processed_audio, processed_sr)
        logger.info(f"ASR transcript: {transcript}")
    except ValueError as e:
        logger.error(f"ASR error: {e}")
        return failure(str(e))

    # Use transcription-based dialect similarity analysis
    similarity_scores, sorted_predictions = dialect_similarity(
        transcript, processed_audio, processed_sr
    )

    if similarity_scores and sorted_predictions:
        # Transcription-based prediction is preferred whenever it produced scores.
        predicted_region, top_score = sorted_predictions[0]
        scores = similarity_scores
        logger.info(f"Using transcription-based prediction: {predicted_region} (score: {top_score:.4f})")

        # Log top 3 predictions for debugging
        if len(sorted_predictions) >= 3:
            logger.info(f"Top 3 predictions: {[(r, f'{s:.4f}') for r, s in sorted_predictions[:3]]}")
    else:
        # Only consult the embedding classifier when it is actually needed;
        # with no prototypes loaded it just logs a warning and returns nothing.
        embedding_region, embedding_scores = predict_dialect(audio_path)
        if embedding_scores and embedding_region != "Bilinmiyor" and max(embedding_scores.values()) > 0.01:
            predicted_region = embedding_region
            scores = embedding_scores
            logger.info(f"Using embedding-based prediction: {predicted_region} (score: {max(embedding_scores.values()):.4f})")
        else:
            # Absolute last resort: use first region from DIALECT_PROFILES
            predicted_region = next(iter(DIALECT_PROFILES), "Bilinmiyor")
            scores = {region: 0.1 for region in DIALECT_PROFILES}
            logger.error(f"All prediction methods failed, using fallback: {predicted_region}")

    reply_text = generate_reply_text(predicted_region)
    reply_audio_path = synthesize_elevenlabs(reply_text)
    heatmap_fig = plot_region_heatmap(scores, highlight_region=predicted_region if scores else None)

    return (
        transcript,
        predicted_region,
        reply_text,
        reply_audio_path,
        heatmap_fig
    )


# =========================================
# UI — Ultra Modern Apple Glassmorphism Design
# =========================================
# Custom stylesheet handed to gr.Blocks(css=...) in build_ui().
#
# Maintenance notes:
# - Each selector is declared once. The previous duplicate `.markdown p` rule
#   and the second, non-!important `font-size` in `.label` were dead
#   declarations (overridden by earlier !important values) and have been
#   folded away without changing the effective cascade.
# - NOTE(review): the `h1` rule animates a gradient background clipped to the
#   text, but `-webkit-text-fill-color` is a solid colour, so the shimmer is
#   presumably not visible — confirm before relying on it.
# - The `float`, `pulse` and `spin` keyframes are defined but not referenced
#   by any rule in this stylesheet.
CSS = """
* {
    box-sizing: border-box;
    margin: 0;
    padding: 0;
}

@keyframes float {
    0%, 100% { transform: translateY(0px); }
    50% { transform: translateY(-10px); }
}

@keyframes shimmer {
    0% { background-position: -1000px 0; }
    100% { background-position: 1000px 0; }
}

@keyframes pulse {
    0%, 100% { opacity: 1; }
    50% { opacity: 0.7; }
}

body {
    background:
        radial-gradient(circle at 20% 50%, rgba(120, 119, 198, 0.15) 0%, transparent 50%),
        radial-gradient(circle at 80% 80%, rgba(255, 119, 198, 0.1) 0%, transparent 50%),
        radial-gradient(circle at 40% 20%, rgba(99, 102, 241, 0.1) 0%, transparent 50%),
        linear-gradient(135deg, #F8F9FA 0%, #E9ECEF 50%, #F1F3F5 100%) !important;
    font-family: -apple-system, BlinkMacSystemFont, "SF Pro Display", "SF Pro Text", "Segoe UI", sans-serif;
    -webkit-font-smoothing: antialiased;
    -moz-osx-font-smoothing: grayscale;
    min-height: 100vh;
    position: relative;
    overflow-x: hidden;
}

body::before {
    content: '';
    position: fixed;
    top: 0;
    left: 0;
    right: 0;
    bottom: 0;
    background:
        radial-gradient(circle at 20% 30%, rgba(99, 102, 241, 0.08) 0%, transparent 50%),
        radial-gradient(circle at 80% 70%, rgba(168, 85, 247, 0.06) 0%, transparent 50%);
    pointer-events: none;
    z-index: 0;
}

.gradio-container {
    background: transparent !important;
    max-width: 1500px !important;
    margin: 0 auto !important;
    padding: 60px 30px !important;
    position: relative;
    z-index: 1;
}

h1 {
    font-weight: 800 !important;
    letter-spacing: -2.5px !important;
    color: #1D1D1F !important;
    margin: 0 !important;
    background: linear-gradient(135deg, #1D1D1F 0%, #4A5568 50%, #1D1D1F 100%) !important;
    background-size: 200% auto !important;
    -webkit-background-clip: text !important;
    -webkit-text-fill-color: #1D1D1F !important;
    background-clip: text !important;
    animation: shimmer 3s linear infinite !important;
    opacity: 1 !important;
    z-index: 10 !important;
    position: relative !important;
    visibility: visible !important;
}

.card {
    background: rgba(255, 255, 255, 0.85) !important;
    backdrop-filter: blur(30px) saturate(180%) !important;
    -webkit-backdrop-filter: blur(30px) saturate(180%) !important;
    padding: 28px !important;
    border-radius: 20px !important;
    border: 1px solid rgba(0, 0, 0, 0.08) !important;
    margin-bottom: 20px !important;
    box-shadow:
        0 8px 32px rgba(0, 0, 0, 0.06),
        0 4px 16px rgba(0, 0, 0, 0.04),
        0 2px 8px rgba(0, 0, 0, 0.03),
        inset 0 1px 0 rgba(255, 255, 255, 0.9) !important;
    transition: all 0.3s cubic-bezier(0.4, 0, 0.2, 1) !important;
    position: relative;
    overflow: hidden;
}

.card::before {
    content: '';
    position: absolute;
    top: 0;
    left: -100%;
    width: 100%;
    height: 100%;
    background: linear-gradient(90deg, transparent, rgba(255, 255, 255, 0.4), transparent);
    transition: left 0.5s;
}

.card:hover::before {
    left: 100%;
}

.card:hover {
    transform: translateY(-4px) scale(1.01) !important;
    box-shadow:
        0 28px 80px rgba(0, 0, 0, 0.12),
        0 12px 32px rgba(0, 0, 0, 0.08),
        0 4px 12px rgba(0, 0, 0, 0.06),
        inset 0 1px 0 rgba(255, 255, 255, 1),
        inset 0 -1px 0 rgba(255, 255, 255, 0.6) !important;
    border-color: rgba(255, 255, 255, 1) !important;
}

.label {
    font-weight: 700 !important;
    color: #1D1D1F !important;
    margin-bottom: 14px !important;
    font-size: 15px !important;
    letter-spacing: -0.3px !important;
    text-transform: uppercase;
    opacity: 0.8;
}

.textbox textarea,
.textbox input,
.dropdown select {
    background: rgba(255, 255, 255, 0.95) !important;
    backdrop-filter: blur(20px) saturate(180%) !important;
    -webkit-backdrop-filter: blur(20px) saturate(180%) !important;
    border: 1.5px solid rgba(0, 0, 0, 0.06) !important;
    border-radius: 16px !important;
    color: #1D1D1F !important;
    padding: 16px 20px !important;
    font-size: 15px !important;
    font-weight: 500 !important;
    transition: all 0.3s cubic-bezier(0.4, 0, 0.2, 1) !important;
    box-shadow:
        0 4px 12px rgba(0, 0, 0, 0.04),
        inset 0 1px 2px rgba(0, 0, 0, 0.02) !important;
}

.textbox:focus-within,
.dropdown:focus-within {
    border-color: #007AFF !important;
    box-shadow:
        0 8px 24px rgba(0, 122, 255, 0.2),
        0 4px 12px rgba(0, 122, 255, 0.15),
        inset 0 1px 2px rgba(0, 122, 255, 0.1) !important;
    transform: translateY(-1px);
}

button.primary {
    background: linear-gradient(135deg, #007AFF 0%, #0051D5 50%, #007AFF 100%) !important;
    background-size: 200% auto !important;
    border: none !important;
    border-radius: 18px !important;
    font-weight: 700 !important;
    padding: 18px 40px !important;
    font-size: 17px !important;
    color: white !important;
    letter-spacing: -0.2px !important;
    box-shadow:
        0 8px 24px rgba(0, 122, 255, 0.4),
        0 4px 12px rgba(0, 122, 255, 0.3),
        inset 0 1px 0 rgba(255, 255, 255, 0.3),
        inset 0 -1px 0 rgba(0, 0, 0, 0.1) !important;
    transition: all 0.3s cubic-bezier(0.34, 1.56, 0.64, 1) !important;
    cursor: pointer !important;
    position: relative;
    overflow: hidden;
    text-transform: none !important;
}

button.primary::before {
    content: '';
    position: absolute;
    top: 0;
    left: -100%;
    width: 100%;
    height: 100%;
    background: linear-gradient(90deg, transparent, rgba(255, 255, 255, 0.3), transparent);
    transition: left 0.5s;
}

button.primary:hover::before {
    left: 100%;
}

button.primary:hover {
    transform: translateY(-3px) scale(1.02) !important;
    box-shadow:
        0 12px 32px rgba(0, 122, 255, 0.5),
        0 6px 16px rgba(0, 122, 255, 0.4),
        inset 0 1px 0 rgba(255, 255, 255, 0.4),
        inset 0 -1px 0 rgba(0, 0, 0, 0.15) !important;
    background-position: right center !important;
}

button.primary:active {
    transform: translateY(-1px) scale(1.01) !important;
    box-shadow:
        0 4px 16px rgba(0, 122, 255, 0.4),
        inset 0 1px 0 rgba(255, 255, 255, 0.2) !important;
}

.json {
    font-family: "SF Mono", "Monaco", "Menlo", "Courier New", monospace !important;
    font-size: 13px !important;
    background: rgba(248, 249, 250, 0.9) !important;
    backdrop-filter: blur(20px) saturate(180%) !important;
    -webkit-backdrop-filter: blur(20px) saturate(180%) !important;
    border: 1px solid rgba(0, 0, 0, 0.05) !important;
    border-radius: 16px !important;
    padding: 24px !important;
    color: #1D1D1F !important;
    line-height: 1.7 !important;
    box-shadow:
        inset 0 2px 8px rgba(0, 0, 0, 0.03),
        inset 0 1px 2px rgba(0, 0, 0, 0.02) !important;
}

.markdown {
    color: #1D1D1F !important;
}

.markdown * {
    visibility: visible !important;
    opacity: 1 !important;
    display: block !important;
}

.markdown div {
    display: block !important;
    visibility: visible !important;
    opacity: 1 !important;
    color: inherit !important;
}

.markdown h1, .header-markdown h1, .main-title {
    color: #1D1D1F !important;
    margin-bottom: 16px !important;
    margin-top: 50px !important;
    font-size: 3.5rem !important;
    font-weight: 800 !important;
    letter-spacing: -2px !important;
    line-height: 1.2 !important;
    text-shadow: 0 2px 8px rgba(0,0,0,0.1) !important;
    -webkit-text-fill-color: #1D1D1F !important;
    background: none !important;
    background-image: none !important;
    opacity: 1 !important;
    z-index: 10 !important;
    position: relative !important;
    visibility: visible !important;
    display: block !important;
    text-align: center !important;
}

.header-markdown {
    text-align: center !important;
}

.header-markdown p {
    color: #6E6E73 !important;
    font-size: 1.15rem !important;
    margin-top: 8px !important;
    opacity: 0.9 !important;
}

.markdown h1 span {
    color: #1D1D1F !important;
    -webkit-text-fill-color: #1D1D1F !important;
    background: none !important;
    display: inline-block !important;
}

.markdown p {
    display: block !important;
    visibility: visible !important;
    opacity: 1 !important;
    color: #6E6E73 !important;
    margin: 0 !important;
    font-size: 1.15rem;
    font-weight: 400;
    line-height: 1.6;
    letter-spacing: -0.2px;
}

.instruction-text {
    display: block !important;
    visibility: visible !important;
    text-align: center !important;
    margin-top: -20px !important;
    margin-bottom: 40px !important;
    color: #6E6E73 !important;
    font-size: 1.1rem !important;
    opacity: 0.9 !important;
    padding: 0 20px !important;
}

.instruction-text p {
    display: block !important;
    visibility: visible !important;
    opacity: 1 !important;
    color: #6E6E73 !important;
    margin: 0 !important;
}

.header-container {
    display: block !important;
    visibility: visible !important;
    opacity: 1 !important;
}

.header-container h1 {
    display: block !important;
    visibility: visible !important;
    opacity: 1 !important;
}

.header-container p {
    display: block !important;
    visibility: visible !important;
    opacity: 1 !important;
}

/* HTML component styles */
.html-component, .html-component * {
    display: block !important;
    visibility: visible !important;
    opacity: 1 !important;
}

.html-component h1, .html-component .main-title {
    color: #1D1D1F !important;
    -webkit-text-fill-color: #1D1D1F !important;
    background: none !important;
    background-image: none !important;
    display: block !important;
}

.html-component p {
    display: block !important;
    visibility: visible !important;
    color: #6E6E73 !important;
}

.audio-component {
    background: rgba(255, 255, 255, 0.95) !important;
    backdrop-filter: blur(30px) saturate(200%) !important;
    -webkit-backdrop-filter: blur(30px) saturate(200%) !important;
    border-radius: 20px !important;
    border: 1.5px solid rgba(255, 255, 255, 0.8) !important;
    padding: 20px !important;
    box-shadow:
        0 8px 24px rgba(0, 0, 0, 0.06),
        0 4px 12px rgba(0, 0, 0, 0.04),
        inset 0 1px 0 rgba(255, 255, 255, 0.9) !important;
    transition: all 0.3s ease !important;
}

.audio-component:hover {
    box-shadow:
        0 12px 32px rgba(0, 0, 0, 0.08),
        0 6px 16px rgba(0, 0, 0, 0.06),
        inset 0 1px 0 rgba(255, 255, 255, 1) !important;
}

/* Ultra smooth scrollbar */
::-webkit-scrollbar {
    width: 10px;
    height: 10px;
}

::-webkit-scrollbar-track {
    background: rgba(0, 0, 0, 0.02);
    border-radius: 10px;
}

::-webkit-scrollbar-thumb {
    background: linear-gradient(135deg, rgba(0, 122, 255, 0.3), rgba(0, 81, 213, 0.4));
    border-radius: 10px;
    border: 2px solid transparent;
    background-clip: padding-box;
}

::-webkit-scrollbar-thumb:hover {
    background: linear-gradient(135deg, rgba(0, 122, 255, 0.5), rgba(0, 81, 213, 0.6));
    background-clip: padding-box;
}

/* Loading animation */
@keyframes spin {
    from { transform: rotate(0deg); }
    to { transform: rotate(360deg); }
}

/* Enhanced focus states */
*:focus-visible {
    outline: 2px solid #007AFF;
    outline-offset: 2px;
    border-radius: 4px;
}
"""


def build_ui() -> gr.Blocks:
    """
    Build Gradio UI with Apple minimal white + smooth glass design.

    The page consists of a header, an input column (audio recorder/uploader
    and an analyze button), a results column (transcript, predicted region,
    text reply and spoken reply) and a full-width region plot. Analysis is
    wired to both the button click and the audio component's change event.

    Autoplay of the spoken reply is handled by the component's own
    ``autoplay=True`` plus two small JavaScript fallbacks. Both fallbacks are
    scoped to the reply component (via its ``elem_id``) so they never start
    playback of the user's own recording, and they only call ``play()`` on an
    element that is currently paused so an already-playing reply is not
    restarted.

    Returns:
        Gradio Blocks interface
    """
    with gr.Blocks(
        css=CSS,
        fill_height=True,
        theme=gr.themes.Soft()
    ) as demo:

        gr.Markdown(
            """
            # 🇹🇷 Dialect Intelligence Engine
            
            Powered by Meta Omnilingual ASR & Whisper Large-v3
            """,
            elem_classes="header-markdown"
        )

        gr.Markdown(
            """
            Mikrofona bas, doğal bir şekilde konuş. Sistem şiveni analiz edip seni haritada işaretlesin ve AI sesiyle cevap versin.
            """,
            elem_classes="instruction-text"
        )

        with gr.Row(equal_height=False):
            with gr.Column(scale=1, min_width=400):
                audio_input = gr.Audio(
                    sources=["microphone", "upload"],
                    type="filepath",
                    label="🎤 Mikrofona basın, konuşun, kaydı durdurun",
                    show_label=True,
                    interactive=True,
                    elem_classes="card"
                )
                analyze_button = gr.Button(
                    "🔍 Analiz Et ve Şive Tahmini Yap",
                    variant="primary",
                    elem_classes="primary",
                    visible=True,
                    scale=1
                )
                gr.Markdown(
                    "📝 Ses kaydını tamamladıktan sonra butona tıklayın",
                    elem_classes="instruction-text"
                )

            with gr.Column(scale=2, min_width=600):
                transcript_output = gr.Textbox(
                    label="Transcript",
                    lines=4,
                    interactive=False,
                    placeholder="Konuşmanı bekliyorum...",
                    elem_classes="card"
                )

                with gr.Row():
                    predicted_dialect = gr.Textbox(
                        label="Tahmin Edilen Bölge",
                        interactive=False,
                        lines=2,
                        elem_classes="card"
                    )
                    reply_text_output = gr.Textbox(
                        label="Model Cevabı (Metin)",
                        interactive=False,
                        lines=2,
                        elem_classes="card"
                    )

                # elem_id must match the id used by the JavaScript snippets
                # below ('reply-audio'); it lets them target this component
                # only instead of every <audio> element on the page.
                reply_audio_output = gr.Audio(
                    label="Model Cevabı (Ses)",
                    type="filepath",
                    interactive=False,
                    autoplay=True,
                    elem_id="reply-audio",
                    elem_classes="card"
                )

        region_map = gr.Plot(
            label="Bölgesel Harita Isı Dağılımı",
            elem_classes="card"
        )

        def build_empty_fig_ui():
            """Build the placeholder figure shown when there is no audio to analyze."""
            fig = go.Figure()
            fig.update_layout(title="Harita yüklenemedi", height=600)
            return fig

        # NOTE: the function name is kept as-is because Gradio derives the
        # default endpoint name of an event from the handler's name.
        def analyze_and_reply_with_autoplay(audio_path):
            """
            Run the analysis for a recorded/uploaded file.

            Returns empty outputs and a placeholder figure when no audio is
            present (e.g. the input was cleared); otherwise forwards the
            result of analyze_and_reply unchanged.
            """
            logger.info(f"analyze_and_reply_with_autoplay called with audio_path: {audio_path}")
            if audio_path is None:
                logger.warning("audio_path is None in wrapper")
                return "", "", "", None, build_empty_fig_ui()
            # Playback itself is handled client-side (autoplay=True + JS below)
            return analyze_and_reply(audio_path)

        # Output components, in the order analyze_and_reply returns its values
        analysis_outputs = [
            transcript_output,
            predicted_dialect,
            reply_text_output,
            reply_audio_output,
            region_map
        ]

        # Both button click and audio change trigger analysis.
        # NOTE(review): because the change event already runs the analysis,
        # clicking the button afterwards runs it a second time on the same
        # file — confirm this duplication is intended.
        analyze_button.click(
            fn=analyze_and_reply_with_autoplay,
            inputs=audio_input,
            outputs=analysis_outputs
        )

        # Also trigger on change (for file uploads and when recording stops)
        audio_input.change(
            fn=analyze_and_reply_with_autoplay,
            inputs=audio_input,
            outputs=analysis_outputs
        )

        # Fallback 1: when the reply component receives a new <audio> element,
        # start it once. Restricted to #reply-audio so the microphone/upload
        # preview is never auto-played, and flagged via data-autoplayed so the
        # same element is not triggered again by unrelated DOM changes.
        demo.load(
            fn=None,
            js="""
            function() {
                const observer = new MutationObserver(function() {
                    const root = document.getElementById('reply-audio');
                    const audio = root ? root.querySelector('audio') : null;
                    if (audio && audio.src && !audio.hasAttribute('data-autoplayed')) {
                        audio.setAttribute('data-autoplayed', 'true');
                        if (audio.paused) {
                            audio.play().catch(e => console.log('Autoplay prevented:', e));
                        }
                    }
                });
                
                observer.observe(document.body, {
                    childList: true,
                    subtree: true
                });
            }
            """
        )

        # Fallback 2: after the reply value changes, give the element time to
        # load and start it from the beginning — but only if nothing else has
        # already started playback, so a playing reply is not cut and restarted.
        reply_audio_output.change(
            fn=None,
            inputs=None,
            outputs=None,
            js="""
            function() {
                setTimeout(function() {
                    const root = document.getElementById('reply-audio');
                    const audioElement = root ? root.querySelector('audio') : null;
                    
                    if (audioElement && audioElement.src && audioElement.paused) {
                        audioElement.currentTime = 0;
                        const playPromise = audioElement.play();
                        if (playPromise !== undefined) {
                            playPromise.catch(function(error) {
                                console.log('Autoplay prevented by browser:', error);
                            });
                        }
                    }
                }, 800); // Wait for audio to be fully loaded
                return [];
            }
            """
        )

    return demo


# =========================================
# MAIN
# =========================================
# Built at import time so the Blocks object is available as a module-level name.
demo = build_ui()

if __name__ == "__main__":
    # Listen on all interfaces, port 7860, without a public share link.
    # ssr_mode=False is kept from the original: "Fix for HF Spaces microphone bug".
    launch_kwargs = dict(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        ssr_mode=False,
    )
    demo.launch(**launch_kwargs)