# ========================================= # ENV FIXES # ========================================= import os os.environ["OMP_NUM_THREADS"] = "1" # libgomp hatası fix # ========================================= # IMPORTS # ========================================= import logging from pathlib import Path from typing import Dict, Tuple, List, Optional, Any from collections import defaultdict from uuid import uuid4 import numpy as np import librosa import soundfile as sf import torch from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq, AutoFeatureExtractor, AutoModel import plotly.graph_objects as go import plotly.express as px import pandas as pd import gradio as gr from elevenlabs import ElevenLabs, save from regions_geojson import TURKEY_REGIONS_GEOJSON # ========================================= # LOGGING SETUP # ========================================= logging.basicConfig( level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s" ) logger = logging.getLogger(__name__) # ========================================= # CONSTANTS # ========================================= MODEL_ID = "openai/whisper-large-v3" MIN_AUDIO_DURATION = 3.0 # seconds VOWEL_SHIFT_WEIGHT = 0.35 MARKER_WEIGHT = 0.40 PROSODY_WEIGHT = 0.25 FAST_TEMPO_THRESHOLD = 140.0 SLOW_TEMPO_THRESHOLD = 80.0 TARGET_SAMPLE_RATE = 16000 EMBED_MODEL_ID = "facebook/wav2vec2-large-xlsr-53" # Turkish finetuning otomatik algılanır EMBED_SAMPLE_RATE = 16000 DIALECT_REF_DIR = Path("data/dialects") ELEVENLABS_VOICE_ID = "Q5n6GDIjpN0pLOlycRFT" ELEVENLABS_MODEL_ID = "eleven_multilingual_v2" # ========================================= # DEVICE CONFIGURATION # ========================================= DEVICE = "cuda" if torch.cuda.is_available() else "cpu" DTYPE = torch.float16 if DEVICE == "cuda" else torch.float32 logger.info(f"Using device: {DEVICE}, dtype: {DTYPE}") # ========================================= # MODEL INITIALIZATION # ========================================= try: processor = AutoProcessor.from_pretrained(MODEL_ID) model = AutoModelForSpeechSeq2Seq.from_pretrained( MODEL_ID, torch_dtype=DTYPE ) model = model.to(DEVICE) model.eval() logger.info("Model loaded successfully") except Exception as e: logger.error(f"Error loading model: {e}") raise # ========================================= # EMBEDDING MODEL INITIALIZATION # Note: Embedding model is disabled - we use transcription-based dialect analysis instead # ========================================= embed_feature_extractor = None embed_model = None logger.debug("Embedding model disabled - using transcription-based analysis only") DIALECT_REF_EMBEDDINGS: Dict[str, List[np.ndarray]] = defaultdict(list) DIALECT_PROTOTYPES: Dict[str, np.ndarray] = {} # ========================================= # ELEVENLABS CLIENT # ========================================= ELEVENLABS_API_KEY = os.environ.get("ELEVENLABS_API_KEY") if ELEVENLABS_API_KEY: try: elevenlabs_client = ElevenLabs(api_key=ELEVENLABS_API_KEY) logger.info("ElevenLabs client initialized") except Exception as e: elevenlabs_client = None logger.warning(f"Failed to initialize ElevenLabs client: {e}") else: elevenlabs_client = None logger.warning("ELEVENLABS_API_KEY not found. Voice replies will be disabled.") # ========================================= # DIALECT PROFILES # ========================================= DIALECT_PROFILES: Dict[str, Dict[str, Any]] = { "Marmara": { "description": "İstanbul-Trakya şehir aksanı, düz prosodi.", "vowel_shifts": {"a→ı": 0.1, "ı→i": 0.15, "e→i": 0.15, "o→u": 0.1}, "markers": ["abi", "aynen", "bi şey dicem", "baksana"], "prosody": "düz-orta", "tempo_range": (100, 130), "pitch_range": "neutral" }, "Ege": { "description": "Melodik, uzatmalı, 'gari', 'hee' kültürüne sahip.", "vowel_shifts": {"e→ee": 0.85, "o→oo": 0.75, "a→aa": 0.4}, "markers": ["gari", "hee", "ebe", "söyleyiver"], "prosody": "yavaş-uzatmalı", "tempo_range": (60, 90), "pitch_range": "medium" }, "Akdeniz": { "description": "Hızlı, enerjik, 'la' baskın aksan.", "vowel_shifts": {"a→aa": 0.65, "ı→i": 0.35}, "markers": ["la", "naapıyon la", "hee la"], "prosody": "enerjik-hızlı", "tempo_range": (130, 160), "pitch_range": "high" }, "İç Anadolu": { "description": "Düz ritmik, ı/i kaymaları belirgin.", "vowel_shifts": {"ı→i": 0.7, "a→ı": 0.5, "o→u": 0.3}, "markers": ["gelisen", "gideceksen", "hele bi dur"], "prosody": "düz-ritmik", "tempo_range": (100, 125), "pitch_range": "neutral" }, "Karadeniz": { "description": "Yüksek tonlama, hızlı, ünlü daralması.", "vowel_shifts": {"e→i": 0.9, "ö→u": 0.8, "a→e": 0.3}, "markers": ["ha bu", "da gel daa", "nere gideysin"], "prosody": "yüksek-inişli-çıkışlı", "tempo_range": (120, 150), "pitch_range": "high-oscillating" }, "Doğu Anadolu": { "description": "Ağır tempo, geniş ünlü uzatmaları.", "vowel_shifts": {"ı→i": 0.75, "u→o": 0.65, "a→â": 0.4}, "markers": ["he vallah", "gardaş", "ağabey"], "prosody": "düşük-ağır", "tempo_range": (70, 100), "pitch_range": "low" }, "Güneydoğu Anadolu": { "description": "Ê/Î uzatmaları, uzun vurgu, ağır tempo.", "vowel_shifts": {"a→ê": 0.9, "e→ê": 0.95, "i→î": 0.6}, "markers": ["ê", "hele", "gardaş", "bacı"], "prosody": "uzun-vurgulu-ağır", "tempo_range": (65, 95), "pitch_range": "low-elongated" } } # ========================================= # DIALECT EMBEDDING HELPERS # ========================================= REGION_ALIAS_MAP = { region.lower().replace(" ", ""): region for region in DIALECT_PROFILES.keys() } def _resolve_region_from_name(name: str) -> Optional[str]: key = ( name.lower() .replace("-", "") .replace("_", "") .replace(" ", "") ) if key in REGION_ALIAS_MAP: return REGION_ALIAS_MAP[key] for alias_key, region_name in REGION_ALIAS_MAP.items(): if alias_key in key or key in alias_key: return region_name return None def embed_audio(audio_path: str) -> Optional[np.ndarray]: """ Convert an audio file into a fixed-length embedding vector. """ if embed_model is None or embed_feature_extractor is None: logger.warning("Embedding model unavailable; cannot embed audio.") return None try: audio_data, sr = sf.read(audio_path) if audio_data.ndim > 1: audio_data = np.mean(audio_data, axis=1) if sr != EMBED_SAMPLE_RATE: audio_data = librosa.resample( audio_data, orig_sr=sr, target_sr=EMBED_SAMPLE_RATE ) sr = EMBED_SAMPLE_RATE inputs = embed_feature_extractor( audio_data, sampling_rate=sr, return_tensors="pt" ) inputs = {k: v.to(DEVICE) for k, v in inputs.items()} with torch.no_grad(): outputs = embed_model(**inputs) hidden_states = outputs.last_hidden_state embedding = hidden_states.mean(dim=1).squeeze().cpu().numpy() return embedding except Exception as e: logger.error(f"Audio embedding failed: {e}") return None def load_reference_embeddings() -> Tuple[Dict[str, List[np.ndarray]], Dict[str, np.ndarray]]: """ Load reference embeddings for each dialect region from local wav files. """ # Check if embedding model is available (globally defined) try: if embed_model is None or embed_feature_extractor is None: logger.warning("Embedding model missing; reference embeddings disabled.") return {}, {} except NameError: # embed_model not defined - embedding model disabled logger.debug("Embedding model not defined; reference embeddings disabled.") return {}, {} if not DIALECT_REF_DIR.exists(): logger.warning(f"Dialect reference directory not found: {DIALECT_REF_DIR}") return {}, {} embeddings: Dict[str, List[np.ndarray]] = defaultdict(list) for wav_path in sorted(DIALECT_REF_DIR.glob("*.wav")): region_name = _resolve_region_from_name(wav_path.stem) if not region_name: logger.debug(f"Could not resolve region for reference file {wav_path.name}") continue emb = embed_audio(str(wav_path)) if emb is not None: embeddings[region_name].append(emb) prototypes: Dict[str, np.ndarray] = {} for region_name, vectors in embeddings.items(): if vectors: prototypes[region_name] = np.mean(vectors, axis=0) logger.info(f"Loaded {len(vectors)} reference embeddings for {region_name}") if not prototypes: logger.warning("No dialect reference prototypes were built.") return embeddings, prototypes # Embedding model disabled - reference embeddings not needed DIALECT_REF_EMBEDDINGS: Dict[str, List[np.ndarray]] = {} DIALECT_PROTOTYPES: Dict[str, np.ndarray] = {} logger.debug("Embedding model disabled - skipping reference embeddings loading") # ========================================= # ZERO-SHOT DIALECT CLASSIFIER # ========================================= def cosine_similarity(vec_a: np.ndarray, vec_b: np.ndarray) -> float: denom = (np.linalg.norm(vec_a) * np.linalg.norm(vec_b)) + 1e-10 return float(np.dot(vec_a, vec_b) / denom) def predict_dialect(audio_path: str) -> Tuple[str, Dict[str, float]]: """ Predict dialect region using cosine similarity against reference prototypes. """ if not DIALECT_PROTOTYPES: logger.warning("No dialect prototypes available; returning fallback prediction.") return "Bilinmiyor", {} user_embedding = embed_audio(audio_path) if user_embedding is None: return "Bilinmiyor", {} scores: Dict[str, float] = {} for region_name, prototype_vec in DIALECT_PROTOTYPES.items(): similarity = cosine_similarity(user_embedding, prototype_vec) normalized = max(0.0, min(1.0, (similarity + 1) / 2)) scores[region_name] = round(normalized, 4) if not scores: return "Bilinmiyor", {} predicted_region = max(scores, key=scores.get) return predicted_region, scores def generate_reply_text(region: str) -> str: templates = { "Karadeniz": "Aaa, sen demek Karadenizlisin! Hızlı ritim ve enerjik ton hemen belli ediyor kendini. 🌊", "Doğu Anadolu": "Hmm, Doğu’dan bir hava aldım. Güçlü vurgular ve ağır ritim çok tanıdık. 🏔️", "İç Anadolu": "Sende İç Anadolu’nun sakin ve net konuşması var gibi. Rahat ve dengeli. 🚜", "Ege": "Ege rüzgarı gibi yumuşak tınlıyor sesin; huzur veren bir anlatım. 🌅", "Akdeniz": "Akdeniz’in sıcaklığı ve enerjisi var sesinde, çok hareketli! ☀️", "Marmara": "Oldukça dengeli ve şehirli bir ton; Marmara aksanı hissediliyor. 🌆", "Güneydoğu Anadolu": "Güneydoğu’nun uzun vurguları ve sıcaklığı geliyor sesinden. 🔥", } if region in templates: return templates[region] if region and region != "Bilinmiyor": return f"Sesinde {region} bölgesine benzeyen bir tını var. Çok hoş bir karışım yakalamışsın. 🙂" return "Şive tahmin edemedim ama sesin oldukça ilgi çekici!" def synthesize_elevenlabs( text: str, speaking_rate: Optional[float] = None, pitch: Optional[float] = None ) -> Optional[str]: """ Convert reply text into speech using ElevenLabs. """ if not text: return None if not elevenlabs_client: logger.warning("ElevenLabs client unavailable; cannot synthesize audio.") return None voice_settings: Dict[str, Any] = { "stability": 0.4, "similarity_boost": 0.8, } if speaking_rate is not None: voice_settings["speaking_rate"] = speaking_rate if pitch is not None: voice_settings["pitch"] = pitch try: audio = elevenlabs_client.text_to_speech.convert( voice_id=ELEVENLABS_VOICE_ID, model_id=ELEVENLABS_MODEL_ID, text=text, voice_settings=voice_settings, ) out_path = f"reply_{uuid4().hex}.wav" save(audio, out_path) return out_path except Exception as e: logger.error(f"ElevenLabs synthesis failed: {e}") return None # ========================================= # AUDIO PROCESSING # ========================================= def process_audio(audio_data: np.ndarray, sample_rate: int) -> Tuple[np.ndarray, int]: """ Process audio: convert to mono, normalize, resample if needed. Args: audio_data: Audio signal as numpy array sample_rate: Original sample rate Returns: Processed audio data and sample rate """ try: # Convert stereo to mono if needed if len(audio_data.shape) > 1: audio_data = librosa.to_mono(audio_data) logger.info("Converted stereo to mono") # Resample to target rate if needed if sample_rate != TARGET_SAMPLE_RATE: audio_data = librosa.resample( audio_data, orig_sr=sample_rate, target_sr=TARGET_SAMPLE_RATE ) sample_rate = TARGET_SAMPLE_RATE logger.info(f"Resampled to {TARGET_SAMPLE_RATE} Hz") # Normalize audio audio_data = librosa.util.normalize(audio_data) return audio_data, sample_rate except Exception as e: logger.error(f"Error processing audio: {e}") raise ValueError(f"Ses işleme hatası: {e}") def validate_audio(audio_data: np.ndarray, sample_rate: int) -> None: """ Validate audio duration and quality. Args: audio_data: Audio signal sample_rate: Sample rate Raises: ValueError: If audio is invalid """ duration = len(audio_data) / sample_rate if duration < MIN_AUDIO_DURATION: raise ValueError( f"Ses süresi en az {MIN_AUDIO_DURATION} saniye olmalı. " f"Mevcut süre: {duration:.2f} saniye." ) if len(audio_data) == 0: raise ValueError("Ses verisi boş.") # ========================================= # ASR CORE # ========================================= def run_asr(audio_data: np.ndarray, sample_rate: int) -> str: """ Run Whisper ASR on audio. Args: audio_data: Processed audio signal sample_rate: Sample rate Returns: Transcription text """ try: # Ensure audio is float32 (Whisper expects fp32 input) audio_float = audio_data.astype(np.float32) inputs = processor( audio_float, sampling_rate=sample_rate, return_tensors="pt" ) # Move to device and cast to target dtype (fp16 on GPU, fp32 on CPU) input_features = inputs.input_features.to(device=DEVICE, dtype=DTYPE) with torch.no_grad(): generated_ids = model.generate( input_features, max_length=400, language="tr", task="transcribe" ) hypothesis = processor.batch_decode( generated_ids, skip_special_tokens=True )[0] logger.info(f"ASR output: {hypothesis}") return hypothesis except Exception as e: logger.error(f"ASR error: {e}") raise ValueError(f"Konuşma tanıma hatası: {e}") # ========================================= # DIALECT ANALYSIS # ========================================= def vowel_shift_score(transcription: str, profile: Dict[str, Any]) -> float: """ Score vowel shifts in transcription. Enhanced scoring based on phonetic patterns. Args: transcription: ASR transcription profile: Dialect profile Returns: Vowel shift score [0, 1] """ transcription_lower = transcription.lower() shifts = profile["vowel_shifts"] total_weight = sum(shifts.values()) if total_weight == 0: return 0.0 score = 0.0 text_length = len(transcription_lower) for shift_pattern, weight in shifts.items(): if "→" not in shift_pattern: continue source, target = shift_pattern.split("→") # Count occurrences of target vowel/diphthong # For elongated vowels (ee, oo, aa), look for repeated patterns if len(target) > 1 and target[0] == target[1]: # Look for elongated vowels pattern = target[0] * 2 count = transcription_lower.count(pattern) # Also check for common elongated patterns in Turkish count += transcription_lower.count(target[0] + "ğ") count += transcription_lower.count(target[0] + "y") else: count = transcription_lower.count(target) # Normalize by text length normalized_count = count / max(text_length, 1) * 100 score += normalized_count * weight # Normalize by total weight normalized_score = score / (total_weight * 10 + 1e-6) return min(normalized_score, 1.0) def marker_score(transcription: str, profile: Dict[str, Any]) -> float: """ Score lexical markers in transcription. Args: transcription: ASR transcription profile: Dialect profile Returns: Marker score [0, 1] """ transcription_lower = transcription.lower() markers = profile["markers"] if not markers: return 0.0 matches = sum(1 for marker in markers if marker in transcription_lower) # Score based on proportion of markers found score = matches / len(markers) # Bonus for multiple occurrences total_occurrences = sum(transcription_lower.count(marker) for marker in markers) if total_occurrences > len(markers): score = min(score * 1.2, 1.0) return score def prosody_score( audio_data: np.ndarray, sample_rate: int, profile: Dict[str, Any] ) -> float: """ Analyze prosody: tempo, pitch characteristics. Args: audio_data: Audio signal sample_rate: Sample rate profile: Dialect profile Returns: Prosody score [0, 1] """ try: # Normalize audio audio_normalized = librosa.util.normalize(audio_data) # Tempo analysis tempo = float(librosa.beat.tempo(y=audio_normalized, sr=sample_rate)[0]) # Pitch analysis (fundamental frequency) pitches, magnitudes = librosa.piptrack( y=audio_normalized, sr=sample_rate, threshold=0.1 ) # Get pitch statistics pitch_values = [] for t in range(pitches.shape[1]): index = magnitudes[:, t].argmax() pitch = pitches[index, t] if pitch > 0: pitch_values.append(pitch) avg_pitch = np.mean(pitch_values) if pitch_values else 0.0 pitch_std = np.std(pitch_values) if len(pitch_values) > 1 else 0.0 # Score based on profile characteristics prosody_type = profile["prosody"] tempo_range = profile.get("tempo_range", (80, 120)) pitch_range_type = profile.get("pitch_range", "neutral") # Tempo scoring tempo_min, tempo_max = tempo_range if tempo_min <= tempo <= tempo_max: tempo_score = 1.0 else: # Distance from range if tempo < tempo_min: tempo_score = max(0.0, tempo / tempo_min) else: tempo_score = max(0.0, 1.0 - (tempo - tempo_max) / tempo_max) # Pitch scoring based on profile pitch_score = 0.5 # default if pitch_range_type == "high" or pitch_range_type == "high-oscillating": if avg_pitch > 200: pitch_score = 1.0 elif avg_pitch > 150: pitch_score = 0.7 elif pitch_range_type == "low" or pitch_range_type == "low-elongated": if avg_pitch < 150: pitch_score = 1.0 elif avg_pitch < 200: pitch_score = 0.7 else: # neutral if 150 <= avg_pitch <= 250: pitch_score = 1.0 # Oscillation scoring (for Karadeniz) oscillation_score = 0.5 if "oscillating" in pitch_range_type or "inişli-çıkışlı" in prosody_type: if pitch_std > 50: oscillation_score = 1.0 elif pitch_std > 30: oscillation_score = 0.7 # Combine scores if "oscillating" in pitch_range_type or "inişli-çıkışlı" in prosody_type: final_score = (tempo_score * 0.4 + pitch_score * 0.3 + oscillation_score * 0.3) else: final_score = (tempo_score * 0.6 + pitch_score * 0.4) return min(final_score, 1.0) except Exception as e: logger.warning(f"Prosody analysis error: {e}") return 0.5 # Default neutral score def dialect_similarity( transcription: str, audio_data: np.ndarray, sample_rate: int ) -> Tuple[Dict[str, float], List[Tuple[str, float]]]: """ Calculate dialect similarity scores for all regions. Args: transcription: ASR transcription audio_data: Audio signal sample_rate: Sample rate Returns: Dictionary of scores and sorted predictions """ scores: Dict[str, float] = {} for region, profile in DIALECT_PROFILES.items(): try: vowel_score = vowel_shift_score(transcription, profile) marker_score_val = marker_score(transcription, profile) prosody_score_val = prosody_score(audio_data, sample_rate, profile) # Weighted combination combined_score = ( vowel_score * VOWEL_SHIFT_WEIGHT + marker_score_val * MARKER_WEIGHT + prosody_score_val * PROSODY_WEIGHT ) scores[region] = round(combined_score, 3) logger.info( f"{region}: vowel={vowel_score:.3f}, " f"marker={marker_score_val:.3f}, " f"prosody={prosody_score_val:.3f}, " f"combined={combined_score:.3f}" ) except Exception as e: logger.error(f"Error calculating score for {region}: {e}") scores[region] = 0.0 # Sort by score sorted_predictions = sorted( scores.items(), key=lambda x: x[1], reverse=True ) return scores, sorted_predictions # ========================================= # VISUALIZATION # ========================================= def plot_region_heatmap( scores: Dict[str, float], highlight_region: Optional[str] = None ) -> go.Figure: """ Create an interactive choropleth-style region heatmap for Türkiye dialect scores. """ try: if not scores: raise ValueError("Score verisi yok") df = pd.DataFrame({ "region_name": list(scores.keys()), "score": list(scores.values()), }) min_score = float(df["score"].min()) max_score = float(df["score"].max()) if min_score == max_score: max_score = min_score + 0.01 fig = px.choropleth_mapbox( df, geojson=TURKEY_REGIONS_GEOJSON, locations="region_name", featureidkey="properties.name", color="score", color_continuous_scale="OrRd", range_color=(min_score, max_score), mapbox_style="carto-positron", zoom=4.5, center={"lat": 39.0, "lon": 35.0}, opacity=0.7, labels={"score": "Benzerlik"}, ) fig.update_traces(marker_line_width=0.5, marker_line_color="white") if highlight_region and highlight_region in df["region_name"].values: highlight_df = df[df["region_name"] == highlight_region] fig.add_choroplethmapbox( geojson=TURKEY_REGIONS_GEOJSON, locations=highlight_df["region_name"], z=np.ones(len(highlight_df)), featureidkey="properties.name", colorscale=[[0, "rgba(0,0,0,0)"], [1, "rgba(0,0,0,0)"]], showscale=False, marker_opacity=0, marker_line_width=3, marker_line_color="black", hovertext=highlight_df["region_name"], name="Tahmin", ) fig.add_annotation( text=f"🗣 Tahmin: {highlight_region}", x=0.5, y=0.02, xref="paper", yref="paper", showarrow=False, bgcolor="white", bordercolor="black", borderwidth=1, font=dict(size=14), ) fig.update_layout( margin=dict(l=10, r=10, t=40, b=10), height=600, coloraxis_colorbar=dict(title="Benzerlik"), ) return fig except Exception as e: logger.error(f"Error creating heatmap: {e}") fig = go.Figure() fig.update_layout( title="Harita yüklenemedi", height=600 ) return fig # ========================================= # MAIN PIPELINE # ========================================= def analyze_and_reply( audio_path: Optional[str] ) -> Tuple[str, str, str, Optional[str], go.Figure]: """ Full processing pipeline: audio → ASR → dialect analysis → TTS reply. """ def build_empty_fig(title: str = "Harita yüklenemedi") -> go.Figure: fig = go.Figure() fig.update_layout(title=title, height=600) return fig logger.info(f"Received audio_path: {audio_path}") if audio_path is None: logger.warning("Audio input is None.") empty_fig = build_empty_fig() return "Ses alınamadı. Lütfen tekrar deneyin.", "", "", None, empty_fig # Check if file exists if not os.path.exists(audio_path): logger.error(f"Audio file does not exist: {audio_path}") empty_fig = build_empty_fig() return f"Ses dosyası bulunamadı: {audio_path}", "", "", None, empty_fig try: logger.info(f"Reading audio file: {audio_path}") audio_data, sample_rate = sf.read(audio_path) logger.info(f"Audio file read successfully. Duration: {len(audio_data)/sample_rate:.2f}s, Sample rate: {sample_rate}Hz") if audio_data.ndim > 1: audio_data = audio_data.T audio_data = librosa.to_mono(audio_data) audio_data = np.asarray(audio_data, dtype=np.float32) except Exception as e: logger.error(f"Error reading audio file: {e}") empty_fig = build_empty_fig() return f"Ses dosyası okunamadı: {e}", "", "", None, empty_fig try: processed_audio, processed_sr = process_audio(audio_data, sample_rate) validate_audio(processed_audio, processed_sr) except ValueError as e: logger.error(f"Audio validation error: {e}") empty_fig = build_empty_fig() return str(e), "", "", None, empty_fig try: transcript = run_asr(processed_audio, processed_sr) logger.info(f"ASR transcript: {transcript}") except ValueError as e: logger.error(f"ASR error: {e}") empty_fig = build_empty_fig() return str(e), "", "", None, empty_fig # Use transcription-based dialect similarity analysis similarity_scores, sorted_predictions = dialect_similarity( transcript, processed_audio, processed_sr ) # Also try embedding-based prediction as fallback embedding_region, embedding_scores = predict_dialect(audio_path) # Always use transcription-based prediction if available (it should always work) if similarity_scores and sorted_predictions and len(sorted_predictions) > 0: # Use transcription-based prediction predicted_region = sorted_predictions[0][0] scores = similarity_scores top_score = sorted_predictions[0][1] logger.info(f"Using transcription-based prediction: {predicted_region} (score: {top_score:.4f})") # Log top 3 predictions for debugging if len(sorted_predictions) >= 3: logger.info(f"Top 3 predictions: {[(r, f'{s:.4f}') for r, s in sorted_predictions[:3]]}") elif embedding_scores and embedding_region != "Bilinmiyor" and max(embedding_scores.values()) > 0.01: # Fallback to embedding-based predicted_region = embedding_region scores = embedding_scores logger.info(f"Using embedding-based prediction: {predicted_region} (score: {max(embedding_scores.values()):.4f})") else: # Last resort: ensure we always return a region if similarity_scores and sorted_predictions and len(sorted_predictions) > 0: predicted_region = sorted_predictions[0][0] scores = similarity_scores logger.warning(f"Using transcription-based with low scores: {predicted_region} (score: {sorted_predictions[0][1]:.4f})") elif similarity_scores: # Use first region from scores even if sorted_predictions is empty predicted_region = max(similarity_scores, key=similarity_scores.get) scores = similarity_scores logger.warning(f"Using first region from scores: {predicted_region}") else: # Absolute last resort: use first region from DIALECT_PROFILES predicted_region = list(DIALECT_PROFILES.keys())[0] if DIALECT_PROFILES else "Bilinmiyor" scores = {region: 0.1 for region in DIALECT_PROFILES.keys()} if DIALECT_PROFILES else {} logger.error(f"All prediction methods failed, using fallback: {predicted_region}") reply_text = generate_reply_text(predicted_region) reply_audio_path = synthesize_elevenlabs(reply_text) or None heatmap_fig = plot_region_heatmap(scores, highlight_region=predicted_region if scores else None) return ( transcript, predicted_region, reply_text, reply_audio_path, heatmap_fig ) # ========================================= # UI — Ultra Modern Apple Glassmorphism Design # ========================================= CSS = """ * { box-sizing: border-box; margin: 0; padding: 0; } @keyframes float { 0%, 100% { transform: translateY(0px); } 50% { transform: translateY(-10px); } } @keyframes shimmer { 0% { background-position: -1000px 0; } 100% { background-position: 1000px 0; } } @keyframes pulse { 0%, 100% { opacity: 1; } 50% { opacity: 0.7; } } body { background: radial-gradient(circle at 20% 50%, rgba(120, 119, 198, 0.15) 0%, transparent 50%), radial-gradient(circle at 80% 80%, rgba(255, 119, 198, 0.1) 0%, transparent 50%), radial-gradient(circle at 40% 20%, rgba(99, 102, 241, 0.1) 0%, transparent 50%), linear-gradient(135deg, #F8F9FA 0%, #E9ECEF 50%, #F1F3F5 100%) !important; font-family: -apple-system, BlinkMacSystemFont, "SF Pro Display", "SF Pro Text", "Segoe UI", sans-serif; -webkit-font-smoothing: antialiased; -moz-osx-font-smoothing: grayscale; min-height: 100vh; position: relative; overflow-x: hidden; } body::before { content: ''; position: fixed; top: 0; left: 0; right: 0; bottom: 0; background: radial-gradient(circle at 20% 30%, rgba(99, 102, 241, 0.08) 0%, transparent 50%), radial-gradient(circle at 80% 70%, rgba(168, 85, 247, 0.06) 0%, transparent 50%); pointer-events: none; z-index: 0; } .gradio-container { background: transparent !important; max-width: 1500px !important; margin: 0 auto !important; padding: 60px 30px !important; position: relative; z-index: 1; } h1 { font-weight: 800 !important; letter-spacing: -2.5px !important; color: #1D1D1F !important; margin: 0 !important; background: linear-gradient(135deg, #1D1D1F 0%, #4A5568 50%, #1D1D1F 100%) !important; background-size: 200% auto !important; -webkit-background-clip: text !important; -webkit-text-fill-color: #1D1D1F !important; background-clip: text !important; animation: shimmer 3s linear infinite !important; opacity: 1 !important; z-index: 10 !important; position: relative !important; visibility: visible !important; } .card { background: rgba(255, 255, 255, 0.85) !important; backdrop-filter: blur(30px) saturate(180%) !important; -webkit-backdrop-filter: blur(30px) saturate(180%) !important; padding: 28px !important; border-radius: 20px !important; border: 1px solid rgba(0, 0, 0, 0.08) !important; margin-bottom: 20px !important; box-shadow: 0 8px 32px rgba(0, 0, 0, 0.06), 0 4px 16px rgba(0, 0, 0, 0.04), 0 2px 8px rgba(0, 0, 0, 0.03), inset 0 1px 0 rgba(255, 255, 255, 0.9) !important; transition: all 0.3s cubic-bezier(0.4, 0, 0.2, 1) !important; position: relative; overflow: hidden; } .card::before { content: ''; position: absolute; top: 0; left: -100%; width: 100%; height: 100%; background: linear-gradient(90deg, transparent, rgba(255, 255, 255, 0.4), transparent); transition: left 0.5s; } .card:hover::before { left: 100%; } .card:hover { transform: translateY(-4px) scale(1.01) !important; box-shadow: 0 28px 80px rgba(0, 0, 0, 0.12), 0 12px 32px rgba(0, 0, 0, 0.08), 0 4px 12px rgba(0, 0, 0, 0.06), inset 0 1px 0 rgba(255, 255, 255, 1), inset 0 -1px 0 rgba(255, 255, 255, 0.6) !important; border-color: rgba(255, 255, 255, 1) !important; } .label { font-weight: 700 !important; color: #1D1D1F !important; margin-bottom: 14px !important; font-size: 15px !important; letter-spacing: -0.3px !important; text-transform: uppercase; font-size: 12px; opacity: 0.8; } .textbox textarea, .textbox input, .dropdown select { background: rgba(255, 255, 255, 0.95) !important; backdrop-filter: blur(20px) saturate(180%) !important; -webkit-backdrop-filter: blur(20px) saturate(180%) !important; border: 1.5px solid rgba(0, 0, 0, 0.06) !important; border-radius: 16px !important; color: #1D1D1F !important; padding: 16px 20px !important; font-size: 15px !important; font-weight: 500 !important; transition: all 0.3s cubic-bezier(0.4, 0, 0.2, 1) !important; box-shadow: 0 4px 12px rgba(0, 0, 0, 0.04), inset 0 1px 2px rgba(0, 0, 0, 0.02) !important; } .textbox:focus-within, .dropdown:focus-within { border-color: #007AFF !important; box-shadow: 0 8px 24px rgba(0, 122, 255, 0.2), 0 4px 12px rgba(0, 122, 255, 0.15), inset 0 1px 2px rgba(0, 122, 255, 0.1) !important; transform: translateY(-1px); } button.primary { background: linear-gradient(135deg, #007AFF 0%, #0051D5 50%, #007AFF 100%) !important; background-size: 200% auto !important; border: none !important; border-radius: 18px !important; font-weight: 700 !important; padding: 18px 40px !important; font-size: 17px !important; color: white !important; letter-spacing: -0.2px !important; box-shadow: 0 8px 24px rgba(0, 122, 255, 0.4), 0 4px 12px rgba(0, 122, 255, 0.3), inset 0 1px 0 rgba(255, 255, 255, 0.3), inset 0 -1px 0 rgba(0, 0, 0, 0.1) !important; transition: all 0.3s cubic-bezier(0.34, 1.56, 0.64, 1) !important; cursor: pointer !important; position: relative; overflow: hidden; text-transform: none !important; } button.primary::before { content: ''; position: absolute; top: 0; left: -100%; width: 100%; height: 100%; background: linear-gradient(90deg, transparent, rgba(255, 255, 255, 0.3), transparent); transition: left 0.5s; } button.primary:hover::before { left: 100%; } button.primary:hover { transform: translateY(-3px) scale(1.02) !important; box-shadow: 0 12px 32px rgba(0, 122, 255, 0.5), 0 6px 16px rgba(0, 122, 255, 0.4), inset 0 1px 0 rgba(255, 255, 255, 0.4), inset 0 -1px 0 rgba(0, 0, 0, 0.15) !important; background-position: right center !important; } button.primary:active { transform: translateY(-1px) scale(1.01) !important; box-shadow: 0 4px 16px rgba(0, 122, 255, 0.4), inset 0 1px 0 rgba(255, 255, 255, 0.2) !important; } .json { font-family: "SF Mono", "Monaco", "Menlo", "Courier New", monospace !important; font-size: 13px !important; background: rgba(248, 249, 250, 0.9) !important; backdrop-filter: blur(20px) saturate(180%) !important; -webkit-backdrop-filter: blur(20px) saturate(180%) !important; border: 1px solid rgba(0, 0, 0, 0.05) !important; border-radius: 16px !important; padding: 24px !important; color: #1D1D1F !important; line-height: 1.7 !important; box-shadow: inset 0 2px 8px rgba(0, 0, 0, 0.03), inset 0 1px 2px rgba(0, 0, 0, 0.02) !important; } .markdown { color: #1D1D1F !important; } .markdown * { visibility: visible !important; opacity: 1 !important; display: block !important; } .markdown div { display: block !important; visibility: visible !important; opacity: 1 !important; color: inherit !important; } .markdown h1, .header-markdown h1, .main-title { color: #1D1D1F !important; margin-bottom: 16px !important; margin-top: 50px !important; font-size: 3.5rem !important; font-weight: 800 !important; letter-spacing: -2px !important; line-height: 1.2 !important; text-shadow: 0 2px 8px rgba(0,0,0,0.1) !important; -webkit-text-fill-color: #1D1D1F !important; background: none !important; background-image: none !important; opacity: 1 !important; z-index: 10 !important; position: relative !important; visibility: visible !important; display: block !important; text-align: center !important; } .header-markdown { text-align: center !important; } .header-markdown p { color: #6E6E73 !important; font-size: 1.15rem !important; margin-top: 8px !important; opacity: 0.9 !important; } .markdown h1 span { color: #1D1D1F !important; -webkit-text-fill-color: #1D1D1F !important; background: none !important; display: inline-block !important; } .markdown p { display: block !important; visibility: visible !important; opacity: 1 !important; color: #6E6E73 !important; margin: 0 !important; } .instruction-text { display: block !important; visibility: visible !important; text-align: center !important; margin-top: -20px !important; margin-bottom: 40px !important; color: #6E6E73 !important; font-size: 1.1rem !important; opacity: 0.9 !important; padding: 0 20px !important; } .instruction-text p { display: block !important; visibility: visible !important; opacity: 1 !important; color: #6E6E73 !important; margin: 0 !important; } .header-container { display: block !important; visibility: visible !important; opacity: 1 !important; } .header-container h1 { display: block !important; visibility: visible !important; opacity: 1 !important; } .header-container p { display: block !important; visibility: visible !important; opacity: 1 !important; } /* HTML component styles */ .html-component, .html-component * { display: block !important; visibility: visible !important; opacity: 1 !important; } .html-component h1, .html-component .main-title { color: #1D1D1F !important; -webkit-text-fill-color: #1D1D1F !important; background: none !important; background-image: none !important; display: block !important; } .html-component p { display: block !important; visibility: visible !important; color: #6E6E73 !important; } .markdown p { color: #6E6E73; opacity: 0.95; font-size: 1.15rem; font-weight: 400; line-height: 1.6; letter-spacing: -0.2px; } .audio-component { background: rgba(255, 255, 255, 0.95) !important; backdrop-filter: blur(30px) saturate(200%) !important; -webkit-backdrop-filter: blur(30px) saturate(200%) !important; border-radius: 20px !important; border: 1.5px solid rgba(255, 255, 255, 0.8) !important; padding: 20px !important; box-shadow: 0 8px 24px rgba(0, 0, 0, 0.06), 0 4px 12px rgba(0, 0, 0, 0.04), inset 0 1px 0 rgba(255, 255, 255, 0.9) !important; transition: all 0.3s ease !important; } .audio-component:hover { box-shadow: 0 12px 32px rgba(0, 0, 0, 0.08), 0 6px 16px rgba(0, 0, 0, 0.06), inset 0 1px 0 rgba(255, 255, 255, 1) !important; } /* Ultra smooth scrollbar */ ::-webkit-scrollbar { width: 10px; height: 10px; } ::-webkit-scrollbar-track { background: rgba(0, 0, 0, 0.02); border-radius: 10px; } ::-webkit-scrollbar-thumb { background: linear-gradient(135deg, rgba(0, 122, 255, 0.3), rgba(0, 81, 213, 0.4)); border-radius: 10px; border: 2px solid transparent; background-clip: padding-box; } ::-webkit-scrollbar-thumb:hover { background: linear-gradient(135deg, rgba(0, 122, 255, 0.5), rgba(0, 81, 213, 0.6)); background-clip: padding-box; } /* Loading animation */ @keyframes spin { from { transform: rotate(0deg); } to { transform: rotate(360deg); } } /* Enhanced focus states */ *:focus-visible { outline: 2px solid #007AFF; outline-offset: 2px; border-radius: 4px; } """ def build_ui() -> gr.Blocks: """ Build Gradio UI with Apple minimal white + smooth glass design. Returns: Gradio Blocks interface """ with gr.Blocks( css=CSS, fill_height=True, theme=gr.themes.Soft() ) as demo: gr.Markdown( """ # 🇹🇷 Dialect Intelligence Engine Powered by Meta Omnilingual ASR & Whisper Large-v3 """, elem_classes="header-markdown" ) gr.Markdown( """ Mikrofona bas, doğal bir şekilde konuş. Sistem şiveni analiz edip seni haritada işaretlesin ve AI sesiyle cevap versin. """, elem_classes="instruction-text" ) with gr.Row(equal_height=False): with gr.Column(scale=1, min_width=400): audio_input = gr.Audio( sources=["microphone", "upload"], type="filepath", label="🎤 Mikrofona basın, konuşun, kaydı durdurun", show_label=True, interactive=True, elem_classes="card" ) analyze_button = gr.Button( "🔍 Analiz Et ve Şive Tahmini Yap", variant="primary", elem_classes="primary", visible=True, scale=1 ) gr.Markdown( "📝 Ses kaydını tamamladıktan sonra butona tıklayın", elem_classes="instruction-text" ) with gr.Column(scale=2, min_width=600): transcript_output = gr.Textbox( label="Transcript", lines=4, interactive=False, placeholder="Konuşmanı bekliyorum...", elem_classes="card" ) with gr.Row(): predicted_dialect = gr.Textbox( label="Tahmin Edilen Bölge", interactive=False, lines=2, elem_classes="card" ) reply_text_output = gr.Textbox( label="Model Cevabı (Metin)", interactive=False, lines=2, elem_classes="card" ) reply_audio_output = gr.Audio( label="Model Cevabı (Ses)", type="filepath", interactive=False, autoplay=True, elem_classes="card" ) region_map = gr.Plot( label="Bölgesel Harita Isı Dağılımı", elem_classes="card" ) def build_empty_fig_ui(): """Build empty figure for UI""" fig = go.Figure() fig.update_layout(title="Harita yüklenemedi", height=600) return fig def analyze_and_reply_with_autoplay(audio_path): """Wrapper to ensure audio autoplays after generation""" logger.info(f"analyze_and_reply_with_autoplay called with audio_path: {audio_path}") if audio_path is None: logger.warning("audio_path is None in wrapper") empty_fig = build_empty_fig_ui() return "", "", "", None, empty_fig result = analyze_and_reply(audio_path) # Return result - Gradio will handle autoplay if autoplay=True is set return result # Both button click and audio change trigger analysis analyze_button.click( fn=analyze_and_reply_with_autoplay, inputs=audio_input, outputs=[ transcript_output, predicted_dialect, reply_text_output, reply_audio_output, region_map ] ) # Also trigger on change (for file uploads and when recording stops) audio_input.change( fn=analyze_and_reply_with_autoplay, inputs=audio_input, outputs=[ transcript_output, predicted_dialect, reply_text_output, reply_audio_output, region_map ] ) # Add JavaScript for autoplay demo.load( fn=None, js=""" function() { // Auto-play audio when it's updated const observer = new MutationObserver(function(mutations) { mutations.forEach(function(mutation) { mutation.addedNodes.forEach(function(node) { if (node.nodeType === 1) { const audio = node.querySelector('audio'); if (audio && audio.src && !audio.hasAttribute('data-autoplayed')) { audio.setAttribute('data-autoplayed', 'true'); audio.play().catch(e => console.log('Autoplay prevented:', e)); } } }); }); }); observer.observe(document.body, { childList: true, subtree: true }); } """ ) # Auto-play audio when it's generated using JavaScript callback reply_audio_output.change( fn=None, inputs=None, outputs=None, js=""" function() { setTimeout(function() { // Find the audio element by looking for the reply audio component const labels = Array.from(document.querySelectorAll('label')); const replyLabel = labels.find(label => label.textContent && label.textContent.includes('Model Cevabı (Ses)') ); if (replyLabel) { const audioContainer = replyLabel.closest('.card') || replyLabel.parentElement; const audioElement = audioContainer ? audioContainer.querySelector('audio') : null; if (audioElement && audioElement.src) { // Reset and play audioElement.currentTime = 0; const playPromise = audioElement.play(); if (playPromise !== undefined) { playPromise.catch(function(error) { console.log('Autoplay prevented by browser:', error); }); } } } }, 800); // Wait for audio to be fully loaded return []; } """ ) return demo # ========================================= # MAIN # ========================================= demo = build_ui() if __name__ == "__main__": demo.launch( server_name="0.0.0.0", server_port=7860, share=False, ssr_mode=False # Fix for HF Spaces microphone bug )