# =========================================
# ENV FIXES
# =========================================
import os
os.environ["OMP_NUM_THREADS"] = "1" # libgomp hatası fix
# =========================================
# IMPORTS
# =========================================
import logging
from pathlib import Path
from typing import Dict, Tuple, List, Optional, Any
from collections import defaultdict
from uuid import uuid4
import numpy as np
import librosa
import soundfile as sf
import torch
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq, AutoFeatureExtractor, AutoModel
import plotly.graph_objects as go
import plotly.express as px
import pandas as pd
import gradio as gr
from elevenlabs import ElevenLabs, save
from regions_geojson import TURKEY_REGIONS_GEOJSON
# =========================================
# LOGGING SETUP
# =========================================
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
)
logger = logging.getLogger(__name__)
# =========================================
# CONSTANTS
# =========================================
MODEL_ID = "openai/whisper-large-v3"
MIN_AUDIO_DURATION = 3.0 # seconds
VOWEL_SHIFT_WEIGHT = 0.35
MARKER_WEIGHT = 0.40
PROSODY_WEIGHT = 0.25
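# These weights are combined in dialect_similarity(); they sum to 1.0, so the
# combined score stays in [0, 1] when each component score is in [0, 1].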
FAST_TEMPO_THRESHOLD = 140.0  # BPM; not referenced elsewhere in this file
SLOW_TEMPO_THRESHOLD = 80.0  # BPM; not referenced elsewhere in this file
TARGET_SAMPLE_RATE = 16000  # Whisper expects 16 kHz input
EMBED_MODEL_ID = "facebook/wav2vec2-large-xlsr-53"  # Turkish fine-tuning is detected automatically
EMBED_SAMPLE_RATE = 16000
DIALECT_REF_DIR = Path("data/dialects")
ELEVENLABS_VOICE_ID = "Q5n6GDIjpN0pLOlycRFT"
ELEVENLABS_MODEL_ID = "eleven_multilingual_v2"
# =========================================
# DEVICE CONFIGURATION
# =========================================
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
DTYPE = torch.float16 if DEVICE == "cuda" else torch.float32
logger.info(f"Using device: {DEVICE}, dtype: {DTYPE}")
# =========================================
# MODEL INITIALIZATION
# =========================================
try:
processor = AutoProcessor.from_pretrained(MODEL_ID)
model = AutoModelForSpeechSeq2Seq.from_pretrained(
MODEL_ID,
torch_dtype=DTYPE
)
model = model.to(DEVICE)
model.eval()
logger.info("Model loaded successfully")
except Exception as e:
logger.error(f"Error loading model: {e}")
raise
# =========================================
# EMBEDDING MODEL INITIALIZATION
# Note: Embedding model is disabled - we use transcription-based dialect analysis instead
# =========================================
embed_feature_extractor = None
embed_model = None
logger.debug("Embedding model disabled - using transcription-based analysis only")
DIALECT_REF_EMBEDDINGS: Dict[str, List[np.ndarray]] = defaultdict(list)
DIALECT_PROTOTYPES: Dict[str, np.ndarray] = {}
# =========================================
# ELEVENLABS CLIENT
# =========================================
ELEVENLABS_API_KEY = os.environ.get("ELEVENLABS_API_KEY")
if ELEVENLABS_API_KEY:
try:
elevenlabs_client = ElevenLabs(api_key=ELEVENLABS_API_KEY)
logger.info("ElevenLabs client initialized")
except Exception as e:
elevenlabs_client = None
logger.warning(f"Failed to initialize ElevenLabs client: {e}")
else:
elevenlabs_client = None
logger.warning("ELEVENLABS_API_KEY not found. Voice replies will be disabled.")
# =========================================
# DIALECT PROFILES
# =========================================
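# Each profile is a heuristic feature template used by the transcription-based analysis:
#   vowel_shifts: characteristic vowel changes with rough weights in [0, 1]
#   markers:      region-typical lexical items searched for in the transcript
#   prosody:      qualitative prosody label (checked for "inişli-çıkışlı" oscillation)
#   tempo_range:  expected tempo estimate in BPM (from librosa)
#   pitch_range:  qualitative F0 category used by prosody_score()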
DIALECT_PROFILES: Dict[str, Dict[str, Any]] = {
"Marmara": {
"description": "İstanbul-Trakya şehir aksanı, düz prosodi.",
"vowel_shifts": {"a→ı": 0.1, "ı→i": 0.15, "e→i": 0.15, "o→u": 0.1},
"markers": ["abi", "aynen", "bi şey dicem", "baksana"],
"prosody": "düz-orta",
"tempo_range": (100, 130),
"pitch_range": "neutral"
},
"Ege": {
"description": "Melodik, uzatmalı, 'gari', 'hee' kültürüne sahip.",
"vowel_shifts": {"e→ee": 0.85, "o→oo": 0.75, "a→aa": 0.4},
"markers": ["gari", "hee", "ebe", "söyleyiver"],
"prosody": "yavaş-uzatmalı",
"tempo_range": (60, 90),
"pitch_range": "medium"
},
"Akdeniz": {
"description": "Hızlı, enerjik, 'la' baskın aksan.",
"vowel_shifts": {"a→aa": 0.65, "ı→i": 0.35},
"markers": ["la", "naapıyon la", "hee la"],
"prosody": "enerjik-hızlı",
"tempo_range": (130, 160),
"pitch_range": "high"
},
"İç Anadolu": {
"description": "Düz ritmik, ı/i kaymaları belirgin.",
"vowel_shifts": {"ı→i": 0.7, "a→ı": 0.5, "o→u": 0.3},
"markers": ["gelisen", "gideceksen", "hele bi dur"],
"prosody": "düz-ritmik",
"tempo_range": (100, 125),
"pitch_range": "neutral"
},
"Karadeniz": {
"description": "Yüksek tonlama, hızlı, ünlü daralması.",
"vowel_shifts": {"e→i": 0.9, "ö→u": 0.8, "a→e": 0.3},
"markers": ["ha bu", "da gel daa", "nere gideysin"],
"prosody": "yüksek-inişli-çıkışlı",
"tempo_range": (120, 150),
"pitch_range": "high-oscillating"
},
"Doğu Anadolu": {
"description": "Ağır tempo, geniş ünlü uzatmaları.",
"vowel_shifts": {"ı→i": 0.75, "u→o": 0.65, "a→â": 0.4},
"markers": ["he vallah", "gardaş", "ağabey"],
"prosody": "düşük-ağır",
"tempo_range": (70, 100),
"pitch_range": "low"
},
"Güneydoğu Anadolu": {
"description": "Ê/Î uzatmaları, uzun vurgu, ağır tempo.",
"vowel_shifts": {"a→ê": 0.9, "e→ê": 0.95, "i→î": 0.6},
"markers": ["ê", "hele", "gardaş", "bacı"],
"prosody": "uzun-vurgulu-ağır",
"tempo_range": (65, 95),
"pitch_range": "low-elongated"
}
}
# =========================================
# DIALECT EMBEDDING HELPERS
# =========================================
REGION_ALIAS_MAP = {
region.lower().replace(" ", ""): region
for region in DIALECT_PROFILES.keys()
}
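# Maps normalized region names (lowercased, spaces stripped) back to their canonical
# form so reference wav filenames can be resolved in _resolve_region_from_name().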
def _resolve_region_from_name(name: str) -> Optional[str]:
key = (
name.lower()
.replace("-", "")
.replace("_", "")
.replace(" ", "")
)
if key in REGION_ALIAS_MAP:
return REGION_ALIAS_MAP[key]
for alias_key, region_name in REGION_ALIAS_MAP.items():
if alias_key in key or key in alias_key:
return region_name
return None
def embed_audio(audio_path: str) -> Optional[np.ndarray]:
"""
Convert an audio file into a fixed-length embedding vector.
"""
if embed_model is None or embed_feature_extractor is None:
logger.warning("Embedding model unavailable; cannot embed audio.")
return None
try:
audio_data, sr = sf.read(audio_path)
if audio_data.ndim > 1:
audio_data = np.mean(audio_data, axis=1)
if sr != EMBED_SAMPLE_RATE:
audio_data = librosa.resample(
audio_data,
orig_sr=sr,
target_sr=EMBED_SAMPLE_RATE
)
sr = EMBED_SAMPLE_RATE
inputs = embed_feature_extractor(
audio_data,
sampling_rate=sr,
return_tensors="pt"
)
inputs = {k: v.to(DEVICE) for k, v in inputs.items()}
with torch.no_grad():
outputs = embed_model(**inputs)
hidden_states = outputs.last_hidden_state
embedding = hidden_states.mean(dim=1).squeeze().cpu().numpy()
return embedding
except Exception as e:
logger.error(f"Audio embedding failed: {e}")
return None
def load_reference_embeddings() -> Tuple[Dict[str, List[np.ndarray]], Dict[str, np.ndarray]]:
"""
Load reference embeddings for each dialect region from local wav files.
"""
# Check if embedding model is available (globally defined)
try:
if embed_model is None or embed_feature_extractor is None:
logger.warning("Embedding model missing; reference embeddings disabled.")
return {}, {}
except NameError:
# embed_model not defined - embedding model disabled
logger.debug("Embedding model not defined; reference embeddings disabled.")
return {}, {}
if not DIALECT_REF_DIR.exists():
logger.warning(f"Dialect reference directory not found: {DIALECT_REF_DIR}")
return {}, {}
embeddings: Dict[str, List[np.ndarray]] = defaultdict(list)
for wav_path in sorted(DIALECT_REF_DIR.glob("*.wav")):
region_name = _resolve_region_from_name(wav_path.stem)
if not region_name:
logger.debug(f"Could not resolve region for reference file {wav_path.name}")
continue
emb = embed_audio(str(wav_path))
if emb is not None:
embeddings[region_name].append(emb)
prototypes: Dict[str, np.ndarray] = {}
for region_name, vectors in embeddings.items():
if vectors:
prototypes[region_name] = np.mean(vectors, axis=0)
logger.info(f"Loaded {len(vectors)} reference embeddings for {region_name}")
if not prototypes:
logger.warning("No dialect reference prototypes were built.")
return embeddings, prototypes
# Embedding model disabled - reference embeddings are not loaded, so
# DIALECT_REF_EMBEDDINGS and DIALECT_PROTOTYPES (defined above) stay empty.
logger.debug("Embedding model disabled - skipping reference embeddings loading")
# =========================================
# ZERO-SHOT DIALECT CLASSIFIER
# =========================================
def cosine_similarity(vec_a: np.ndarray, vec_b: np.ndarray) -> float:
denom = (np.linalg.norm(vec_a) * np.linalg.norm(vec_b)) + 1e-10
return float(np.dot(vec_a, vec_b) / denom)
def predict_dialect(audio_path: str) -> Tuple[str, Dict[str, float]]:
"""
Predict dialect region using cosine similarity against reference prototypes.
"""
if not DIALECT_PROTOTYPES:
logger.warning("No dialect prototypes available; returning fallback prediction.")
return "Bilinmiyor", {}
user_embedding = embed_audio(audio_path)
if user_embedding is None:
return "Bilinmiyor", {}
scores: Dict[str, float] = {}
for region_name, prototype_vec in DIALECT_PROTOTYPES.items():
similarity = cosine_similarity(user_embedding, prototype_vec)
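        # Map cosine similarity from [-1, 1] to [0, 1] so the scores are comparable
        # to the heuristic transcription-based scores.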
normalized = max(0.0, min(1.0, (similarity + 1) / 2))
scores[region_name] = round(normalized, 4)
if not scores:
return "Bilinmiyor", {}
predicted_region = max(scores, key=scores.get)
return predicted_region, scores
def generate_reply_text(region: str) -> str:
templates = {
"Karadeniz": "Aaa, sen demek Karadenizlisin! Hızlı ritim ve enerjik ton hemen belli ediyor kendini. 🌊",
"Doğu Anadolu": "Hmm, Doğu’dan bir hava aldım. Güçlü vurgular ve ağır ritim çok tanıdık. 🏔️",
"İç Anadolu": "Sende İç Anadolu’nun sakin ve net konuşması var gibi. Rahat ve dengeli. 🚜",
"Ege": "Ege rüzgarı gibi yumuşak tınlıyor sesin; huzur veren bir anlatım. 🌅",
"Akdeniz": "Akdeniz’in sıcaklığı ve enerjisi var sesinde, çok hareketli! ☀️",
"Marmara": "Oldukça dengeli ve şehirli bir ton; Marmara aksanı hissediliyor. 🌆",
"Güneydoğu Anadolu": "Güneydoğu’nun uzun vurguları ve sıcaklığı geliyor sesinden. 🔥",
}
if region in templates:
return templates[region]
if region and region != "Bilinmiyor":
return f"Sesinde {region} bölgesine benzeyen bir tını var. Çok hoş bir karışım yakalamışsın. 🙂"
return "Şive tahmin edemedim ama sesin oldukça ilgi çekici!"
def synthesize_elevenlabs(
text: str,
speaking_rate: Optional[float] = None,
pitch: Optional[float] = None
) -> Optional[str]:
"""
Convert reply text into speech using ElevenLabs.
"""
if not text:
return None
if not elevenlabs_client:
logger.warning("ElevenLabs client unavailable; cannot synthesize audio.")
return None
voice_settings: Dict[str, Any] = {
"stability": 0.4,
"similarity_boost": 0.8,
}
if speaking_rate is not None:
voice_settings["speaking_rate"] = speaking_rate
if pitch is not None:
voice_settings["pitch"] = pitch
try:
audio = elevenlabs_client.text_to_speech.convert(
voice_id=ELEVENLABS_VOICE_ID,
model_id=ELEVENLABS_MODEL_ID,
text=text,
voice_settings=voice_settings,
)
out_path = f"reply_{uuid4().hex}.wav"
save(audio, out_path)
return out_path
except Exception as e:
logger.error(f"ElevenLabs synthesis failed: {e}")
return None
# =========================================
# AUDIO PROCESSING
# =========================================
def process_audio(audio_data: np.ndarray, sample_rate: int) -> Tuple[np.ndarray, int]:
"""
Process audio: convert to mono, normalize, resample if needed.
Args:
audio_data: Audio signal as numpy array
sample_rate: Original sample rate
Returns:
Processed audio data and sample rate
"""
try:
        # Convert multichannel audio to mono if needed (expects shape (channels, samples))
        if audio_data.ndim > 1:
            audio_data = librosa.to_mono(audio_data)
            logger.info("Converted multichannel audio to mono")
# Resample to target rate if needed
if sample_rate != TARGET_SAMPLE_RATE:
audio_data = librosa.resample(
audio_data,
orig_sr=sample_rate,
target_sr=TARGET_SAMPLE_RATE
)
sample_rate = TARGET_SAMPLE_RATE
logger.info(f"Resampled to {TARGET_SAMPLE_RATE} Hz")
# Normalize audio
audio_data = librosa.util.normalize(audio_data)
return audio_data, sample_rate
except Exception as e:
logger.error(f"Error processing audio: {e}")
raise ValueError(f"Ses işleme hatası: {e}")
def validate_audio(audio_data: np.ndarray, sample_rate: int) -> None:
"""
Validate audio duration and quality.
Args:
audio_data: Audio signal
sample_rate: Sample rate
Raises:
ValueError: If audio is invalid
"""
    if len(audio_data) == 0:
        raise ValueError("Ses verisi boş.")
    duration = len(audio_data) / sample_rate
    if duration < MIN_AUDIO_DURATION:
        raise ValueError(
            f"Ses süresi en az {MIN_AUDIO_DURATION} saniye olmalı. "
            f"Mevcut süre: {duration:.2f} saniye."
        )
# =========================================
# ASR CORE
# =========================================
def run_asr(audio_data: np.ndarray, sample_rate: int) -> str:
"""
Run Whisper ASR on audio.
Args:
audio_data: Processed audio signal
sample_rate: Sample rate
Returns:
Transcription text
"""
try:
# Ensure audio is float32 (Whisper expects fp32 input)
audio_float = audio_data.astype(np.float32)
inputs = processor(
audio_float,
sampling_rate=sample_rate,
return_tensors="pt"
)
# Move to device and cast to target dtype (fp16 on GPU, fp32 on CPU)
input_features = inputs.input_features.to(device=DEVICE, dtype=DTYPE)
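        # Whisper's decoder accepts up to 448 target tokens, so max_length=400 below
        # stays within the model limit while still allowing long utterances.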
with torch.no_grad():
generated_ids = model.generate(
input_features,
max_length=400,
language="tr",
task="transcribe"
)
hypothesis = processor.batch_decode(
generated_ids,
skip_special_tokens=True
)[0]
logger.info(f"ASR output: {hypothesis}")
return hypothesis
except Exception as e:
logger.error(f"ASR error: {e}")
raise ValueError(f"Konuşma tanıma hatası: {e}")
# =========================================
# DIALECT ANALYSIS
# =========================================
def vowel_shift_score(transcription: str, profile: Dict[str, Any]) -> float:
"""
Score vowel shifts in transcription.
Enhanced scoring based on phonetic patterns.
Args:
transcription: ASR transcription
profile: Dialect profile
Returns:
Vowel shift score [0, 1]
"""
transcription_lower = transcription.lower()
shifts = profile["vowel_shifts"]
total_weight = sum(shifts.values())
if total_weight == 0:
return 0.0
score = 0.0
text_length = len(transcription_lower)
for shift_pattern, weight in shifts.items():
if "→" not in shift_pattern:
continue
source, target = shift_pattern.split("→")
# Count occurrences of target vowel/diphthong
# For elongated vowels (ee, oo, aa), look for repeated patterns
if len(target) > 1 and target[0] == target[1]:
# Look for elongated vowels
pattern = target[0] * 2
count = transcription_lower.count(pattern)
# Also check for common elongated patterns in Turkish
count += transcription_lower.count(target[0] + "ğ")
count += transcription_lower.count(target[0] + "y")
else:
count = transcription_lower.count(target)
        # Normalize to occurrences per 100 characters so clip length does not dominate
normalized_count = count / max(text_length, 1) * 100
score += normalized_count * weight
# Normalize by total weight
normalized_score = score / (total_weight * 10 + 1e-6)
return min(normalized_score, 1.0)
def marker_score(transcription: str, profile: Dict[str, Any]) -> float:
"""
Score lexical markers in transcription.
Args:
transcription: ASR transcription
profile: Dialect profile
Returns:
Marker score [0, 1]
"""
transcription_lower = transcription.lower()
markers = profile["markers"]
if not markers:
return 0.0
matches = sum(1 for marker in markers if marker in transcription_lower)
# Score based on proportion of markers found
score = matches / len(markers)
# Bonus for multiple occurrences
total_occurrences = sum(transcription_lower.count(marker) for marker in markers)
if total_occurrences > len(markers):
score = min(score * 1.2, 1.0)
return score
def prosody_score(
audio_data: np.ndarray,
sample_rate: int,
profile: Dict[str, Any]
) -> float:
"""
Analyze prosody: tempo, pitch characteristics.
Args:
audio_data: Audio signal
sample_rate: Sample rate
profile: Dialect profile
Returns:
Prosody score [0, 1]
"""
try:
# Normalize audio
audio_normalized = librosa.util.normalize(audio_data)
        # Tempo analysis (librosa.beat.tempo; newer librosa releases expose this
        # as librosa.feature.rhythm.tempo)
        tempo = float(librosa.beat.tempo(y=audio_normalized, sr=sample_rate)[0])
# Pitch analysis (fundamental frequency)
pitches, magnitudes = librosa.piptrack(
y=audio_normalized,
sr=sample_rate,
threshold=0.1
)
# Get pitch statistics
pitch_values = []
for t in range(pitches.shape[1]):
index = magnitudes[:, t].argmax()
pitch = pitches[index, t]
if pitch > 0:
pitch_values.append(pitch)
avg_pitch = np.mean(pitch_values) if pitch_values else 0.0
pitch_std = np.std(pitch_values) if len(pitch_values) > 1 else 0.0
# Score based on profile characteristics
prosody_type = profile["prosody"]
tempo_range = profile.get("tempo_range", (80, 120))
pitch_range_type = profile.get("pitch_range", "neutral")
# Tempo scoring
tempo_min, tempo_max = tempo_range
if tempo_min <= tempo <= tempo_max:
tempo_score = 1.0
else:
# Distance from range
if tempo < tempo_min:
tempo_score = max(0.0, tempo / tempo_min)
else:
tempo_score = max(0.0, 1.0 - (tempo - tempo_max) / tempo_max)
        # Pitch scoring based on profile (thresholds are rough F0 heuristics;
        # typical adult speaking F0 is roughly 85-255 Hz)
pitch_score = 0.5 # default
if pitch_range_type == "high" or pitch_range_type == "high-oscillating":
if avg_pitch > 200:
pitch_score = 1.0
elif avg_pitch > 150:
pitch_score = 0.7
elif pitch_range_type == "low" or pitch_range_type == "low-elongated":
if avg_pitch < 150:
pitch_score = 1.0
elif avg_pitch < 200:
pitch_score = 0.7
else: # neutral
if 150 <= avg_pitch <= 250:
pitch_score = 1.0
# Oscillation scoring (for Karadeniz)
oscillation_score = 0.5
if "oscillating" in pitch_range_type or "inişli-çıkışlı" in prosody_type:
if pitch_std > 50:
oscillation_score = 1.0
elif pitch_std > 30:
oscillation_score = 0.7
# Combine scores
if "oscillating" in pitch_range_type or "inişli-çıkışlı" in prosody_type:
final_score = (tempo_score * 0.4 + pitch_score * 0.3 + oscillation_score * 0.3)
else:
final_score = (tempo_score * 0.6 + pitch_score * 0.4)
return min(final_score, 1.0)
except Exception as e:
logger.warning(f"Prosody analysis error: {e}")
return 0.5 # Default neutral score
def dialect_similarity(
transcription: str,
audio_data: np.ndarray,
sample_rate: int
) -> Tuple[Dict[str, float], List[Tuple[str, float]]]:
"""
Calculate dialect similarity scores for all regions.
Args:
transcription: ASR transcription
audio_data: Audio signal
sample_rate: Sample rate
Returns:
Dictionary of scores and sorted predictions
"""
scores: Dict[str, float] = {}
for region, profile in DIALECT_PROFILES.items():
try:
vowel_score = vowel_shift_score(transcription, profile)
marker_score_val = marker_score(transcription, profile)
prosody_score_val = prosody_score(audio_data, sample_rate, profile)
# Weighted combination
combined_score = (
vowel_score * VOWEL_SHIFT_WEIGHT +
marker_score_val * MARKER_WEIGHT +
prosody_score_val * PROSODY_WEIGHT
)
scores[region] = round(combined_score, 3)
logger.info(
f"{region}: vowel={vowel_score:.3f}, "
f"marker={marker_score_val:.3f}, "
f"prosody={prosody_score_val:.3f}, "
f"combined={combined_score:.3f}"
)
except Exception as e:
logger.error(f"Error calculating score for {region}: {e}")
scores[region] = 0.0
# Sort by score
sorted_predictions = sorted(
scores.items(),
key=lambda x: x[1],
reverse=True
)
return scores, sorted_predictions
# =========================================
# VISUALIZATION
# =========================================
def plot_region_heatmap(
scores: Dict[str, float],
highlight_region: Optional[str] = None
) -> go.Figure:
"""
Create an interactive choropleth-style region heatmap for Türkiye dialect scores.
"""
try:
if not scores:
raise ValueError("Score verisi yok")
df = pd.DataFrame({
"region_name": list(scores.keys()),
"score": list(scores.values()),
})
min_score = float(df["score"].min())
max_score = float(df["score"].max())
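        # Guard against a degenerate color scale when all regions share the same score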
if min_score == max_score:
max_score = min_score + 0.01
fig = px.choropleth_mapbox(
df,
geojson=TURKEY_REGIONS_GEOJSON,
locations="region_name",
featureidkey="properties.name",
color="score",
color_continuous_scale="OrRd",
range_color=(min_score, max_score),
mapbox_style="carto-positron",
zoom=4.5,
center={"lat": 39.0, "lon": 35.0},
opacity=0.7,
labels={"score": "Benzerlik"},
)
fig.update_traces(marker_line_width=0.5, marker_line_color="white")
if highlight_region and highlight_region in df["region_name"].values:
highlight_df = df[df["region_name"] == highlight_region]
fig.add_choroplethmapbox(
geojson=TURKEY_REGIONS_GEOJSON,
locations=highlight_df["region_name"],
z=np.ones(len(highlight_df)),
featureidkey="properties.name",
colorscale=[[0, "rgba(0,0,0,0)"], [1, "rgba(0,0,0,0)"]],
showscale=False,
marker_opacity=0,
marker_line_width=3,
marker_line_color="black",
hovertext=highlight_df["region_name"],
name="Tahmin",
)
fig.add_annotation(
text=f"🗣 Tahmin: {highlight_region}",
x=0.5,
y=0.02,
xref="paper",
yref="paper",
showarrow=False,
bgcolor="white",
bordercolor="black",
borderwidth=1,
font=dict(size=14),
)
fig.update_layout(
margin=dict(l=10, r=10, t=40, b=10),
height=600,
coloraxis_colorbar=dict(title="Benzerlik"),
)
return fig
except Exception as e:
logger.error(f"Error creating heatmap: {e}")
fig = go.Figure()
fig.update_layout(
title="Harita yüklenemedi",
height=600
)
return fig
# =========================================
# MAIN PIPELINE
# =========================================
def analyze_and_reply(
audio_path: Optional[str]
) -> Tuple[str, str, str, Optional[str], go.Figure]:
"""
Full processing pipeline: audio → ASR → dialect analysis → TTS reply.
"""
def build_empty_fig(title: str = "Harita yüklenemedi") -> go.Figure:
fig = go.Figure()
fig.update_layout(title=title, height=600)
return fig
logger.info(f"Received audio_path: {audio_path}")
if audio_path is None:
logger.warning("Audio input is None.")
empty_fig = build_empty_fig()
return "Ses alınamadı. Lütfen tekrar deneyin.", "", "", None, empty_fig
# Check if file exists
if not os.path.exists(audio_path):
logger.error(f"Audio file does not exist: {audio_path}")
empty_fig = build_empty_fig()
return f"Ses dosyası bulunamadı: {audio_path}", "", "", None, empty_fig
try:
logger.info(f"Reading audio file: {audio_path}")
audio_data, sample_rate = sf.read(audio_path)
logger.info(f"Audio file read successfully. Duration: {len(audio_data)/sample_rate:.2f}s, Sample rate: {sample_rate}Hz")
if audio_data.ndim > 1:
audio_data = audio_data.T
audio_data = librosa.to_mono(audio_data)
audio_data = np.asarray(audio_data, dtype=np.float32)
except Exception as e:
logger.error(f"Error reading audio file: {e}")
empty_fig = build_empty_fig()
return f"Ses dosyası okunamadı: {e}", "", "", None, empty_fig
try:
processed_audio, processed_sr = process_audio(audio_data, sample_rate)
validate_audio(processed_audio, processed_sr)
except ValueError as e:
logger.error(f"Audio validation error: {e}")
empty_fig = build_empty_fig()
return str(e), "", "", None, empty_fig
try:
transcript = run_asr(processed_audio, processed_sr)
logger.info(f"ASR transcript: {transcript}")
except ValueError as e:
logger.error(f"ASR error: {e}")
empty_fig = build_empty_fig()
return str(e), "", "", None, empty_fig
# Use transcription-based dialect similarity analysis
similarity_scores, sorted_predictions = dialect_similarity(
transcript, processed_audio, processed_sr
)
# Also try embedding-based prediction as fallback
embedding_region, embedding_scores = predict_dialect(audio_path)
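    # With the embedding model disabled, predict_dialect() returns ("Bilinmiyor", {})
    # and the transcription-based branch below is always taken.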
# Always use transcription-based prediction if available (it should always work)
if similarity_scores and sorted_predictions and len(sorted_predictions) > 0:
# Use transcription-based prediction
predicted_region = sorted_predictions[0][0]
scores = similarity_scores
top_score = sorted_predictions[0][1]
logger.info(f"Using transcription-based prediction: {predicted_region} (score: {top_score:.4f})")
# Log top 3 predictions for debugging
if len(sorted_predictions) >= 3:
logger.info(f"Top 3 predictions: {[(r, f'{s:.4f}') for r, s in sorted_predictions[:3]]}")
elif embedding_scores and embedding_region != "Bilinmiyor" and max(embedding_scores.values()) > 0.01:
# Fallback to embedding-based
predicted_region = embedding_region
scores = embedding_scores
logger.info(f"Using embedding-based prediction: {predicted_region} (score: {max(embedding_scores.values()):.4f})")
    else:
        # Last resort: dialect_similarity() produced no scores at all
        predicted_region = next(iter(DIALECT_PROFILES), "Bilinmiyor")
        scores = {region: 0.1 for region in DIALECT_PROFILES}
        logger.error(f"All prediction methods failed, using fallback: {predicted_region}")
reply_text = generate_reply_text(predicted_region)
    reply_audio_path = synthesize_elevenlabs(reply_text)  # may be None if TTS is unavailable
heatmap_fig = plot_region_heatmap(scores, highlight_region=predicted_region if scores else None)
return (
transcript,
predicted_region,
reply_text,
reply_audio_path,
heatmap_fig
)
# =========================================
# UI — Ultra Modern Apple Glassmorphism Design
# =========================================
CSS = """
* {
box-sizing: border-box;
margin: 0;
padding: 0;
}
@keyframes float {
0%, 100% { transform: translateY(0px); }
50% { transform: translateY(-10px); }
}
@keyframes shimmer {
0% { background-position: -1000px 0; }
100% { background-position: 1000px 0; }
}
@keyframes pulse {
0%, 100% { opacity: 1; }
50% { opacity: 0.7; }
}
body {
background:
radial-gradient(circle at 20% 50%, rgba(120, 119, 198, 0.15) 0%, transparent 50%),
radial-gradient(circle at 80% 80%, rgba(255, 119, 198, 0.1) 0%, transparent 50%),
radial-gradient(circle at 40% 20%, rgba(99, 102, 241, 0.1) 0%, transparent 50%),
linear-gradient(135deg, #F8F9FA 0%, #E9ECEF 50%, #F1F3F5 100%) !important;
font-family: -apple-system, BlinkMacSystemFont, "SF Pro Display", "SF Pro Text", "Segoe UI", sans-serif;
-webkit-font-smoothing: antialiased;
-moz-osx-font-smoothing: grayscale;
min-height: 100vh;
position: relative;
overflow-x: hidden;
}
body::before {
content: '';
position: fixed;
top: 0;
left: 0;
right: 0;
bottom: 0;
background:
radial-gradient(circle at 20% 30%, rgba(99, 102, 241, 0.08) 0%, transparent 50%),
radial-gradient(circle at 80% 70%, rgba(168, 85, 247, 0.06) 0%, transparent 50%);
pointer-events: none;
z-index: 0;
}
.gradio-container {
background: transparent !important;
max-width: 1500px !important;
margin: 0 auto !important;
padding: 60px 30px !important;
position: relative;
z-index: 1;
}
h1 {
font-weight: 800 !important;
letter-spacing: -2.5px !important;
color: #1D1D1F !important;
margin: 0 !important;
background: linear-gradient(135deg, #1D1D1F 0%, #4A5568 50%, #1D1D1F 100%) !important;
background-size: 200% auto !important;
-webkit-background-clip: text !important;
-webkit-text-fill-color: #1D1D1F !important;
background-clip: text !important;
animation: shimmer 3s linear infinite !important;
opacity: 1 !important;
z-index: 10 !important;
position: relative !important;
visibility: visible !important;
}
.card {
background: rgba(255, 255, 255, 0.85) !important;
backdrop-filter: blur(30px) saturate(180%) !important;
-webkit-backdrop-filter: blur(30px) saturate(180%) !important;
padding: 28px !important;
border-radius: 20px !important;
border: 1px solid rgba(0, 0, 0, 0.08) !important;
margin-bottom: 20px !important;
box-shadow:
0 8px 32px rgba(0, 0, 0, 0.06),
0 4px 16px rgba(0, 0, 0, 0.04),
0 2px 8px rgba(0, 0, 0, 0.03),
inset 0 1px 0 rgba(255, 255, 255, 0.9) !important;
transition: all 0.3s cubic-bezier(0.4, 0, 0.2, 1) !important;
position: relative;
overflow: hidden;
}
.card::before {
content: '';
position: absolute;
top: 0;
left: -100%;
width: 100%;
height: 100%;
background: linear-gradient(90deg, transparent, rgba(255, 255, 255, 0.4), transparent);
transition: left 0.5s;
}
.card:hover::before {
left: 100%;
}
.card:hover {
transform: translateY(-4px) scale(1.01) !important;
box-shadow:
0 28px 80px rgba(0, 0, 0, 0.12),
0 12px 32px rgba(0, 0, 0, 0.08),
0 4px 12px rgba(0, 0, 0, 0.06),
inset 0 1px 0 rgba(255, 255, 255, 1),
inset 0 -1px 0 rgba(255, 255, 255, 0.6) !important;
border-color: rgba(255, 255, 255, 1) !important;
}
.label {
    font-weight: 700 !important;
    color: #1D1D1F !important;
    margin-bottom: 14px !important;
    font-size: 15px !important;
    letter-spacing: -0.3px !important;
    text-transform: uppercase;
    opacity: 0.8;
}
.textbox textarea,
.textbox input,
.dropdown select {
background: rgba(255, 255, 255, 0.95) !important;
backdrop-filter: blur(20px) saturate(180%) !important;
-webkit-backdrop-filter: blur(20px) saturate(180%) !important;
border: 1.5px solid rgba(0, 0, 0, 0.06) !important;
border-radius: 16px !important;
color: #1D1D1F !important;
padding: 16px 20px !important;
font-size: 15px !important;
font-weight: 500 !important;
transition: all 0.3s cubic-bezier(0.4, 0, 0.2, 1) !important;
box-shadow:
0 4px 12px rgba(0, 0, 0, 0.04),
inset 0 1px 2px rgba(0, 0, 0, 0.02) !important;
}
.textbox:focus-within,
.dropdown:focus-within {
border-color: #007AFF !important;
box-shadow:
0 8px 24px rgba(0, 122, 255, 0.2),
0 4px 12px rgba(0, 122, 255, 0.15),
inset 0 1px 2px rgba(0, 122, 255, 0.1) !important;
transform: translateY(-1px);
}
button.primary {
background: linear-gradient(135deg, #007AFF 0%, #0051D5 50%, #007AFF 100%) !important;
background-size: 200% auto !important;
border: none !important;
border-radius: 18px !important;
font-weight: 700 !important;
padding: 18px 40px !important;
font-size: 17px !important;
color: white !important;
letter-spacing: -0.2px !important;
box-shadow:
0 8px 24px rgba(0, 122, 255, 0.4),
0 4px 12px rgba(0, 122, 255, 0.3),
inset 0 1px 0 rgba(255, 255, 255, 0.3),
inset 0 -1px 0 rgba(0, 0, 0, 0.1) !important;
transition: all 0.3s cubic-bezier(0.34, 1.56, 0.64, 1) !important;
cursor: pointer !important;
position: relative;
overflow: hidden;
text-transform: none !important;
}
button.primary::before {
content: '';
position: absolute;
top: 0;
left: -100%;
width: 100%;
height: 100%;
background: linear-gradient(90deg, transparent, rgba(255, 255, 255, 0.3), transparent);
transition: left 0.5s;
}
button.primary:hover::before {
left: 100%;
}
button.primary:hover {
transform: translateY(-3px) scale(1.02) !important;
box-shadow:
0 12px 32px rgba(0, 122, 255, 0.5),
0 6px 16px rgba(0, 122, 255, 0.4),
inset 0 1px 0 rgba(255, 255, 255, 0.4),
inset 0 -1px 0 rgba(0, 0, 0, 0.15) !important;
background-position: right center !important;
}
button.primary:active {
transform: translateY(-1px) scale(1.01) !important;
box-shadow:
0 4px 16px rgba(0, 122, 255, 0.4),
inset 0 1px 0 rgba(255, 255, 255, 0.2) !important;
}
.json {
font-family: "SF Mono", "Monaco", "Menlo", "Courier New", monospace !important;
font-size: 13px !important;
background: rgba(248, 249, 250, 0.9) !important;
backdrop-filter: blur(20px) saturate(180%) !important;
-webkit-backdrop-filter: blur(20px) saturate(180%) !important;
border: 1px solid rgba(0, 0, 0, 0.05) !important;
border-radius: 16px !important;
padding: 24px !important;
color: #1D1D1F !important;
line-height: 1.7 !important;
box-shadow:
inset 0 2px 8px rgba(0, 0, 0, 0.03),
inset 0 1px 2px rgba(0, 0, 0, 0.02) !important;
}
.markdown {
color: #1D1D1F !important;
}
.markdown * {
visibility: visible !important;
opacity: 1 !important;
display: block !important;
}
.markdown div {
display: block !important;
visibility: visible !important;
opacity: 1 !important;
color: inherit !important;
}
.markdown h1, .header-markdown h1, .main-title {
color: #1D1D1F !important;
margin-bottom: 16px !important;
margin-top: 50px !important;
font-size: 3.5rem !important;
font-weight: 800 !important;
letter-spacing: -2px !important;
line-height: 1.2 !important;
text-shadow: 0 2px 8px rgba(0,0,0,0.1) !important;
-webkit-text-fill-color: #1D1D1F !important;
background: none !important;
background-image: none !important;
opacity: 1 !important;
z-index: 10 !important;
position: relative !important;
visibility: visible !important;
display: block !important;
text-align: center !important;
}
.header-markdown {
text-align: center !important;
}
.header-markdown p {
color: #6E6E73 !important;
font-size: 1.15rem !important;
margin-top: 8px !important;
opacity: 0.9 !important;
}
.markdown h1 span {
color: #1D1D1F !important;
-webkit-text-fill-color: #1D1D1F !important;
background: none !important;
display: inline-block !important;
}
.markdown p {
display: block !important;
visibility: visible !important;
opacity: 1 !important;
color: #6E6E73 !important;
margin: 0 !important;
}
.instruction-text {
display: block !important;
visibility: visible !important;
text-align: center !important;
margin-top: -20px !important;
margin-bottom: 40px !important;
color: #6E6E73 !important;
font-size: 1.1rem !important;
opacity: 0.9 !important;
padding: 0 20px !important;
}
.instruction-text p {
display: block !important;
visibility: visible !important;
opacity: 1 !important;
color: #6E6E73 !important;
margin: 0 !important;
}
.header-container {
display: block !important;
visibility: visible !important;
opacity: 1 !important;
}
.header-container h1 {
display: block !important;
visibility: visible !important;
opacity: 1 !important;
}
.header-container p {
display: block !important;
visibility: visible !important;
opacity: 1 !important;
}
/* HTML component styles */
.html-component, .html-component * {
display: block !important;
visibility: visible !important;
opacity: 1 !important;
}
.html-component h1, .html-component .main-title {
color: #1D1D1F !important;
-webkit-text-fill-color: #1D1D1F !important;
background: none !important;
background-image: none !important;
display: block !important;
}
.html-component p {
display: block !important;
visibility: visible !important;
color: #6E6E73 !important;
}
.markdown p {
color: #6E6E73;
opacity: 0.95;
font-size: 1.15rem;
font-weight: 400;
line-height: 1.6;
letter-spacing: -0.2px;
}
.audio-component {
background: rgba(255, 255, 255, 0.95) !important;
backdrop-filter: blur(30px) saturate(200%) !important;
-webkit-backdrop-filter: blur(30px) saturate(200%) !important;
border-radius: 20px !important;
border: 1.5px solid rgba(255, 255, 255, 0.8) !important;
padding: 20px !important;
box-shadow:
0 8px 24px rgba(0, 0, 0, 0.06),
0 4px 12px rgba(0, 0, 0, 0.04),
inset 0 1px 0 rgba(255, 255, 255, 0.9) !important;
transition: all 0.3s ease !important;
}
.audio-component:hover {
box-shadow:
0 12px 32px rgba(0, 0, 0, 0.08),
0 6px 16px rgba(0, 0, 0, 0.06),
inset 0 1px 0 rgba(255, 255, 255, 1) !important;
}
/* Ultra smooth scrollbar */
::-webkit-scrollbar {
width: 10px;
height: 10px;
}
::-webkit-scrollbar-track {
background: rgba(0, 0, 0, 0.02);
border-radius: 10px;
}
::-webkit-scrollbar-thumb {
background: linear-gradient(135deg, rgba(0, 122, 255, 0.3), rgba(0, 81, 213, 0.4));
border-radius: 10px;
border: 2px solid transparent;
background-clip: padding-box;
}
::-webkit-scrollbar-thumb:hover {
background: linear-gradient(135deg, rgba(0, 122, 255, 0.5), rgba(0, 81, 213, 0.6));
background-clip: padding-box;
}
/* Loading animation */
@keyframes spin {
from { transform: rotate(0deg); }
to { transform: rotate(360deg); }
}
/* Enhanced focus states */
*:focus-visible {
outline: 2px solid #007AFF;
outline-offset: 2px;
border-radius: 4px;
}
"""
def build_ui() -> gr.Blocks:
"""
Build Gradio UI with Apple minimal white + smooth glass design.
Returns:
Gradio Blocks interface
"""
with gr.Blocks(
css=CSS,
fill_height=True,
theme=gr.themes.Soft()
) as demo:
gr.Markdown(
"""
# 🇹🇷 Dialect Intelligence Engine
Powered by Meta Omnilingual ASR & Whisper Large-v3
""",
elem_classes="header-markdown"
)
gr.Markdown(
"""
Mikrofona bas, doğal bir şekilde konuş. Sistem şiveni analiz edip seni haritada işaretlesin ve AI sesiyle cevap versin.
""",
elem_classes="instruction-text"
)
with gr.Row(equal_height=False):
with gr.Column(scale=1, min_width=400):
audio_input = gr.Audio(
sources=["microphone", "upload"],
type="filepath",
label="🎤 Mikrofona basın, konuşun, kaydı durdurun",
show_label=True,
interactive=True,
elem_classes="card"
)
analyze_button = gr.Button(
"🔍 Analiz Et ve Şive Tahmini Yap",
variant="primary",
elem_classes="primary",
visible=True,
scale=1
)
gr.Markdown(
"📝 Ses kaydını tamamladıktan sonra butona tıklayın",
elem_classes="instruction-text"
)
with gr.Column(scale=2, min_width=600):
transcript_output = gr.Textbox(
label="Transcript",
lines=4,
interactive=False,
placeholder="Konuşmanı bekliyorum...",
elem_classes="card"
)
with gr.Row():
predicted_dialect = gr.Textbox(
label="Tahmin Edilen Bölge",
interactive=False,
lines=2,
elem_classes="card"
)
reply_text_output = gr.Textbox(
label="Model Cevabı (Metin)",
interactive=False,
lines=2,
elem_classes="card"
)
reply_audio_output = gr.Audio(
label="Model Cevabı (Ses)",
type="filepath",
interactive=False,
autoplay=True,
elem_classes="card"
)
region_map = gr.Plot(
label="Bölgesel Harita Isı Dağılımı",
elem_classes="card"
)
def build_empty_fig_ui():
"""Build empty figure for UI"""
fig = go.Figure()
fig.update_layout(title="Harita yüklenemedi", height=600)
return fig
def analyze_and_reply_with_autoplay(audio_path):
"""Wrapper to ensure audio autoplays after generation"""
logger.info(f"analyze_and_reply_with_autoplay called with audio_path: {audio_path}")
if audio_path is None:
logger.warning("audio_path is None in wrapper")
empty_fig = build_empty_fig_ui()
return "", "", "", None, empty_fig
result = analyze_and_reply(audio_path)
# Return result - Gradio will handle autoplay if autoplay=True is set
return result
# Both button click and audio change trigger analysis
analyze_button.click(
fn=analyze_and_reply_with_autoplay,
inputs=audio_input,
outputs=[
transcript_output,
predicted_dialect,
reply_text_output,
reply_audio_output,
region_map
]
)
# Also trigger on change (for file uploads and when recording stops)
audio_input.change(
fn=analyze_and_reply_with_autoplay,
inputs=audio_input,
outputs=[
transcript_output,
predicted_dialect,
reply_text_output,
reply_audio_output,
region_map
]
)
# Add JavaScript for autoplay
demo.load(
fn=None,
js="""
function() {
// Auto-play audio when it's updated
const observer = new MutationObserver(function(mutations) {
mutations.forEach(function(mutation) {
mutation.addedNodes.forEach(function(node) {
if (node.nodeType === 1) {
const audio = node.querySelector('audio');
if (audio && audio.src && !audio.hasAttribute('data-autoplayed')) {
audio.setAttribute('data-autoplayed', 'true');
audio.play().catch(e => console.log('Autoplay prevented:', e));
}
}
});
});
});
observer.observe(document.body, {
childList: true,
subtree: true
});
}
"""
)
# Auto-play audio when it's generated using JavaScript callback
reply_audio_output.change(
fn=None,
inputs=None,
outputs=None,
js="""
function() {
setTimeout(function() {
// Find the audio element by looking for the reply audio component
const labels = Array.from(document.querySelectorAll('label'));
const replyLabel = labels.find(label =>
label.textContent && label.textContent.includes('Model Cevabı (Ses)')
);
if (replyLabel) {
const audioContainer = replyLabel.closest('.card') || replyLabel.parentElement;
const audioElement = audioContainer ? audioContainer.querySelector('audio') : null;
if (audioElement && audioElement.src) {
// Reset and play
audioElement.currentTime = 0;
const playPromise = audioElement.play();
if (playPromise !== undefined) {
playPromise.catch(function(error) {
console.log('Autoplay prevented by browser:', error);
});
}
}
}
}, 800); // Wait for audio to be fully loaded
return [];
}
"""
)
return demo
# =========================================
# MAIN
# =========================================
demo = build_ui()
if __name__ == "__main__":
demo.launch(
server_name="0.0.0.0",
server_port=7860,
share=False,
ssr_mode=False # Fix for HF Spaces microphone bug
)