|
|
|
|
|
|
|
|
|
|
|
import os |
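
# Cap OpenMP threading before numpy/torch are imported so the setting
# actually takes effect in their thread pools.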
|
|
|
|
|
os.environ["OMP_NUM_THREADS"] = "1" |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import logging |
|
|
from pathlib import Path |
|
|
from typing import Dict, Tuple, List, Optional, Any |
|
|
from collections import defaultdict |
|
|
from uuid import uuid4 |
|
|
|
|
|
import numpy as np |
|
|
import librosa |
|
|
import soundfile as sf |
|
|
import torch |
|
|
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq, AutoFeatureExtractor, AutoModel |
|
|
import plotly.graph_objects as go |
|
|
import plotly.express as px |
|
|
import pandas as pd |
|
|
|
|
|
import gradio as gr |
|
|
from elevenlabs import ElevenLabs, save |
|
|
|
|
|
from regions_geojson import TURKEY_REGIONS_GEOJSON |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
logging.basicConfig( |
|
|
level=logging.INFO, |
|
|
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s" |
|
|
) |
|
|
logger = logging.getLogger(__name__) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
MODEL_ID = "openai/whisper-large-v3" |
|
|
MIN_AUDIO_DURATION = 3.0 |
|
|
VOWEL_SHIFT_WEIGHT = 0.35 |
|
|
MARKER_WEIGHT = 0.40 |
|
|
PROSODY_WEIGHT = 0.25 |
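# The three feature weights sum to 1.0, so combined dialect scores stay in [0, 1].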
|
|
FAST_TEMPO_THRESHOLD = 140.0 |
|
|
SLOW_TEMPO_THRESHOLD = 80.0 |
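# NOTE: the fast/slow tempo thresholds are currently unused; per-profile
# "tempo_range" values drive the prosody scoring instead.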
|
|
TARGET_SAMPLE_RATE = 16000 |
|
|
EMBED_MODEL_ID = "facebook/wav2vec2-large-xlsr-53" |
|
|
EMBED_SAMPLE_RATE = 16000 |
|
|
DIALECT_REF_DIR = Path("data/dialects") |
|
|
ELEVENLABS_VOICE_ID = "Q5n6GDIjpN0pLOlycRFT" |
|
|
ELEVENLABS_MODEL_ID = "eleven_multilingual_v2" |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
DEVICE = "cuda" if torch.cuda.is_available() else "cpu" |
|
|
DTYPE = torch.float16 if DEVICE == "cuda" else torch.float32 |
|
|
|
|
|
logger.info(f"Using device: {DEVICE}, dtype: {DTYPE}") |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
try: |
|
|
processor = AutoProcessor.from_pretrained(MODEL_ID) |
|
|
|
|
|
model = AutoModelForSpeechSeq2Seq.from_pretrained( |
|
|
MODEL_ID, |
|
|
torch_dtype=DTYPE |
|
|
) |
|
|
|
|
|
model = model.to(DEVICE) |
|
|
model.eval() |
|
|
|
|
|
logger.info("Model loaded successfully") |
|
|
except Exception as e: |
|
|
logger.error(f"Error loading model: {e}") |
|
|
raise |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
embed_feature_extractor = None |
|
|
embed_model = None |
|
|
logger.debug("Embedding model disabled - using transcription-based analysis only") |
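# To re-enable embedding-based matching, load the handles above via
# AutoFeatureExtractor.from_pretrained(EMBED_MODEL_ID) and
# AutoModel.from_pretrained(EMBED_MODEL_ID), then call load_reference_embeddings().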
|
|
|
|
|
DIALECT_REF_EMBEDDINGS: Dict[str, List[np.ndarray]] = defaultdict(list) |
|
|
DIALECT_PROTOTYPES: Dict[str, np.ndarray] = {} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
ELEVENLABS_API_KEY = os.environ.get("ELEVENLABS_API_KEY") |
|
|
if ELEVENLABS_API_KEY: |
|
|
try: |
|
|
elevenlabs_client = ElevenLabs(api_key=ELEVENLABS_API_KEY) |
|
|
logger.info("ElevenLabs client initialized") |
|
|
except Exception as e: |
|
|
elevenlabs_client = None |
|
|
logger.warning(f"Failed to initialize ElevenLabs client: {e}") |
|
|
else: |
|
|
elevenlabs_client = None |
|
|
logger.warning("ELEVENLABS_API_KEY not found. Voice replies will be disabled.") |
|
|
|
|
|
|
|
|
|
|
|
DIALECT_PROFILES: Dict[str, Dict[str, Any]] = { |
|
|
"Marmara": { |
|
|
"description": "İstanbul-Trakya şehir aksanı, düz prosodi.", |
|
|
"vowel_shifts": {"a→ı": 0.1, "ı→i": 0.15, "e→i": 0.15, "o→u": 0.1}, |
|
|
"markers": ["abi", "aynen", "bi şey dicem", "baksana"], |
|
|
"prosody": "düz-orta", |
|
|
"tempo_range": (100, 130), |
|
|
"pitch_range": "neutral" |
|
|
}, |
|
|
"Ege": { |
|
|
"description": "Melodik, uzatmalı, 'gari', 'hee' kültürüne sahip.", |
|
|
"vowel_shifts": {"e→ee": 0.85, "o→oo": 0.75, "a→aa": 0.4}, |
|
|
"markers": ["gari", "hee", "ebe", "söyleyiver"], |
|
|
"prosody": "yavaş-uzatmalı", |
|
|
"tempo_range": (60, 90), |
|
|
"pitch_range": "medium" |
|
|
}, |
|
|
"Akdeniz": { |
|
|
"description": "Hızlı, enerjik, 'la' baskın aksan.", |
|
|
"vowel_shifts": {"a→aa": 0.65, "ı→i": 0.35}, |
|
|
"markers": ["la", "naapıyon la", "hee la"], |
|
|
"prosody": "enerjik-hızlı", |
|
|
"tempo_range": (130, 160), |
|
|
"pitch_range": "high" |
|
|
}, |
|
|
"İç Anadolu": { |
|
|
"description": "Düz ritmik, ı/i kaymaları belirgin.", |
|
|
"vowel_shifts": {"ı→i": 0.7, "a→ı": 0.5, "o→u": 0.3}, |
|
|
"markers": ["gelisen", "gideceksen", "hele bi dur"], |
|
|
"prosody": "düz-ritmik", |
|
|
"tempo_range": (100, 125), |
|
|
"pitch_range": "neutral" |
|
|
}, |
|
|
"Karadeniz": { |
|
|
"description": "Yüksek tonlama, hızlı, ünlü daralması.", |
|
|
"vowel_shifts": {"e→i": 0.9, "ö→u": 0.8, "a→e": 0.3}, |
|
|
"markers": ["ha bu", "da gel daa", "nere gideysin"], |
|
|
"prosody": "yüksek-inişli-çıkışlı", |
|
|
"tempo_range": (120, 150), |
|
|
"pitch_range": "high-oscillating" |
|
|
}, |
|
|
"Doğu Anadolu": { |
|
|
"description": "Ağır tempo, geniş ünlü uzatmaları.", |
|
|
"vowel_shifts": {"ı→i": 0.75, "u→o": 0.65, "a→â": 0.4}, |
|
|
"markers": ["he vallah", "gardaş", "ağabey"], |
|
|
"prosody": "düşük-ağır", |
|
|
"tempo_range": (70, 100), |
|
|
"pitch_range": "low" |
|
|
}, |
|
|
"Güneydoğu Anadolu": { |
|
|
"description": "Ê/Î uzatmaları, uzun vurgu, ağır tempo.", |
|
|
"vowel_shifts": {"a→ê": 0.9, "e→ê": 0.95, "i→î": 0.6}, |
|
|
"markers": ["ê", "hele", "gardaş", "bacı"], |
|
|
"prosody": "uzun-vurgulu-ağır", |
|
|
"tempo_range": (65, 95), |
|
|
"pitch_range": "low-elongated" |
|
|
} |
|
|
} |
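
# Each profile above combines three heuristic feature families: "vowel_shifts"
# maps source→target patterns to weights, "markers" lists lexical cues matched
# as substrings of the transcript, and "tempo_range" / "pitch_range"
# parameterize the prosody check below.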
|
|
|
|
|
|
|
|
|
|
|
|
|
|
REGION_ALIAS_MAP = { |
|
|
region.lower().replace(" ", ""): region |
|
|
for region in DIALECT_PROFILES.keys() |
|
|
} |
|
|
|
|
|
|
|
|
def _resolve_region_from_name(name: str) -> Optional[str]: |
|
|
key = ( |
|
|
name.lower() |
|
|
.replace("-", "") |
|
|
.replace("_", "") |
|
|
.replace(" ", "") |
|
|
) |
|
|
if key in REGION_ALIAS_MAP: |
|
|
return REGION_ALIAS_MAP[key] |
|
|
for alias_key, region_name in REGION_ALIAS_MAP.items(): |
|
|
if alias_key in key or key in alias_key: |
|
|
return region_name |
|
|
return None |
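
# Example: a reference file "karadeniz_01.wav" has stem "karadeniz_01", which
# normalizes to "karadeniz01" and matches the "karadeniz" alias via the
# substring check, resolving to "Karadeniz".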
|
|
|
|
|
|
|
|
def embed_audio(audio_path: str) -> Optional[np.ndarray]: |
|
|
""" |
|
|
Convert an audio file into a fixed-length embedding vector. |
|
|
""" |
|
|
if embed_model is None or embed_feature_extractor is None: |
|
|
logger.warning("Embedding model unavailable; cannot embed audio.") |
|
|
return None |
|
|
try: |
|
|
audio_data, sr = sf.read(audio_path) |
|
|
if audio_data.ndim > 1: |
|
|
audio_data = np.mean(audio_data, axis=1) |
|
|
if sr != EMBED_SAMPLE_RATE: |
|
|
audio_data = librosa.resample( |
|
|
audio_data, |
|
|
orig_sr=sr, |
|
|
target_sr=EMBED_SAMPLE_RATE |
|
|
) |
|
|
sr = EMBED_SAMPLE_RATE |
|
|
|
|
|
inputs = embed_feature_extractor( |
|
|
audio_data, |
|
|
sampling_rate=sr, |
|
|
return_tensors="pt" |
|
|
) |
|
|
inputs = {k: v.to(DEVICE) for k, v in inputs.items()} |
|
|
|
|
|
with torch.no_grad(): |
|
|
outputs = embed_model(**inputs) |
|
|
hidden_states = outputs.last_hidden_state |
|
|
embedding = hidden_states.mean(dim=1).squeeze().cpu().numpy() |
|
|
return embedding |
|
|
except Exception as e: |
|
|
logger.error(f"Audio embedding failed: {e}") |
|
|
return None |
|
|
|
|
|
|
|
|
def load_reference_embeddings() -> Tuple[Dict[str, List[np.ndarray]], Dict[str, np.ndarray]]: |
|
|
""" |
|
|
Load reference embeddings for each dialect region from local wav files. |
|
|
""" |
|
|
|
|
|
    # The embedding handles are module-level globals, so a plain None check is
    # sufficient here.
    if embed_model is None or embed_feature_extractor is None:
        logger.warning("Embedding model missing; reference embeddings disabled.")
        return {}, {}
|
|
|
|
|
if not DIALECT_REF_DIR.exists(): |
|
|
logger.warning(f"Dialect reference directory not found: {DIALECT_REF_DIR}") |
|
|
return {}, {} |
|
|
|
|
|
embeddings: Dict[str, List[np.ndarray]] = defaultdict(list) |
|
|
for wav_path in sorted(DIALECT_REF_DIR.glob("*.wav")): |
|
|
region_name = _resolve_region_from_name(wav_path.stem) |
|
|
if not region_name: |
|
|
logger.debug(f"Could not resolve region for reference file {wav_path.name}") |
|
|
continue |
|
|
emb = embed_audio(str(wav_path)) |
|
|
if emb is not None: |
|
|
embeddings[region_name].append(emb) |
|
|
|
|
|
prototypes: Dict[str, np.ndarray] = {} |
|
|
for region_name, vectors in embeddings.items(): |
|
|
if vectors: |
|
|
prototypes[region_name] = np.mean(vectors, axis=0) |
|
|
logger.info(f"Loaded {len(vectors)} reference embeddings for {region_name}") |
|
|
|
|
|
if not prototypes: |
|
|
logger.warning("No dialect reference prototypes were built.") |
|
|
|
|
|
return embeddings, prototypes |
|
|
|
|
|
|
|
|
|
|
|
# The embedding model is disabled above, so DIALECT_REF_EMBEDDINGS and
# DIALECT_PROTOTYPES (initialized earlier) stay empty; call
# load_reference_embeddings() here once an embedding model and reference wav
# files are available.
logger.debug("Embedding model disabled - skipping reference embeddings loading")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def cosine_similarity(vec_a: np.ndarray, vec_b: np.ndarray) -> float: |
|
|
denom = (np.linalg.norm(vec_a) * np.linalg.norm(vec_b)) + 1e-10 |
|
|
return float(np.dot(vec_a, vec_b) / denom) |
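
# Sanity check: identical-direction vectors score ~1.0, orthogonal ones ~0.0;
# the 1e-10 term guards against division by zero for all-zero embeddings.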
|
|
|
|
|
|
|
|
def predict_dialect(audio_path: str) -> Tuple[str, Dict[str, float]]: |
|
|
""" |
|
|
Predict dialect region using cosine similarity against reference prototypes. |
|
|
""" |
|
|
if not DIALECT_PROTOTYPES: |
|
|
logger.warning("No dialect prototypes available; returning fallback prediction.") |
|
|
return "Bilinmiyor", {} |
|
|
|
|
|
user_embedding = embed_audio(audio_path) |
|
|
if user_embedding is None: |
|
|
return "Bilinmiyor", {} |
|
|
|
|
|
scores: Dict[str, float] = {} |
|
|
for region_name, prototype_vec in DIALECT_PROTOTYPES.items(): |
|
|
similarity = cosine_similarity(user_embedding, prototype_vec) |
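        # Cosine similarity lies in [-1, 1]; rescale to [0, 1] so scores are
        # comparable with the transcription-based ones.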
|
|
normalized = max(0.0, min(1.0, (similarity + 1) / 2)) |
|
|
scores[region_name] = round(normalized, 4) |
|
|
|
|
|
if not scores: |
|
|
return "Bilinmiyor", {} |
|
|
|
|
|
predicted_region = max(scores, key=scores.get) |
|
|
return predicted_region, scores |
|
|
|
|
|
|
|
|
def generate_reply_text(region: str) -> str: |
|
|
templates = { |
|
|
"Karadeniz": "Aaa, sen demek Karadenizlisin! Hızlı ritim ve enerjik ton hemen belli ediyor kendini. 🌊", |
|
|
"Doğu Anadolu": "Hmm, Doğu’dan bir hava aldım. Güçlü vurgular ve ağır ritim çok tanıdık. 🏔️", |
|
|
"İç Anadolu": "Sende İç Anadolu’nun sakin ve net konuşması var gibi. Rahat ve dengeli. 🚜", |
|
|
"Ege": "Ege rüzgarı gibi yumuşak tınlıyor sesin; huzur veren bir anlatım. 🌅", |
|
|
"Akdeniz": "Akdeniz’in sıcaklığı ve enerjisi var sesinde, çok hareketli! ☀️", |
|
|
"Marmara": "Oldukça dengeli ve şehirli bir ton; Marmara aksanı hissediliyor. 🌆", |
|
|
"Güneydoğu Anadolu": "Güneydoğu’nun uzun vurguları ve sıcaklığı geliyor sesinden. 🔥", |
|
|
} |
|
|
if region in templates: |
|
|
return templates[region] |
|
|
if region and region != "Bilinmiyor": |
|
|
return f"Sesinde {region} bölgesine benzeyen bir tını var. Çok hoş bir karışım yakalamışsın. 🙂" |
|
|
return "Şive tahmin edemedim ama sesin oldukça ilgi çekici!" |
|
|
|
|
|
|
|
|
def synthesize_elevenlabs( |
|
|
text: str, |
|
|
speaking_rate: Optional[float] = None, |
|
|
pitch: Optional[float] = None |
|
|
) -> Optional[str]: |
|
|
""" |
|
|
Convert reply text into speech using ElevenLabs. |
|
|
""" |
|
|
if not text: |
|
|
return None |
|
|
if not elevenlabs_client: |
|
|
logger.warning("ElevenLabs client unavailable; cannot synthesize audio.") |
|
|
return None |
|
|
|
|
|
voice_settings: Dict[str, Any] = { |
|
|
"stability": 0.4, |
|
|
"similarity_boost": 0.8, |
|
|
} |
|
|
if speaking_rate is not None: |
|
|
voice_settings["speaking_rate"] = speaking_rate |
|
|
if pitch is not None: |
|
|
voice_settings["pitch"] = pitch |
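    # NOTE: "speaking_rate" and "pitch" are forwarded as-is; whether the
    # ElevenLabs API honors them depends on the SDK/API version, so treat them
    # as best-effort hints rather than guaranteed controls.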
|
|
|
|
|
try: |
|
|
audio = elevenlabs_client.text_to_speech.convert( |
|
|
voice_id=ELEVENLABS_VOICE_ID, |
|
|
model_id=ELEVENLABS_MODEL_ID, |
|
|
text=text, |
|
|
voice_settings=voice_settings, |
|
|
) |
|
|
        # convert() streams MP3 by default (mp3_44100_128 in current SDKs), so
        # label the saved file accordingly.
        out_path = f"reply_{uuid4().hex}.mp3"
|
|
save(audio, out_path) |
|
|
return out_path |
|
|
except Exception as e: |
|
|
logger.error(f"ElevenLabs synthesis failed: {e}") |
|
|
return None |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def process_audio(audio_data: np.ndarray, sample_rate: int) -> Tuple[np.ndarray, int]: |
|
|
""" |
|
|
Process audio: convert to mono, normalize, resample if needed. |
|
|
|
|
|
Args: |
|
|
audio_data: Audio signal as numpy array |
|
|
sample_rate: Original sample rate |
|
|
|
|
|
Returns: |
|
|
Processed audio data and sample rate |
|
|
""" |
|
|
try: |
|
|
|
|
|
        if audio_data.ndim > 1:
            # soundfile-style arrays are (frames, channels); librosa.to_mono
            # expects channels first, so transpose before downmixing.
            audio_data = librosa.to_mono(audio_data.T)
            logger.info("Converted stereo to mono")
|
|
|
|
|
|
|
|
if sample_rate != TARGET_SAMPLE_RATE: |
|
|
audio_data = librosa.resample( |
|
|
audio_data, |
|
|
orig_sr=sample_rate, |
|
|
target_sr=TARGET_SAMPLE_RATE |
|
|
) |
|
|
sample_rate = TARGET_SAMPLE_RATE |
|
|
logger.info(f"Resampled to {TARGET_SAMPLE_RATE} Hz") |
|
|
|
|
|
|
|
|
audio_data = librosa.util.normalize(audio_data) |
|
|
|
|
|
return audio_data, sample_rate |
|
|
except Exception as e: |
|
|
logger.error(f"Error processing audio: {e}") |
|
|
raise ValueError(f"Ses işleme hatası: {e}") |
|
|
|
|
|
|
|
|
def validate_audio(audio_data: np.ndarray, sample_rate: int) -> None: |
|
|
""" |
|
|
Validate audio duration and quality. |
|
|
|
|
|
Args: |
|
|
audio_data: Audio signal |
|
|
sample_rate: Sample rate |
|
|
|
|
|
Raises: |
|
|
ValueError: If audio is invalid |
|
|
""" |
|
|
    # Check for empty input first so the duration message cannot be misleading.
    if len(audio_data) == 0:
        raise ValueError("Ses verisi boş.")

    duration = len(audio_data) / sample_rate

    if duration < MIN_AUDIO_DURATION:
        raise ValueError(
            f"Ses süresi en az {MIN_AUDIO_DURATION} saniye olmalı. "
            f"Mevcut süre: {duration:.2f} saniye."
        )
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def run_asr(audio_data: np.ndarray, sample_rate: int) -> str: |
|
|
""" |
|
|
Run Whisper ASR on audio. |
|
|
|
|
|
Args: |
|
|
audio_data: Processed audio signal |
|
|
sample_rate: Sample rate |
|
|
|
|
|
Returns: |
|
|
Transcription text |
|
|
""" |
|
|
try: |
|
|
|
|
|
audio_float = audio_data.astype(np.float32) |
|
|
|
|
|
inputs = processor( |
|
|
audio_float, |
|
|
sampling_rate=sample_rate, |
|
|
return_tensors="pt" |
|
|
) |
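        # NOTE: Whisper's feature extractor pads/truncates input to a fixed
        # 30-second window, so anything past ~30 s is dropped from the transcript.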
|
|
|
|
|
|
|
|
input_features = inputs.input_features.to(device=DEVICE, dtype=DTYPE) |
|
|
|
|
|
with torch.no_grad(): |
|
|
generated_ids = model.generate( |
|
|
input_features, |
|
|
max_length=400, |
|
|
language="tr", |
|
|
task="transcribe" |
|
|
) |
|
|
|
|
|
hypothesis = processor.batch_decode( |
|
|
generated_ids, |
|
|
skip_special_tokens=True |
|
|
)[0] |
|
|
|
|
|
logger.info(f"ASR output: {hypothesis}") |
|
|
return hypothesis |
|
|
except Exception as e: |
|
|
logger.error(f"ASR error: {e}") |
|
|
raise ValueError(f"Konuşma tanıma hatası: {e}") |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def vowel_shift_score(transcription: str, profile: Dict[str, Any]) -> float: |
|
|
""" |
|
|
Score vowel shifts in transcription. |
|
|
Enhanced scoring based on phonetic patterns. |
|
|
|
|
|
Args: |
|
|
transcription: ASR transcription |
|
|
profile: Dialect profile |
|
|
|
|
|
Returns: |
|
|
Vowel shift score [0, 1] |
|
|
""" |
|
|
transcription_lower = transcription.lower() |
|
|
shifts = profile["vowel_shifts"] |
|
|
total_weight = sum(shifts.values()) |
|
|
|
|
|
if total_weight == 0: |
|
|
return 0.0 |
|
|
|
|
|
score = 0.0 |
|
|
text_length = len(transcription_lower) |
|
|
|
|
|
for shift_pattern, weight in shifts.items(): |
|
|
if "→" not in shift_pattern: |
|
|
continue |
|
|
|
|
|
source, target = shift_pattern.split("→") |
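        # Elongation targets such as "ee" are matched as doubled vowels, plus
        # "<vowel>ğ" / "<vowel>y" sequences, which Whisper often emits for the
        # same lengthening; plain targets are counted as substrings.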
|
|
|
|
|
|
|
|
|
|
|
if len(target) > 1 and target[0] == target[1]: |
|
|
|
|
|
pattern = target[0] * 2 |
|
|
count = transcription_lower.count(pattern) |
|
|
|
|
|
count += transcription_lower.count(target[0] + "ğ") |
|
|
count += transcription_lower.count(target[0] + "y") |
|
|
else: |
|
|
count = transcription_lower.count(target) |
|
|
|
|
|
|
|
|
normalized_count = count / max(text_length, 1) * 100 |
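        # Frequency per 100 characters keeps scores comparable across
        # transcript lengths.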
|
|
score += normalized_count * weight |
|
|
|
|
|
|
|
|
normalized_score = score / (total_weight * 10 + 1e-6) |
|
|
return min(normalized_score, 1.0) |
|
|
|
|
|
|
|
|
def marker_score(transcription: str, profile: Dict[str, Any]) -> float: |
|
|
""" |
|
|
Score lexical markers in transcription. |
|
|
|
|
|
Args: |
|
|
transcription: ASR transcription |
|
|
profile: Dialect profile |
|
|
|
|
|
Returns: |
|
|
Marker score [0, 1] |
|
|
""" |
|
|
transcription_lower = transcription.lower() |
|
|
markers = profile["markers"] |
|
|
|
|
|
if not markers: |
|
|
return 0.0 |
|
|
|
|
|
matches = sum(1 for marker in markers if marker in transcription_lower) |
|
|
|
|
|
|
|
|
score = matches / len(markers) |
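    # More total marker hits than marker types indicates repeated usage, which
    # earns a mild 1.2x boost (capped at 1.0) below.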
|
|
|
|
|
|
|
|
total_occurrences = sum(transcription_lower.count(marker) for marker in markers) |
|
|
if total_occurrences > len(markers): |
|
|
score = min(score * 1.2, 1.0) |
|
|
|
|
|
return score |
|
|
|
|
|
|
|
|
def prosody_score( |
|
|
audio_data: np.ndarray, |
|
|
sample_rate: int, |
|
|
profile: Dict[str, Any] |
|
|
) -> float: |
|
|
""" |
|
|
Analyze prosody: tempo, pitch characteristics. |
|
|
|
|
|
Args: |
|
|
audio_data: Audio signal |
|
|
sample_rate: Sample rate |
|
|
profile: Dialect profile |
|
|
|
|
|
Returns: |
|
|
Prosody score [0, 1] |
|
|
""" |
|
|
try: |
|
|
|
|
|
audio_normalized = librosa.util.normalize(audio_data) |
|
|
|
|
|
|
|
|
tempo = float(librosa.beat.tempo(y=audio_normalized, sr=sample_rate)[0]) |
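        # librosa's tempo estimator returns musical BPM, used here as a rough
        # proxy for speech rate; on librosa >= 0.10 the call lives at
        # librosa.feature.rhythm.tempo and librosa.beat.tempo is a deprecated alias.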
|
|
|
|
|
|
|
|
pitches, magnitudes = librosa.piptrack( |
|
|
y=audio_normalized, |
|
|
sr=sample_rate, |
|
|
threshold=0.1 |
|
|
) |
|
|
|
|
|
|
|
|
pitch_values = [] |
|
|
for t in range(pitches.shape[1]): |
|
|
index = magnitudes[:, t].argmax() |
|
|
pitch = pitches[index, t] |
|
|
if pitch > 0: |
|
|
pitch_values.append(pitch) |
|
|
|
|
|
avg_pitch = np.mean(pitch_values) if pitch_values else 0.0 |
|
|
pitch_std = np.std(pitch_values) if len(pitch_values) > 1 else 0.0 |
|
|
|
|
|
|
|
|
prosody_type = profile["prosody"] |
|
|
tempo_range = profile.get("tempo_range", (80, 120)) |
|
|
pitch_range_type = profile.get("pitch_range", "neutral") |
|
|
|
|
|
|
|
|
tempo_min, tempo_max = tempo_range |
|
|
if tempo_min <= tempo <= tempo_max: |
|
|
tempo_score = 1.0 |
|
|
else: |
|
|
|
|
|
if tempo < tempo_min: |
|
|
tempo_score = max(0.0, tempo / tempo_min) |
|
|
else: |
|
|
tempo_score = max(0.0, 1.0 - (tempo - tempo_max) / tempo_max) |
|
|
|
|
|
|
|
|
        # Score mean pitch against the profile's expected register; the Hz
        # thresholds are heuristic cut-offs for typical speech F0.
        pitch_score = 0.5
        if pitch_range_type in ("high", "high-oscillating"):
            if avg_pitch > 200:
                pitch_score = 1.0
            elif avg_pitch > 150:
                pitch_score = 0.7
        elif pitch_range_type in ("low", "low-elongated"):
            if avg_pitch < 150:
                pitch_score = 1.0
            elif avg_pitch < 200:
                pitch_score = 0.7
        elif 150 <= avg_pitch <= 250:
            pitch_score = 1.0
|
|
|
|
|
|
|
|
        # Pitch variability only matters for profiles flagged as oscillating.
        is_oscillating = (
            "oscillating" in pitch_range_type or "inişli-çıkışlı" in prosody_type
        )
        oscillation_score = 0.5
        if is_oscillating:
            if pitch_std > 50:
                oscillation_score = 1.0
            elif pitch_std > 30:
                oscillation_score = 0.7

        if is_oscillating:
            final_score = tempo_score * 0.4 + pitch_score * 0.3 + oscillation_score * 0.3
        else:
            final_score = tempo_score * 0.6 + pitch_score * 0.4
|
|
|
|
|
return min(final_score, 1.0) |
|
|
except Exception as e: |
|
|
logger.warning(f"Prosody analysis error: {e}") |
|
|
return 0.5 |
|
|
|
|
|
|
|
|
def dialect_similarity( |
|
|
transcription: str, |
|
|
audio_data: np.ndarray, |
|
|
sample_rate: int |
|
|
) -> Tuple[Dict[str, float], List[Tuple[str, float]]]: |
|
|
""" |
|
|
Calculate dialect similarity scores for all regions. |
|
|
|
|
|
Args: |
|
|
transcription: ASR transcription |
|
|
audio_data: Audio signal |
|
|
sample_rate: Sample rate |
|
|
|
|
|
Returns: |
|
|
Dictionary of scores and sorted predictions |
|
|
""" |
|
|
scores: Dict[str, float] = {} |
|
|
|
|
|
for region, profile in DIALECT_PROFILES.items(): |
|
|
try: |
|
|
vowel_score = vowel_shift_score(transcription, profile) |
|
|
marker_score_val = marker_score(transcription, profile) |
|
|
prosody_score_val = prosody_score(audio_data, sample_rate, profile) |
|
|
|
|
|
|
|
|
combined_score = ( |
|
|
vowel_score * VOWEL_SHIFT_WEIGHT + |
|
|
marker_score_val * MARKER_WEIGHT + |
|
|
prosody_score_val * PROSODY_WEIGHT |
|
|
) |
|
|
|
|
|
scores[region] = round(combined_score, 3) |
|
|
|
|
|
logger.info( |
|
|
f"{region}: vowel={vowel_score:.3f}, " |
|
|
f"marker={marker_score_val:.3f}, " |
|
|
f"prosody={prosody_score_val:.3f}, " |
|
|
f"combined={combined_score:.3f}" |
|
|
) |
|
|
except Exception as e: |
|
|
logger.error(f"Error calculating score for {region}: {e}") |
|
|
scores[region] = 0.0 |
|
|
|
|
|
|
|
|
sorted_predictions = sorted( |
|
|
scores.items(), |
|
|
key=lambda x: x[1], |
|
|
reverse=True |
|
|
) |
|
|
|
|
|
return scores, sorted_predictions |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def plot_region_heatmap( |
|
|
scores: Dict[str, float], |
|
|
highlight_region: Optional[str] = None |
|
|
) -> go.Figure: |
|
|
""" |
|
|
Create an interactive choropleth-style region heatmap for Türkiye dialect scores. |
|
|
""" |
|
|
try: |
|
|
if not scores: |
|
|
raise ValueError("Score verisi yok") |
|
|
|
|
|
df = pd.DataFrame({ |
|
|
"region_name": list(scores.keys()), |
|
|
"score": list(scores.values()), |
|
|
}) |
|
|
|
|
|
min_score = float(df["score"].min()) |
|
|
max_score = float(df["score"].max()) |
|
|
if min_score == max_score: |
|
|
max_score = min_score + 0.01 |
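        # NOTE: Plotly >= 5.24 deprecates px.choropleth_mapbox in favor of
        # px.choropleth_map; the mapbox variant is kept here for compatibility
        # with older Plotly releases.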
|
|
|
|
|
fig = px.choropleth_mapbox( |
|
|
df, |
|
|
geojson=TURKEY_REGIONS_GEOJSON, |
|
|
locations="region_name", |
|
|
featureidkey="properties.name", |
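            # Polygons in TURKEY_REGIONS_GEOJSON must expose a "name" property
            # that exactly matches the DIALECT_PROFILES region keys.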
|
|
color="score", |
|
|
color_continuous_scale="OrRd", |
|
|
range_color=(min_score, max_score), |
|
|
mapbox_style="carto-positron", |
|
|
zoom=4.5, |
|
|
center={"lat": 39.0, "lon": 35.0}, |
|
|
opacity=0.7, |
|
|
labels={"score": "Benzerlik"}, |
|
|
) |
|
|
|
|
|
fig.update_traces(marker_line_width=0.5, marker_line_color="white") |
|
|
|
|
|
if highlight_region and highlight_region in df["region_name"].values: |
|
|
highlight_df = df[df["region_name"] == highlight_region] |
|
|
fig.add_choroplethmapbox( |
|
|
geojson=TURKEY_REGIONS_GEOJSON, |
|
|
locations=highlight_df["region_name"], |
|
|
z=np.ones(len(highlight_df)), |
|
|
featureidkey="properties.name", |
|
|
colorscale=[[0, "rgba(0,0,0,0)"], [1, "rgba(0,0,0,0)"]], |
|
|
showscale=False, |
|
|
marker_opacity=0, |
|
|
marker_line_width=3, |
|
|
marker_line_color="black", |
|
|
hovertext=highlight_df["region_name"], |
|
|
name="Tahmin", |
|
|
) |
|
|
|
|
|
fig.add_annotation( |
|
|
text=f"🗣 Tahmin: {highlight_region}", |
|
|
x=0.5, |
|
|
y=0.02, |
|
|
xref="paper", |
|
|
yref="paper", |
|
|
showarrow=False, |
|
|
bgcolor="white", |
|
|
bordercolor="black", |
|
|
borderwidth=1, |
|
|
font=dict(size=14), |
|
|
) |
|
|
|
|
|
fig.update_layout( |
|
|
margin=dict(l=10, r=10, t=40, b=10), |
|
|
height=600, |
|
|
coloraxis_colorbar=dict(title="Benzerlik"), |
|
|
) |
|
|
|
|
|
return fig |
|
|
except Exception as e: |
|
|
logger.error(f"Error creating heatmap: {e}") |
|
|
fig = go.Figure() |
|
|
fig.update_layout( |
|
|
title="Harita yüklenemedi", |
|
|
height=600 |
|
|
) |
|
|
return fig |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def analyze_and_reply( |
|
|
audio_path: Optional[str] |
|
|
) -> Tuple[str, str, str, Optional[str], go.Figure]: |
|
|
""" |
|
|
Full processing pipeline: audio → ASR → dialect analysis → TTS reply. |
|
|
""" |
|
|
def build_empty_fig(title: str = "Harita yüklenemedi") -> go.Figure: |
|
|
fig = go.Figure() |
|
|
fig.update_layout(title=title, height=600) |
|
|
return fig |
|
|
|
|
|
logger.info(f"Received audio_path: {audio_path}") |
|
|
|
|
|
if audio_path is None: |
|
|
logger.warning("Audio input is None.") |
|
|
empty_fig = build_empty_fig() |
|
|
return "Ses alınamadı. Lütfen tekrar deneyin.", "", "", None, empty_fig |
|
|
|
|
|
|
|
|
if not os.path.exists(audio_path): |
|
|
logger.error(f"Audio file does not exist: {audio_path}") |
|
|
empty_fig = build_empty_fig() |
|
|
return f"Ses dosyası bulunamadı: {audio_path}", "", "", None, empty_fig |
|
|
|
|
|
try: |
|
|
logger.info(f"Reading audio file: {audio_path}") |
|
|
audio_data, sample_rate = sf.read(audio_path) |
|
|
logger.info(f"Audio file read successfully. Duration: {len(audio_data)/sample_rate:.2f}s, Sample rate: {sample_rate}Hz") |
|
|
if audio_data.ndim > 1: |
|
|
audio_data = audio_data.T |
|
|
audio_data = librosa.to_mono(audio_data) |
|
|
audio_data = np.asarray(audio_data, dtype=np.float32) |
|
|
except Exception as e: |
|
|
logger.error(f"Error reading audio file: {e}") |
|
|
empty_fig = build_empty_fig() |
|
|
return f"Ses dosyası okunamadı: {e}", "", "", None, empty_fig |
|
|
|
|
|
try: |
|
|
processed_audio, processed_sr = process_audio(audio_data, sample_rate) |
|
|
validate_audio(processed_audio, processed_sr) |
|
|
except ValueError as e: |
|
|
logger.error(f"Audio validation error: {e}") |
|
|
empty_fig = build_empty_fig() |
|
|
return str(e), "", "", None, empty_fig |
|
|
|
|
|
try: |
|
|
transcript = run_asr(processed_audio, processed_sr) |
|
|
logger.info(f"ASR transcript: {transcript}") |
|
|
except ValueError as e: |
|
|
logger.error(f"ASR error: {e}") |
|
|
empty_fig = build_empty_fig() |
|
|
return str(e), "", "", None, empty_fig |
|
|
|
|
|
|
|
|
similarity_scores, sorted_predictions = dialect_similarity( |
|
|
transcript, processed_audio, processed_sr |
|
|
) |
|
|
|
|
|
|
|
|
embedding_region, embedding_scores = predict_dialect(audio_path) |
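    # Prediction priority: transcription-based similarity first, embedding
    # similarity as backup, then a uniform fallback when both are empty.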
|
|
|
|
|
|
|
|
    # sorted_predictions is non-empty whenever dialect_similarity produced scores.
    if sorted_predictions:
|
|
|
|
|
predicted_region = sorted_predictions[0][0] |
|
|
scores = similarity_scores |
|
|
top_score = sorted_predictions[0][1] |
|
|
logger.info(f"Using transcription-based prediction: {predicted_region} (score: {top_score:.4f})") |
|
|
|
|
|
|
|
|
if len(sorted_predictions) >= 3: |
|
|
logger.info(f"Top 3 predictions: {[(r, f'{s:.4f}') for r, s in sorted_predictions[:3]]}") |
|
|
elif embedding_scores and embedding_region != "Bilinmiyor" and max(embedding_scores.values()) > 0.01: |
|
|
|
|
|
predicted_region = embedding_region |
|
|
scores = embedding_scores |
|
|
logger.info(f"Using embedding-based prediction: {predicted_region} (score: {max(embedding_scores.values()):.4f})") |
|
|
    else:
        # Both branches above came back empty (dialect_similarity always scores
        # every region, so this only happens when DIALECT_PROFILES itself is
        # empty); fall back to a uniform low-confidence guess.
        predicted_region = next(iter(DIALECT_PROFILES), "Bilinmiyor")
        scores = {region: 0.1 for region in DIALECT_PROFILES}
        logger.error(f"All prediction methods failed, using fallback: {predicted_region}")
|
|
|
|
|
reply_text = generate_reply_text(predicted_region) |
|
|
    reply_audio_path = synthesize_elevenlabs(reply_text)  # None when TTS is unavailable
|
|
heatmap_fig = plot_region_heatmap(scores, highlight_region=predicted_region if scores else None) |
|
|
|
|
|
return ( |
|
|
transcript, |
|
|
predicted_region, |
|
|
reply_text, |
|
|
reply_audio_path, |
|
|
heatmap_fig |
|
|
) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
CSS = """ |
|
|
* { |
|
|
box-sizing: border-box; |
|
|
margin: 0; |
|
|
padding: 0; |
|
|
} |
|
|
|
|
|
@keyframes float { |
|
|
0%, 100% { transform: translateY(0px); } |
|
|
50% { transform: translateY(-10px); } |
|
|
} |
|
|
|
|
|
@keyframes shimmer { |
|
|
0% { background-position: -1000px 0; } |
|
|
100% { background-position: 1000px 0; } |
|
|
} |
|
|
|
|
|
@keyframes pulse { |
|
|
0%, 100% { opacity: 1; } |
|
|
50% { opacity: 0.7; } |
|
|
} |
|
|
|
|
|
body { |
|
|
background: |
|
|
radial-gradient(circle at 20% 50%, rgba(120, 119, 198, 0.15) 0%, transparent 50%), |
|
|
radial-gradient(circle at 80% 80%, rgba(255, 119, 198, 0.1) 0%, transparent 50%), |
|
|
radial-gradient(circle at 40% 20%, rgba(99, 102, 241, 0.1) 0%, transparent 50%), |
|
|
linear-gradient(135deg, #F8F9FA 0%, #E9ECEF 50%, #F1F3F5 100%) !important; |
|
|
font-family: -apple-system, BlinkMacSystemFont, "SF Pro Display", "SF Pro Text", "Segoe UI", sans-serif; |
|
|
-webkit-font-smoothing: antialiased; |
|
|
-moz-osx-font-smoothing: grayscale; |
|
|
min-height: 100vh; |
|
|
position: relative; |
|
|
overflow-x: hidden; |
|
|
} |
|
|
|
|
|
body::before { |
|
|
content: ''; |
|
|
position: fixed; |
|
|
top: 0; |
|
|
left: 0; |
|
|
right: 0; |
|
|
bottom: 0; |
|
|
background: |
|
|
radial-gradient(circle at 20% 30%, rgba(99, 102, 241, 0.08) 0%, transparent 50%), |
|
|
radial-gradient(circle at 80% 70%, rgba(168, 85, 247, 0.06) 0%, transparent 50%); |
|
|
pointer-events: none; |
|
|
z-index: 0; |
|
|
} |
|
|
|
|
|
.gradio-container { |
|
|
background: transparent !important; |
|
|
max-width: 1500px !important; |
|
|
margin: 0 auto !important; |
|
|
padding: 60px 30px !important; |
|
|
position: relative; |
|
|
z-index: 1; |
|
|
} |
|
|
|
|
|
h1 { |
|
|
font-weight: 800 !important; |
|
|
letter-spacing: -2.5px !important; |
|
|
color: #1D1D1F !important; |
|
|
margin: 0 !important; |
|
|
background: linear-gradient(135deg, #1D1D1F 0%, #4A5568 50%, #1D1D1F 100%) !important; |
|
|
background-size: 200% auto !important; |
|
|
-webkit-background-clip: text !important; |
|
|
-webkit-text-fill-color: #1D1D1F !important; |
|
|
background-clip: text !important; |
|
|
animation: shimmer 3s linear infinite !important; |
|
|
opacity: 1 !important; |
|
|
z-index: 10 !important; |
|
|
position: relative !important; |
|
|
visibility: visible !important; |
|
|
} |
|
|
|
|
|
.card { |
|
|
background: rgba(255, 255, 255, 0.85) !important; |
|
|
backdrop-filter: blur(30px) saturate(180%) !important; |
|
|
-webkit-backdrop-filter: blur(30px) saturate(180%) !important; |
|
|
padding: 28px !important; |
|
|
border-radius: 20px !important; |
|
|
border: 1px solid rgba(0, 0, 0, 0.08) !important; |
|
|
margin-bottom: 20px !important; |
|
|
box-shadow: |
|
|
0 8px 32px rgba(0, 0, 0, 0.06), |
|
|
0 4px 16px rgba(0, 0, 0, 0.04), |
|
|
0 2px 8px rgba(0, 0, 0, 0.03), |
|
|
inset 0 1px 0 rgba(255, 255, 255, 0.9) !important; |
|
|
transition: all 0.3s cubic-bezier(0.4, 0, 0.2, 1) !important; |
|
|
position: relative; |
|
|
overflow: hidden; |
|
|
} |
|
|
|
|
|
.card::before { |
|
|
content: ''; |
|
|
position: absolute; |
|
|
top: 0; |
|
|
left: -100%; |
|
|
width: 100%; |
|
|
height: 100%; |
|
|
background: linear-gradient(90deg, transparent, rgba(255, 255, 255, 0.4), transparent); |
|
|
transition: left 0.5s; |
|
|
} |
|
|
|
|
|
.card:hover::before { |
|
|
left: 100%; |
|
|
} |
|
|
|
|
|
.card:hover { |
|
|
transform: translateY(-4px) scale(1.01) !important; |
|
|
box-shadow: |
|
|
0 28px 80px rgba(0, 0, 0, 0.12), |
|
|
0 12px 32px rgba(0, 0, 0, 0.08), |
|
|
0 4px 12px rgba(0, 0, 0, 0.06), |
|
|
inset 0 1px 0 rgba(255, 255, 255, 1), |
|
|
inset 0 -1px 0 rgba(255, 255, 255, 0.6) !important; |
|
|
border-color: rgba(255, 255, 255, 1) !important; |
|
|
} |
|
|
|
|
|
.label { |
|
|
font-weight: 700 !important; |
|
|
color: #1D1D1F !important; |
|
|
margin-bottom: 14px !important; |
|
|
    font-size: 12px !important;
    letter-spacing: -0.3px !important;
    text-transform: uppercase;
    opacity: 0.8;
|
|
} |
|
|
|
|
|
.textbox textarea, |
|
|
.textbox input, |
|
|
.dropdown select { |
|
|
background: rgba(255, 255, 255, 0.95) !important; |
|
|
backdrop-filter: blur(20px) saturate(180%) !important; |
|
|
-webkit-backdrop-filter: blur(20px) saturate(180%) !important; |
|
|
border: 1.5px solid rgba(0, 0, 0, 0.06) !important; |
|
|
border-radius: 16px !important; |
|
|
color: #1D1D1F !important; |
|
|
padding: 16px 20px !important; |
|
|
font-size: 15px !important; |
|
|
font-weight: 500 !important; |
|
|
transition: all 0.3s cubic-bezier(0.4, 0, 0.2, 1) !important; |
|
|
box-shadow: |
|
|
0 4px 12px rgba(0, 0, 0, 0.04), |
|
|
inset 0 1px 2px rgba(0, 0, 0, 0.02) !important; |
|
|
} |
|
|
|
|
|
.textbox:focus-within, |
|
|
.dropdown:focus-within { |
|
|
border-color: #007AFF !important; |
|
|
box-shadow: |
|
|
0 8px 24px rgba(0, 122, 255, 0.2), |
|
|
0 4px 12px rgba(0, 122, 255, 0.15), |
|
|
inset 0 1px 2px rgba(0, 122, 255, 0.1) !important; |
|
|
transform: translateY(-1px); |
|
|
} |
|
|
|
|
|
button.primary { |
|
|
background: linear-gradient(135deg, #007AFF 0%, #0051D5 50%, #007AFF 100%) !important; |
|
|
background-size: 200% auto !important; |
|
|
border: none !important; |
|
|
border-radius: 18px !important; |
|
|
font-weight: 700 !important; |
|
|
padding: 18px 40px !important; |
|
|
font-size: 17px !important; |
|
|
color: white !important; |
|
|
letter-spacing: -0.2px !important; |
|
|
box-shadow: |
|
|
0 8px 24px rgba(0, 122, 255, 0.4), |
|
|
0 4px 12px rgba(0, 122, 255, 0.3), |
|
|
inset 0 1px 0 rgba(255, 255, 255, 0.3), |
|
|
inset 0 -1px 0 rgba(0, 0, 0, 0.1) !important; |
|
|
transition: all 0.3s cubic-bezier(0.34, 1.56, 0.64, 1) !important; |
|
|
cursor: pointer !important; |
|
|
position: relative; |
|
|
overflow: hidden; |
|
|
text-transform: none !important; |
|
|
} |
|
|
|
|
|
button.primary::before { |
|
|
content: ''; |
|
|
position: absolute; |
|
|
top: 0; |
|
|
left: -100%; |
|
|
width: 100%; |
|
|
height: 100%; |
|
|
background: linear-gradient(90deg, transparent, rgba(255, 255, 255, 0.3), transparent); |
|
|
transition: left 0.5s; |
|
|
} |
|
|
|
|
|
button.primary:hover::before { |
|
|
left: 100%; |
|
|
} |
|
|
|
|
|
button.primary:hover { |
|
|
transform: translateY(-3px) scale(1.02) !important; |
|
|
box-shadow: |
|
|
0 12px 32px rgba(0, 122, 255, 0.5), |
|
|
0 6px 16px rgba(0, 122, 255, 0.4), |
|
|
inset 0 1px 0 rgba(255, 255, 255, 0.4), |
|
|
inset 0 -1px 0 rgba(0, 0, 0, 0.15) !important; |
|
|
background-position: right center !important; |
|
|
} |
|
|
|
|
|
button.primary:active { |
|
|
transform: translateY(-1px) scale(1.01) !important; |
|
|
box-shadow: |
|
|
0 4px 16px rgba(0, 122, 255, 0.4), |
|
|
inset 0 1px 0 rgba(255, 255, 255, 0.2) !important; |
|
|
} |
|
|
|
|
|
.json { |
|
|
font-family: "SF Mono", "Monaco", "Menlo", "Courier New", monospace !important; |
|
|
font-size: 13px !important; |
|
|
background: rgba(248, 249, 250, 0.9) !important; |
|
|
backdrop-filter: blur(20px) saturate(180%) !important; |
|
|
-webkit-backdrop-filter: blur(20px) saturate(180%) !important; |
|
|
border: 1px solid rgba(0, 0, 0, 0.05) !important; |
|
|
border-radius: 16px !important; |
|
|
padding: 24px !important; |
|
|
color: #1D1D1F !important; |
|
|
line-height: 1.7 !important; |
|
|
box-shadow: |
|
|
inset 0 2px 8px rgba(0, 0, 0, 0.03), |
|
|
inset 0 1px 2px rgba(0, 0, 0, 0.02) !important; |
|
|
} |
|
|
|
|
|
.markdown { |
|
|
color: #1D1D1F !important; |
|
|
} |
|
|
|
|
|
.markdown * { |
|
|
visibility: visible !important; |
|
|
opacity: 1 !important; |
|
|
display: block !important; |
|
|
} |
|
|
|
|
|
.markdown div { |
|
|
display: block !important; |
|
|
visibility: visible !important; |
|
|
opacity: 1 !important; |
|
|
color: inherit !important; |
|
|
} |
|
|
|
|
|
.markdown h1, .header-markdown h1, .main-title { |
|
|
color: #1D1D1F !important; |
|
|
margin-bottom: 16px !important; |
|
|
margin-top: 50px !important; |
|
|
font-size: 3.5rem !important; |
|
|
font-weight: 800 !important; |
|
|
letter-spacing: -2px !important; |
|
|
line-height: 1.2 !important; |
|
|
text-shadow: 0 2px 8px rgba(0,0,0,0.1) !important; |
|
|
-webkit-text-fill-color: #1D1D1F !important; |
|
|
background: none !important; |
|
|
background-image: none !important; |
|
|
opacity: 1 !important; |
|
|
z-index: 10 !important; |
|
|
position: relative !important; |
|
|
visibility: visible !important; |
|
|
display: block !important; |
|
|
text-align: center !important; |
|
|
} |
|
|
|
|
|
.header-markdown { |
|
|
text-align: center !important; |
|
|
} |
|
|
|
|
|
.header-markdown p { |
|
|
color: #6E6E73 !important; |
|
|
font-size: 1.15rem !important; |
|
|
margin-top: 8px !important; |
|
|
opacity: 0.9 !important; |
|
|
} |
|
|
|
|
|
.markdown h1 span { |
|
|
color: #1D1D1F !important; |
|
|
-webkit-text-fill-color: #1D1D1F !important; |
|
|
background: none !important; |
|
|
display: inline-block !important; |
|
|
} |
|
|
|
|
|
.markdown p {
    display: block !important;
    visibility: visible !important;
    opacity: 1 !important;
    color: #6E6E73 !important;
    margin: 0 !important;
    font-size: 1.15rem;
    font-weight: 400;
    line-height: 1.6;
    letter-spacing: -0.2px;
}
|
|
|
|
|
.instruction-text { |
|
|
display: block !important; |
|
|
visibility: visible !important; |
|
|
text-align: center !important; |
|
|
margin-top: -20px !important; |
|
|
margin-bottom: 40px !important; |
|
|
color: #6E6E73 !important; |
|
|
font-size: 1.1rem !important; |
|
|
opacity: 0.9 !important; |
|
|
padding: 0 20px !important; |
|
|
} |
|
|
|
|
|
.instruction-text p { |
|
|
display: block !important; |
|
|
visibility: visible !important; |
|
|
opacity: 1 !important; |
|
|
color: #6E6E73 !important; |
|
|
margin: 0 !important; |
|
|
} |
|
|
|
|
|
.header-container { |
|
|
display: block !important; |
|
|
visibility: visible !important; |
|
|
opacity: 1 !important; |
|
|
} |
|
|
|
|
|
.header-container h1 { |
|
|
display: block !important; |
|
|
visibility: visible !important; |
|
|
opacity: 1 !important; |
|
|
} |
|
|
|
|
|
.header-container p { |
|
|
display: block !important; |
|
|
visibility: visible !important; |
|
|
opacity: 1 !important; |
|
|
} |
|
|
|
|
|
/* HTML component styles */ |
|
|
.html-component, .html-component * { |
|
|
display: block !important; |
|
|
visibility: visible !important; |
|
|
opacity: 1 !important; |
|
|
} |
|
|
|
|
|
.html-component h1, .html-component .main-title { |
|
|
color: #1D1D1F !important; |
|
|
-webkit-text-fill-color: #1D1D1F !important; |
|
|
background: none !important; |
|
|
background-image: none !important; |
|
|
display: block !important; |
|
|
} |
|
|
|
|
|
.html-component p { |
|
|
display: block !important; |
|
|
visibility: visible !important; |
|
|
color: #6E6E73 !important; |
|
|
} |
|
|
|
|
|
|
|
|
|
|
.audio-component { |
|
|
background: rgba(255, 255, 255, 0.95) !important; |
|
|
backdrop-filter: blur(30px) saturate(200%) !important; |
|
|
-webkit-backdrop-filter: blur(30px) saturate(200%) !important; |
|
|
border-radius: 20px !important; |
|
|
border: 1.5px solid rgba(255, 255, 255, 0.8) !important; |
|
|
padding: 20px !important; |
|
|
box-shadow: |
|
|
0 8px 24px rgba(0, 0, 0, 0.06), |
|
|
0 4px 12px rgba(0, 0, 0, 0.04), |
|
|
inset 0 1px 0 rgba(255, 255, 255, 0.9) !important; |
|
|
transition: all 0.3s ease !important; |
|
|
} |
|
|
|
|
|
.audio-component:hover { |
|
|
box-shadow: |
|
|
0 12px 32px rgba(0, 0, 0, 0.08), |
|
|
0 6px 16px rgba(0, 0, 0, 0.06), |
|
|
inset 0 1px 0 rgba(255, 255, 255, 1) !important; |
|
|
} |
|
|
|
|
|
/* Ultra smooth scrollbar */ |
|
|
::-webkit-scrollbar { |
|
|
width: 10px; |
|
|
height: 10px; |
|
|
} |
|
|
|
|
|
::-webkit-scrollbar-track { |
|
|
background: rgba(0, 0, 0, 0.02); |
|
|
border-radius: 10px; |
|
|
} |
|
|
|
|
|
::-webkit-scrollbar-thumb { |
|
|
background: linear-gradient(135deg, rgba(0, 122, 255, 0.3), rgba(0, 81, 213, 0.4)); |
|
|
border-radius: 10px; |
|
|
border: 2px solid transparent; |
|
|
background-clip: padding-box; |
|
|
} |
|
|
|
|
|
::-webkit-scrollbar-thumb:hover { |
|
|
background: linear-gradient(135deg, rgba(0, 122, 255, 0.5), rgba(0, 81, 213, 0.6)); |
|
|
background-clip: padding-box; |
|
|
} |
|
|
|
|
|
/* Loading animation */ |
|
|
@keyframes spin { |
|
|
from { transform: rotate(0deg); } |
|
|
to { transform: rotate(360deg); } |
|
|
} |
|
|
|
|
|
/* Enhanced focus states */ |
|
|
*:focus-visible { |
|
|
outline: 2px solid #007AFF; |
|
|
outline-offset: 2px; |
|
|
border-radius: 4px; |
|
|
} |
|
|
""" |
|
|
|
|
|
|
|
|
def build_ui() -> gr.Blocks: |
|
|
""" |
|
|
Build Gradio UI with Apple minimal white + smooth glass design. |
|
|
|
|
|
Returns: |
|
|
Gradio Blocks interface |
|
|
""" |
|
|
with gr.Blocks( |
|
|
css=CSS, |
|
|
fill_height=True, |
|
|
theme=gr.themes.Soft() |
|
|
) as demo: |
|
|
|
|
|
gr.Markdown( |
|
|
""" |
|
|
# 🇹🇷 Dialect Intelligence Engine |
|
|
|
|
|
            Powered by Whisper Large-v3
|
|
""", |
|
|
elem_classes="header-markdown" |
|
|
) |
|
|
|
|
|
gr.Markdown( |
|
|
""" |
|
|
Mikrofona bas, doğal bir şekilde konuş. Sistem şiveni analiz edip seni haritada işaretlesin ve AI sesiyle cevap versin. |
|
|
""", |
|
|
elem_classes="instruction-text" |
|
|
) |
|
|
|
|
|
with gr.Row(equal_height=False): |
|
|
with gr.Column(scale=1, min_width=400): |
|
|
audio_input = gr.Audio( |
|
|
sources=["microphone", "upload"], |
|
|
type="filepath", |
|
|
label="🎤 Mikrofona basın, konuşun, kaydı durdurun", |
|
|
show_label=True, |
|
|
interactive=True, |
|
|
elem_classes="card" |
|
|
) |
|
|
analyze_button = gr.Button( |
|
|
"🔍 Analiz Et ve Şive Tahmini Yap", |
|
|
variant="primary", |
|
|
elem_classes="primary", |
|
|
visible=True, |
|
|
scale=1 |
|
|
) |
|
|
gr.Markdown( |
|
|
"📝 Ses kaydını tamamladıktan sonra butona tıklayın", |
|
|
elem_classes="instruction-text" |
|
|
) |
|
|
|
|
|
with gr.Column(scale=2, min_width=600): |
|
|
transcript_output = gr.Textbox( |
|
|
label="Transcript", |
|
|
lines=4, |
|
|
interactive=False, |
|
|
placeholder="Konuşmanı bekliyorum...", |
|
|
elem_classes="card" |
|
|
) |
|
|
|
|
|
with gr.Row(): |
|
|
predicted_dialect = gr.Textbox( |
|
|
label="Tahmin Edilen Bölge", |
|
|
interactive=False, |
|
|
lines=2, |
|
|
elem_classes="card" |
|
|
) |
|
|
reply_text_output = gr.Textbox( |
|
|
label="Model Cevabı (Metin)", |
|
|
interactive=False, |
|
|
lines=2, |
|
|
elem_classes="card" |
|
|
) |
|
|
|
|
|
reply_audio_output = gr.Audio( |
|
|
label="Model Cevabı (Ses)", |
|
|
type="filepath", |
|
|
interactive=False, |
|
|
autoplay=True, |
|
|
elem_classes="card" |
|
|
) |
|
|
|
|
|
region_map = gr.Plot( |
|
|
label="Bölgesel Harita Isı Dağılımı", |
|
|
elem_classes="card" |
|
|
) |
|
|
|
|
|
def build_empty_fig_ui(): |
|
|
"""Build empty figure for UI""" |
|
|
fig = go.Figure() |
|
|
fig.update_layout(title="Harita yüklenemedi", height=600) |
|
|
return fig |
|
|
|
|
|
def analyze_and_reply_with_autoplay(audio_path): |
|
|
"""Wrapper to ensure audio autoplays after generation""" |
|
|
logger.info(f"analyze_and_reply_with_autoplay called with audio_path: {audio_path}") |
|
|
if audio_path is None: |
|
|
logger.warning("audio_path is None in wrapper") |
|
|
empty_fig = build_empty_fig_ui() |
|
|
return "", "", "", None, empty_fig |
|
|
result = analyze_and_reply(audio_path) |
|
|
|
|
|
return result |
|
|
|
|
|
|
|
|
analyze_button.click( |
|
|
fn=analyze_and_reply_with_autoplay, |
|
|
inputs=audio_input, |
|
|
outputs=[ |
|
|
transcript_output, |
|
|
predicted_dialect, |
|
|
reply_text_output, |
|
|
reply_audio_output, |
|
|
region_map |
|
|
] |
|
|
) |
|
|
|
|
|
|
|
|
audio_input.change( |
|
|
fn=analyze_and_reply_with_autoplay, |
|
|
inputs=audio_input, |
|
|
outputs=[ |
|
|
transcript_output, |
|
|
predicted_dialect, |
|
|
reply_text_output, |
|
|
reply_audio_output, |
|
|
region_map |
|
|
] |
|
|
) |
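        # Both the button click and audio_input.change run the same pipeline:
        # stopping a recording triggers analysis automatically, and the button
        # offers an explicit re-run (so a change followed by a click analyzes twice).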
|
|
|
|
|
|
|
|
demo.load( |
|
|
fn=None, |
|
|
js=""" |
|
|
function() { |
|
|
// Auto-play audio when it's updated |
|
|
const observer = new MutationObserver(function(mutations) { |
|
|
mutations.forEach(function(mutation) { |
|
|
mutation.addedNodes.forEach(function(node) { |
|
|
if (node.nodeType === 1) { |
|
|
const audio = node.querySelector('audio'); |
|
|
if (audio && audio.src && !audio.hasAttribute('data-autoplayed')) { |
|
|
audio.setAttribute('data-autoplayed', 'true'); |
|
|
audio.play().catch(e => console.log('Autoplay prevented:', e)); |
|
|
} |
|
|
} |
|
|
}); |
|
|
}); |
|
|
}); |
|
|
|
|
|
observer.observe(document.body, { |
|
|
childList: true, |
|
|
subtree: true |
|
|
}); |
|
|
} |
|
|
""" |
|
|
) |
|
|
|
|
|
|
|
|
reply_audio_output.change( |
|
|
fn=None, |
|
|
inputs=None, |
|
|
outputs=None, |
|
|
js=""" |
|
|
function() { |
|
|
setTimeout(function() { |
|
|
// Find the audio element by looking for the reply audio component |
|
|
const labels = Array.from(document.querySelectorAll('label')); |
|
|
const replyLabel = labels.find(label => |
|
|
label.textContent && label.textContent.includes('Model Cevabı (Ses)') |
|
|
); |
|
|
|
|
|
if (replyLabel) { |
|
|
const audioContainer = replyLabel.closest('.card') || replyLabel.parentElement; |
|
|
const audioElement = audioContainer ? audioContainer.querySelector('audio') : null; |
|
|
|
|
|
if (audioElement && audioElement.src) { |
|
|
// Reset and play |
|
|
audioElement.currentTime = 0; |
|
|
const playPromise = audioElement.play(); |
|
|
if (playPromise !== undefined) { |
|
|
playPromise.catch(function(error) { |
|
|
console.log('Autoplay prevented by browser:', error); |
|
|
}); |
|
|
} |
|
|
} |
|
|
} |
|
|
}, 800); // Wait for audio to be fully loaded |
|
|
return []; |
|
|
} |
|
|
""" |
|
|
) |
|
|
|
|
|
return demo |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
demo = build_ui() |
|
|
|
|
|
if __name__ == "__main__": |
|
|
demo.launch( |
|
|
server_name="0.0.0.0", |
|
|
server_port=7860, |
|
|
share=False, |
|
|
ssr_mode=False |
|
|
) |
|
|
|
|
|
|
|
|
|