update 11labs
app.py CHANGED
@@ -1,10 +1,3 @@
-"""
-Dialect Intelligence Engine
-Author: Nisa Çayır
-Core: Meta Omnilingual ASR + Whisper v3 + Türkçe Bölgesel Fonetik Analiz + ElevenLabs AI Dialog
-Focus: Turkish dialects: vowel shifts, markers, prosody, phonetic signatures
-"""
-
 # =========================================
 # ENV FIXES
 # =========================================
@@ -73,11 +66,11 @@ logger.info(f"Using device: {DEVICE}, dtype: {DTYPE}")
 # MODEL INITIALIZATION
 # =========================================
 try:
-    …
+    processor = AutoProcessor.from_pretrained(MODEL_ID)
 
-    …
-    …
-    …
+    model = AutoModelForSpeechSeq2Seq.from_pretrained(
+        MODEL_ID,
+        torch_dtype=DTYPE
     )
 
     model = model.to(DEVICE)
@@ -445,10 +438,10 @@ def run_asr(audio_data: np.ndarray, sample_rate: int) -> str:
     # Ensure audio is float32 (Whisper expects fp32 input)
     audio_float = audio_data.astype(np.float32)
 
-    …
+    inputs = processor(
         audio_float,
         sampling_rate=sample_rate,
-    …
+        return_tensors="pt"
     )
 
     # Move to device and cast to target dtype (fp16 on GPU, fp32 on CPU)
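Note: with `return_tensors="pt"` in place, the processor output feeds `model.generate` directly. A minimal, self-contained sketch of the load-and-transcribe path these two hunks set up; the concrete MODEL_ID value and the generate/decode steps are assumptions, since they fall outside this diff:

import numpy as np
import torch
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq

MODEL_ID = "openai/whisper-large-v3"  # assumed; app.py defines its own MODEL_ID
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
DTYPE = torch.float16 if DEVICE == "cuda" else torch.float32

processor = AutoProcessor.from_pretrained(MODEL_ID)
model = AutoModelForSpeechSeq2Seq.from_pretrained(MODEL_ID, torch_dtype=DTYPE).to(DEVICE)

def run_asr(audio_data: np.ndarray, sample_rate: int) -> str:
    # Whisper's feature extractor expects float32 PCM at 16 kHz
    audio_float = audio_data.astype(np.float32)
    inputs = processor(audio_float, sampling_rate=sample_rate, return_tensors="pt")
    # Move to device and cast to the model dtype (fp16 on GPU, fp32 on CPU)
    features = inputs.input_features.to(DEVICE, dtype=DTYPE)
    with torch.no_grad():
        predicted_ids = model.generate(features)
    return processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]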
@@ -682,7 +675,7 @@ def dialect_similarity(
 
         scores[region] = round(combined_score, 3)
 
-        logger.…
+        logger.info(
             f"{region}: vowel={vowel_score:.3f}, "
             f"marker={marker_score_val:.3f}, "
             f"prosody={prosody_score_val:.3f}, "
@@ -835,7 +828,47 @@ def analyze_and_reply(
         empty_fig = build_empty_fig()
         return str(e), "", "", None, empty_fig
 
-    …
+    # Use transcription-based dialect similarity analysis
+    similarity_scores, sorted_predictions = dialect_similarity(
+        transcript, processed_audio, processed_sr
+    )
+
+    # Also try embedding-based prediction as fallback
+    embedding_region, embedding_scores = predict_dialect(audio_path)
+
+    # Always use transcription-based prediction if available (it should always work)
+    if similarity_scores and sorted_predictions and len(sorted_predictions) > 0:
+        # Use transcription-based prediction
+        predicted_region = sorted_predictions[0][0]
+        scores = similarity_scores
+        top_score = sorted_predictions[0][1]
+        logger.info(f"Using transcription-based prediction: {predicted_region} (score: {top_score:.4f})")
+
+        # Log top 3 predictions for debugging
+        if len(sorted_predictions) >= 3:
+            logger.info(f"Top 3 predictions: {[(r, f'{s:.4f}') for r, s in sorted_predictions[:3]]}")
+    elif embedding_scores and embedding_region != "Bilinmiyor" and max(embedding_scores.values()) > 0.01:
+        # Fallback to embedding-based
+        predicted_region = embedding_region
+        scores = embedding_scores
+        logger.info(f"Using embedding-based prediction: {predicted_region} (score: {max(embedding_scores.values()):.4f})")
+    else:
+        # Last resort: ensure we always return a region
+        if similarity_scores and sorted_predictions and len(sorted_predictions) > 0:
+            predicted_region = sorted_predictions[0][0]
+            scores = similarity_scores
+            logger.warning(f"Using transcription-based with low scores: {predicted_region} (score: {sorted_predictions[0][1]:.4f})")
+        elif similarity_scores:
+            # Use first region from scores even if sorted_predictions is empty
+            predicted_region = max(similarity_scores, key=similarity_scores.get)
+            scores = similarity_scores
+            logger.warning(f"Using first region from scores: {predicted_region}")
+        else:
+            # Absolute last resort: use first region from DIALECT_PROFILES
+            predicted_region = list(DIALECT_PROFILES.keys())[0] if DIALECT_PROFILES else "Bilinmiyor"
+            scores = {region: 0.1 for region in DIALECT_PROFILES.keys()} if DIALECT_PROFILES else {}
+            logger.error(f"All prediction methods failed, using fallback: {predicted_region}")
+
     reply_text = generate_reply_text(predicted_region)
     reply_audio_path = synthesize_elevenlabs(reply_text) or None
     heatmap_fig = plot_region_heatmap(scores, highlight_region=predicted_region if scores else None)
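Note on the new fallback cascade: the inner `if similarity_scores and sorted_predictions ...` under the final `else` can never be true there, since the outer `if` already claimed exactly that case, so the effective order is transcription scores, then embeddings, then a flat prior. A condensed sketch of the same decision logic; the function name and signature are illustrative, not from app.py:

def pick_region(similarity_scores, sorted_predictions, embedding_region, embedding_scores, profiles):
    # 1) Transcription-based scores win whenever they exist
    if sorted_predictions:
        return sorted_predictions[0][0], similarity_scores
    # 2) Embedding-based prediction, ignoring unknowns and near-zero scores
    if embedding_scores and embedding_region != "Bilinmiyor" and max(embedding_scores.values()) > 0.01:
        return embedding_region, embedding_scores
    # 3) Unsorted similarity scores: take the argmax directly
    if similarity_scores:
        return max(similarity_scores, key=similarity_scores.get), similarity_scores
    # 4) Flat prior over the known dialect profiles, so the UI always gets a region
    region = next(iter(profiles), "Bilinmiyor")
    return region, {r: 0.1 for r in profiles}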
@@ -912,33 +945,36 @@ body::before {
 }
 
 h1 {
-    font-weight: 800;
-    letter-spacing: -2.5px;
-    color: #1D1D1F;
-    margin: 0;
-    background: linear-gradient(135deg, #1D1D1F 0%, #4A5568 50%, #1D1D1F 100%);
-    background-size: 200% auto;
-    -webkit-background-clip: text;
-    -webkit-text-fill-color: …
-    background-clip: text;
-    animation: shimmer 3s linear infinite;
+    font-weight: 800 !important;
+    letter-spacing: -2.5px !important;
+    color: #1D1D1F !important;
+    margin: 0 !important;
+    background: linear-gradient(135deg, #1D1D1F 0%, #4A5568 50%, #1D1D1F 100%) !important;
+    background-size: 200% auto !important;
+    -webkit-background-clip: text !important;
+    -webkit-text-fill-color: #1D1D1F !important;
+    background-clip: text !important;
+    animation: shimmer 3s linear infinite !important;
+    opacity: 1 !important;
+    z-index: 10 !important;
+    position: relative !important;
+    visibility: visible !important;
 }
 
 .card {
-    background: rgba(255, 255, 255, 0.…
-    backdrop-filter: blur(…
-    -webkit-backdrop-filter: blur(…
-    padding: …
-    border-radius: …
-    border: 1px solid rgba(…
-    margin-bottom: …
+    background: rgba(255, 255, 255, 0.85) !important;
+    backdrop-filter: blur(30px) saturate(180%) !important;
+    -webkit-backdrop-filter: blur(30px) saturate(180%) !important;
+    padding: 28px !important;
+    border-radius: 20px !important;
+    border: 1px solid rgba(0, 0, 0, 0.08) !important;
+    margin-bottom: 20px !important;
     box-shadow:
-        0 …
-        0 …
-        0 2px 8px rgba(0, 0, 0, 0.…
-        inset 0 1px 0 rgba(255, 255, 255, 0.…
-    …
-    transition: all 0.4s cubic-bezier(0.34, 1.56, 0.64, 1) !important;
+        0 8px 32px rgba(0, 0, 0, 0.06),
+        0 4px 16px rgba(0, 0, 0, 0.04),
+        0 2px 8px rgba(0, 0, 0, 0.03),
+        inset 0 1px 0 rgba(255, 255, 255, 0.9) !important;
+    transition: all 0.3s cubic-bezier(0.4, 0, 0.2, 1) !important;
     position: relative;
     overflow: hidden;
 }
@@ -1082,19 +1118,20 @@ button.primary:active {
     color: #1D1D1F;
 }
 
-.markdown h1 {
-    color: #1D1D1F;
-    margin-bottom: …
-    font-size: 3.5rem;
-    font-weight: 800;
-    letter-spacing: -…
-    line-height: 1.…
-    …
-    …
-    …
-    …
-    …
-    …
+.markdown h1, .header-markdown h1, .main-title {
+    color: #1D1D1F !important;
+    margin-bottom: 16px !important;
+    font-size: 3.5rem !important;
+    font-weight: 800 !important;
+    letter-spacing: -2px !important;
+    line-height: 1.2 !important;
+    text-shadow: 0 2px 8px rgba(0,0,0,0.1) !important;
+    -webkit-text-fill-color: #1D1D1F !important;
+    background: none !important;
+    opacity: 1 !important;
+    z-index: 10 !important;
+    position: relative !important;
+    visibility: visible !important;
 }
 
 .markdown p {
@@ -1180,22 +1217,23 @@ def build_ui() -> gr.Blocks:
 
         gr.Markdown(
             """
-            <div style="text-align:center; margin:…
-            <h1 style="font-size:…
-            <span style="…
+            <div class="header-container" style="text-align:center; margin:50px 0 40px 0; padding: 0 20px; position: relative; z-index: 10;">
+                <h1 class="main-title" style="font-size:3.5rem; font-weight:800; letter-spacing:-2px; margin-bottom:16px; line-height:1.2; color: #1D1D1F !important; opacity: 1 !important; visibility: visible !important; position: relative; z-index: 10; text-shadow: 0 2px 8px rgba(0,0,0,0.1);">
+                    <span style="color: #1D1D1F; font-size: 3.5rem; display: inline-block;">🇹🇷</span> Dialect Intelligence Engine
                 </h1>
-            <p style="color: #6E6E73; font-size:1.…
+                <p style="color: #6E6E73; font-size:1.15rem; font-weight:400; letter-spacing:-0.2px; opacity:0.9; margin-top:8px;">
                     Powered by Meta Omnilingual ASR & Whisper Large-v3
                 </p>
-            …
-            """
+            </div>
+            """,
+            elem_classes="header-markdown"
         )
 
         gr.Markdown(
             """
             <div style="text-align:center; margin-top:-20px; margin-bottom:40px; color:#6E6E73;">
                 Mikrofona bas, doğal bir şekilde konuş. Sistem şiveni analiz edip seni haritada işaretlesin ve AI sesiyle cevap versin.
-            …
+            </div>
             """
         )
 
@@ -1238,6 +1276,7 @@ def build_ui() -> gr.Blocks:
                 label="Model Cevabı (Ses)",
                 type="filepath",
                 interactive=False,
+                autoplay=True,
                 elem_classes="card"
             )
 
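`autoplay` is a regular `gr.Audio` constructor argument in Gradio 4.x, so this flag alone usually covers playback of a freshly generated reply file. A minimal standalone check; the reply.mp3 path is a placeholder:

import gradio as gr

with gr.Blocks() as demo:
    # autoplay fires whenever the component value is updated by an event
    gr.Audio(value="reply.mp3", type="filepath", interactive=False, autoplay=True)

demo.launch()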
@@ -1246,8 +1285,14 @@ def build_ui() -> gr.Blocks:
                 elem_classes="card"
             )
 
+        def analyze_and_reply_with_autoplay(audio_path):
+            """Wrapper to ensure audio autoplays after generation"""
+            result = analyze_and_reply(audio_path)
+            # Return result - Gradio will handle autoplay if autoplay=True is set
+            return result
+
         audio_input.change(
-            fn=analyze_and_reply,
+            fn=analyze_and_reply_with_autoplay,
             inputs=audio_input,
             outputs=[
                 transcript_output,
@@ -1257,6 +1302,69 @@ def build_ui() -> gr.Blocks:
                 region_map
             ]
         )
+
+        # Add JavaScript for autoplay
+        demo.load(
+            fn=None,
+            js="""
+            function() {
+                // Auto-play audio when it's updated
+                const observer = new MutationObserver(function(mutations) {
+                    mutations.forEach(function(mutation) {
+                        mutation.addedNodes.forEach(function(node) {
+                            if (node.nodeType === 1) {
+                                const audio = node.querySelector('audio');
+                                if (audio && audio.src && !audio.hasAttribute('data-autoplayed')) {
+                                    audio.setAttribute('data-autoplayed', 'true');
+                                    audio.play().catch(e => console.log('Autoplay prevented:', e));
+                                }
+                            }
+                        });
+                    });
+                });
+
+                observer.observe(document.body, {
+                    childList: true,
+                    subtree: true
+                });
+            }
+            """
+        )
+
+        # Auto-play audio when it's generated using JavaScript callback
+        reply_audio_output.change(
+            fn=None,
+            inputs=None,
+            outputs=None,
+            js="""
+            function() {
+                setTimeout(function() {
+                    // Find the audio element by looking for the reply audio component
+                    const labels = Array.from(document.querySelectorAll('label'));
+                    const replyLabel = labels.find(label =>
+                        label.textContent && label.textContent.includes('Model Cevabı (Ses)')
+                    );
+
+                    if (replyLabel) {
+                        const audioContainer = replyLabel.closest('.card') || replyLabel.parentElement;
+                        const audioElement = audioContainer ? audioContainer.querySelector('audio') : null;
+
+                        if (audioElement && audioElement.src) {
+                            // Reset and play
+                            audioElement.currentTime = 0;
+                            const playPromise = audioElement.play();
+                            if (playPromise !== undefined) {
+                                playPromise.catch(function(error) {
+                                    console.log('Autoplay prevented by browser:', error);
+                                });
+                            }
+                        }
+                    }
+                }, 800); // Wait for audio to be fully loaded
+                return [];
+            }
+            """
+        )
 
         return demo
 
@@ -1275,4 +1383,3 @@ if __name__ == "__main__":
     )
 
 
-
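The commit title points at `synthesize_elevenlabs`, which analyze_and_reply calls above but whose body falls outside this diff. For orientation, a hedged sketch of what such a helper commonly looks like with the ElevenLabs Python SDK; the voice and model IDs are placeholders, and the real implementation in app.py may differ:

import os
import tempfile

from elevenlabs.client import ElevenLabs

def synthesize_elevenlabs(text: str):
    """Return a path to synthesized speech, or None when synthesis is unavailable."""
    api_key = os.environ.get("ELEVENLABS_API_KEY")
    if not api_key or not text:
        return None
    client = ElevenLabs(api_key=api_key)
    # convert() streams audio chunks; the multilingual model covers Turkish
    audio = client.text_to_speech.convert(
        voice_id="21m00Tcm4TlvDq8ikWAM",  # placeholder voice ID
        text=text,
        model_id="eleven_multilingual_v2",
    )
    with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as f:
        for chunk in audio:
            f.write(chunk)
        return f.name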