nisacayir committed (verified)
Commit 3d3e0f1 · Parent: b74a45a

update 11labs

Files changed (1): app.py (+167 -60)
app.py CHANGED
@@ -1,10 +1,3 @@
-"""
-Dialect Intelligence Engine
-Author: Nisa Çayır
-Core: Meta Omnilingual ASR + Whisper v3 + Türkçe Bölgesel Fonetik Analiz + ElevenLabs AI Dialog
-Focus: Turkish dialects: vowel shifts, markers, prosody, phonetic signatures
-"""
-
 # =========================================
 # ENV FIXES
 # =========================================
@@ -73,11 +66,11 @@ logger.info(f"Using device: {DEVICE}, dtype: {DTYPE}")
 # MODEL INITIALIZATION
 # =========================================
 try:
-    processor = AutoProcessor.from_pretrained(MODEL_ID)
+    processor = AutoProcessor.from_pretrained(MODEL_ID)
 
-    model = AutoModelForSpeechSeq2Seq.from_pretrained(
-        MODEL_ID,
-        torch_dtype=DTYPE
+    model = AutoModelForSpeechSeq2Seq.from_pretrained(
+        MODEL_ID,
+        torch_dtype=DTYPE
     )
 
     model = model.to(DEVICE)
@@ -445,10 +438,10 @@ def run_asr(audio_data: np.ndarray, sample_rate: int) -> str:
     # Ensure audio is float32 (Whisper expects fp32 input)
     audio_float = audio_data.astype(np.float32)
 
-    inputs = processor(
+    inputs = processor(
         audio_float,
         sampling_rate=sample_rate,
-        return_tensors="pt"
+        return_tensors="pt"
     )
 
     # Move to device and cast to target dtype (fp16 on GPU, fp32 on CPU)
@@ -682,7 +675,7 @@ def dialect_similarity(
 
         scores[region] = round(combined_score, 3)
 
-        logger.debug(
+        logger.info(
            f"{region}: vowel={vowel_score:.3f}, "
            f"marker={marker_score_val:.3f}, "
            f"prosody={prosody_score_val:.3f}, "
@@ -835,7 +828,47 @@ def analyze_and_reply(
         empty_fig = build_empty_fig()
         return str(e), "", "", None, empty_fig
 
-    predicted_region, scores = predict_dialect(audio_path)
+    # Use transcription-based dialect similarity analysis
+    similarity_scores, sorted_predictions = dialect_similarity(
+        transcript, processed_audio, processed_sr
+    )
+
+    # Also try embedding-based prediction as fallback
+    embedding_region, embedding_scores = predict_dialect(audio_path)
+
+    # Always use transcription-based prediction if available (it should always work)
+    if similarity_scores and sorted_predictions and len(sorted_predictions) > 0:
+        # Use transcription-based prediction
+        predicted_region = sorted_predictions[0][0]
+        scores = similarity_scores
+        top_score = sorted_predictions[0][1]
+        logger.info(f"Using transcription-based prediction: {predicted_region} (score: {top_score:.4f})")
+
+        # Log top 3 predictions for debugging
+        if len(sorted_predictions) >= 3:
+            logger.info(f"Top 3 predictions: {[(r, f'{s:.4f}') for r, s in sorted_predictions[:3]]}")
+    elif embedding_scores and embedding_region != "Bilinmiyor" and max(embedding_scores.values()) > 0.01:
+        # Fallback to embedding-based
+        predicted_region = embedding_region
+        scores = embedding_scores
+        logger.info(f"Using embedding-based prediction: {predicted_region} (score: {max(embedding_scores.values()):.4f})")
+    else:
+        # Last resort: ensure we always return a region
+        if similarity_scores and sorted_predictions and len(sorted_predictions) > 0:
+            predicted_region = sorted_predictions[0][0]
+            scores = similarity_scores
+            logger.warning(f"Using transcription-based with low scores: {predicted_region} (score: {sorted_predictions[0][1]:.4f})")
+        elif similarity_scores:
+            # Use first region from scores even if sorted_predictions is empty
+            predicted_region = max(similarity_scores, key=similarity_scores.get)
+            scores = similarity_scores
+            logger.warning(f"Using first region from scores: {predicted_region}")
+        else:
+            # Absolute last resort: use first region from DIALECT_PROFILES
+            predicted_region = list(DIALECT_PROFILES.keys())[0] if DIALECT_PROFILES else "Bilinmiyor"
+            scores = {region: 0.1 for region in DIALECT_PROFILES.keys()} if DIALECT_PROFILES else {}
+            logger.error(f"All prediction methods failed, using fallback: {predicted_region}")
+
     reply_text = generate_reply_text(predicted_region)
     reply_audio_path = synthesize_elevenlabs(reply_text) or None
     heatmap_fig = plot_region_heatmap(scores, highlight_region=predicted_region if scores else None)
@@ -912,33 +945,36 @@ body::before {
 }
 
 h1 {
-    font-weight: 800;
-    letter-spacing: -2.5px;
-    color: #1D1D1F;
-    margin: 0;
-    background: linear-gradient(135deg, #1D1D1F 0%, #4A5568 50%, #1D1D1F 100%);
-    background-size: 200% auto;
-    -webkit-background-clip: text;
-    -webkit-text-fill-color: transparent;
-    background-clip: text;
-    animation: shimmer 3s linear infinite;
+    font-weight: 800 !important;
+    letter-spacing: -2.5px !important;
+    color: #1D1D1F !important;
+    margin: 0 !important;
+    background: linear-gradient(135deg, #1D1D1F 0%, #4A5568 50%, #1D1D1F 100%) !important;
+    background-size: 200% auto !important;
+    -webkit-background-clip: text !important;
+    -webkit-text-fill-color: #1D1D1F !important;
+    background-clip: text !important;
+    animation: shimmer 3s linear infinite !important;
+    opacity: 1 !important;
+    z-index: 10 !important;
+    position: relative !important;
+    visibility: visible !important;
 }
 
 .card {
-    background: rgba(255, 255, 255, 0.7) !important;
-    backdrop-filter: blur(40px) saturate(200%) !important;
-    -webkit-backdrop-filter: blur(40px) saturate(200%) !important;
-    padding: 32px !important;
-    border-radius: 24px !important;
-    border: 1px solid rgba(255, 255, 255, 0.8) !important;
-    margin-bottom: 24px !important;
+    background: rgba(255, 255, 255, 0.85) !important;
+    backdrop-filter: blur(30px) saturate(180%) !important;
+    -webkit-backdrop-filter: blur(30px) saturate(180%) !important;
+    padding: 28px !important;
+    border-radius: 20px !important;
+    border: 1px solid rgba(0, 0, 0, 0.08) !important;
+    margin-bottom: 20px !important;
     box-shadow:
-        0 20px 60px rgba(0, 0, 0, 0.08),
-        0 8px 24px rgba(0, 0, 0, 0.06),
-        0 2px 8px rgba(0, 0, 0, 0.04),
-        inset 0 1px 0 rgba(255, 255, 255, 0.95),
-        inset 0 -1px 0 rgba(255, 255, 255, 0.5) !important;
-    transition: all 0.4s cubic-bezier(0.34, 1.56, 0.64, 1) !important;
+        0 8px 32px rgba(0, 0, 0, 0.06),
+        0 4px 16px rgba(0, 0, 0, 0.04),
+        0 2px 8px rgba(0, 0, 0, 0.03),
+        inset 0 1px 0 rgba(255, 255, 255, 0.9) !important;
+    transition: all 0.3s cubic-bezier(0.4, 0, 0.2, 1) !important;
     position: relative;
     overflow: hidden;
 }
@@ -1082,19 +1118,20 @@ button.primary:active {
     color: #1D1D1F;
 }
 
-.markdown h1 {
-    color: #1D1D1F;
-    margin-bottom: 20px;
-    font-size: 3.5rem;
-    font-weight: 800;
-    letter-spacing: -3px;
-    line-height: 1.1;
-    background: linear-gradient(135deg, #1D1D1F 0%, #4A5568 50%, #1D1D1F 100%);
-    background-size: 200% auto;
-    -webkit-background-clip: text;
-    -webkit-text-fill-color: transparent;
-    background-clip: text;
-    animation: shimmer 4s linear infinite;
+.markdown h1, .header-markdown h1, .main-title {
+    color: #1D1D1F !important;
+    margin-bottom: 16px !important;
+    font-size: 3.5rem !important;
+    font-weight: 800 !important;
+    letter-spacing: -2px !important;
+    line-height: 1.2 !important;
+    text-shadow: 0 2px 8px rgba(0,0,0,0.1) !important;
+    -webkit-text-fill-color: #1D1D1F !important;
+    background: none !important;
+    opacity: 1 !important;
+    z-index: 10 !important;
+    position: relative !important;
+    visibility: visible !important;
 }
 
 .markdown p {
@@ -1180,22 +1217,23 @@ def build_ui() -> gr.Blocks:
 
     gr.Markdown(
         """
-        <div style="text-align:center; margin:80px 0 60px 0; padding: 0 20px;">
-            <h1 style="font-size:4rem; font-weight:800; letter-spacing:-3px; margin-bottom:20px; line-height:1.1; background: linear-gradient(135deg, #1D1D1F 0%, #4A5568 50%, #1D1D1F 100%); background-size: 200% auto; -webkit-background-clip: text; -webkit-text-fill-color: transparent; background-clip: text; animation: shimmer 4s linear infinite;">
-            <span style="background: none; -webkit-text-fill-color: initial; color: #1D1D1F;">🇹🇷</span> Dialect Intelligence Engine
+        <div class="header-container" style="text-align:center; margin:50px 0 40px 0; padding: 0 20px; position: relative; z-index: 10;">
+            <h1 class="main-title" style="font-size:3.5rem; font-weight:800; letter-spacing:-2px; margin-bottom:16px; line-height:1.2; color: #1D1D1F !important; opacity: 1 !important; visibility: visible !important; position: relative; z-index: 10; text-shadow: 0 2px 8px rgba(0,0,0,0.1);">
            <span style="color: #1D1D1F; font-size: 3.5rem; display: inline-block;">🇹🇷</span> Dialect Intelligence Engine
         </h1>
-        <p style="color: #6E6E73; font-size:1.25rem; font-weight:400; letter-spacing:-0.3px; opacity:0.95; margin-top:12px;">
+        <p style="color: #6E6E73; font-size:1.15rem; font-weight:400; letter-spacing:-0.2px; opacity:0.9; margin-top:8px;">
         Powered by Meta Omnilingual ASR & Whisper Large-v3
         </p>
-        </div>
-        """
+        </div>
+        """,
+        elem_classes="header-markdown"
     )
 
     gr.Markdown(
         """
         <div style="text-align:center; margin-top:-20px; margin-bottom:40px; color:#6E6E73;">
         Mikrofona bas, doğal bir şekilde konuş. Sistem şiveni analiz edip seni haritada işaretlesin ve AI sesiyle cevap versin.
-        </div>
+        </div>
         """
     )
@@ -1238,6 +1276,7 @@ def build_ui() -> gr.Blocks:
         label="Model Cevabı (Ses)",
         type="filepath",
         interactive=False,
+        autoplay=True,
         elem_classes="card"
     )
 
@@ -1246,8 +1285,14 @@ def build_ui() -> gr.Blocks:
         elem_classes="card"
     )
 
+    def analyze_and_reply_with_autoplay(audio_path):
+        """Wrapper to ensure audio autoplays after generation"""
+        result = analyze_and_reply(audio_path)
+        # Return result - Gradio will handle autoplay if autoplay=True is set
+        return result
+
     audio_input.change(
-        fn=analyze_and_reply,
+        fn=analyze_and_reply_with_autoplay,
         inputs=audio_input,
         outputs=[
             transcript_output,
@@ -1257,6 +1302,69 @@ def build_ui() -> gr.Blocks:
             region_map
         ]
     )
+
+    # Add JavaScript for autoplay
+    demo.load(
+        fn=None,
+        js="""
+        function() {
+            // Auto-play audio when it's updated
+            const observer = new MutationObserver(function(mutations) {
+                mutations.forEach(function(mutation) {
+                    mutation.addedNodes.forEach(function(node) {
+                        if (node.nodeType === 1) {
+                            const audio = node.querySelector('audio');
+                            if (audio && audio.src && !audio.hasAttribute('data-autoplayed')) {
+                                audio.setAttribute('data-autoplayed', 'true');
+                                audio.play().catch(e => console.log('Autoplay prevented:', e));
+                            }
+                        }
+                    });
+                });
+            });
+
+            observer.observe(document.body, {
+                childList: true,
+                subtree: true
+            });
+        }
+        """
+    )
+
+    # Auto-play audio when it's generated using JavaScript callback
+    reply_audio_output.change(
+        fn=None,
+        inputs=None,
+        outputs=None,
+        js="""
+        function() {
+            setTimeout(function() {
+                // Find the audio element by looking for the reply audio component
+                const labels = Array.from(document.querySelectorAll('label'));
+                const replyLabel = labels.find(label =>
+                    label.textContent && label.textContent.includes('Model Cevabı (Ses)')
+                );
+
+                if (replyLabel) {
+                    const audioContainer = replyLabel.closest('.card') || replyLabel.parentElement;
+                    const audioElement = audioContainer ? audioContainer.querySelector('audio') : null;
+
+                    if (audioElement && audioElement.src) {
+                        // Reset and play
+                        audioElement.currentTime = 0;
+                        const playPromise = audioElement.play();
+                        if (playPromise !== undefined) {
+                            playPromise.catch(function(error) {
+                                console.log('Autoplay prevented by browser:', error);
+                            });
+                        }
+                    }
+                }
+            }, 800); // Wait for audio to be fully loaded
+            return [];
+        }
+        """
+    )
 
     return demo
 
@@ -1275,4 +1383,3 @@ if __name__ == "__main__":
     )
 
 
-
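
A note on the autoplay wiring added above: the commit sets autoplay=True on the reply gr.Audio component and then layers two JavaScript fallbacks on top (a MutationObserver registered via demo.load and a timed js callback on reply_audio_output.change), because browsers often block programmatic playback. Stripped of the fallbacks, the core pattern looks roughly like the sketch below; this is a minimal illustration assuming a Gradio 4.x API, and the component and function names are placeholders rather than the ones in app.py.

# Minimal autoplay sketch (assumed Gradio 4.x; not the full app.py pipeline).
import gradio as gr

def make_reply(audio_path):
    # Stand-in for analyze_and_reply_with_autoplay: just echo the recording back.
    return audio_path

with gr.Blocks() as demo:
    mic_in = gr.Audio(sources=["microphone"], type="filepath", label="Input")
    reply_out = gr.Audio(type="filepath", autoplay=True, interactive=False, label="Reply")
    # When a new recording lands, compute the reply; autoplay=True asks the browser
    # to start playback as soon as the output file path is set on the component.
    mic_in.change(fn=make_reply, inputs=mic_in, outputs=reply_out)

if __name__ == "__main__":
    demo.launch()

If the browser still refuses to start playback, the MutationObserver and the delayed js callback in the diff act as the retry path.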
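The commit title is "update 11labs", but synthesize_elevenlabs itself is not touched in this diff; the code above only calls it and treats a falsy result as "no reply audio". For orientation only, here is a minimal sketch of what such a helper could look like against the public ElevenLabs text-to-speech REST endpoint; the environment-variable name, voice ID, and model ID below are assumptions, not values taken from app.py.

# Hedged sketch only -- the real synthesize_elevenlabs in app.py is not shown in this commit.
import os
import tempfile
from typing import Optional

import requests

def synthesize_elevenlabs_sketch(text: str, voice_id: str = "YOUR_VOICE_ID") -> Optional[str]:
    """Send text to the ElevenLabs TTS REST endpoint; return a path to an MP3, or None on failure."""
    api_key = os.getenv("ELEVENLABS_API_KEY")  # assumed env-var name
    if not api_key or not text:
        return None
    resp = requests.post(
        f"https://api.elevenlabs.io/v1/text-to-speech/{voice_id}",
        headers={"xi-api-key": api_key, "Content-Type": "application/json"},
        json={"text": text, "model_id": "eleven_multilingual_v2"},
        timeout=60,
    )
    if resp.status_code != 200:
        return None  # mirrors the `or None` fallback used in analyze_and_reply
    tmp = tempfile.NamedTemporaryFile(suffix=".mp3", delete=False)
    tmp.write(resp.content)
    tmp.close()
    return tmp.name  # a filepath that gr.Audio(type="filepath") can play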