update 11labs
app.py CHANGED
@@ -1,10 +1,3 @@
-"""
-Dialect Intelligence Engine
-Author: Nisa Çayır
-Core: Meta Omnilingual ASR + Whisper v3 + Türkçe Bölgesel Fonetik Analiz + ElevenLabs AI Dialog
-Focus: Turkish dialects: vowel shifts, markers, prosody, phonetic signatures
-"""
-
 # =========================================
 # ENV FIXES
 # =========================================
@@ -73,11 +66,11 @@ logger.info(f"Using device: {DEVICE}, dtype: {DTYPE}")
 # MODEL INITIALIZATION
 # =========================================
 try:
-    …
+    processor = AutoProcessor.from_pretrained(MODEL_ID)
 
-    …
-    …
-    …
+    model = AutoModelForSpeechSeq2Seq.from_pretrained(
+        MODEL_ID,
+        torch_dtype=DTYPE
     )
 
     model = model.to(DEVICE)
@@ -445,10 +438,10 @@ def run_asr(audio_data: np.ndarray, sample_rate: int) -> str:
     # Ensure audio is float32 (Whisper expects fp32 input)
     audio_float = audio_data.astype(np.float32)
 
-    …
+    inputs = processor(
         audio_float,
         sampling_rate=sample_rate,
-    …
+        return_tensors="pt"
     )
 
     # Move to device and cast to target dtype (fp16 on GPU, fp32 on CPU)
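Note: with `return_tensors="pt"` in place, the processor output feeds `model.generate` directly. A minimal, self-contained sketch of the load-and-transcribe path these two hunks set up; the concrete MODEL_ID value and the generate/decode steps are assumptions, since they fall outside this diff:

import numpy as np
import torch
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq

MODEL_ID = "openai/whisper-large-v3"  # assumed; app.py defines its own MODEL_ID
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
DTYPE = torch.float16 if DEVICE == "cuda" else torch.float32

processor = AutoProcessor.from_pretrained(MODEL_ID)
model = AutoModelForSpeechSeq2Seq.from_pretrained(MODEL_ID, torch_dtype=DTYPE).to(DEVICE)

def run_asr(audio_data: np.ndarray, sample_rate: int) -> str:
    # Whisper's feature extractor expects float32 PCM at 16 kHz
    audio_float = audio_data.astype(np.float32)
    inputs = processor(audio_float, sampling_rate=sample_rate, return_tensors="pt")
    # Move to device and cast to the model dtype (fp16 on GPU, fp32 on CPU)
    features = inputs.input_features.to(DEVICE, dtype=DTYPE)
    with torch.no_grad():
        predicted_ids = model.generate(features)
    return processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]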
@@ -682,7 +675,7 @@ def dialect_similarity(
 
         scores[region] = round(combined_score, 3)
 
-        logger.…
+        logger.info(
             f"{region}: vowel={vowel_score:.3f}, "
             f"marker={marker_score_val:.3f}, "
             f"prosody={prosody_score_val:.3f}, "
@@ -835,7 +828,47 @@ def analyze_and_reply(
         empty_fig = build_empty_fig()
         return str(e), "", "", None, empty_fig
 
-    …
+    # Use transcription-based dialect similarity analysis
+    similarity_scores, sorted_predictions = dialect_similarity(
+        transcript, processed_audio, processed_sr
+    )
+
+    # Also try embedding-based prediction as fallback
+    embedding_region, embedding_scores = predict_dialect(audio_path)
+
+    # Always use transcription-based prediction if available (it should always work)
+    if similarity_scores and sorted_predictions and len(sorted_predictions) > 0:
+        # Use transcription-based prediction
+        predicted_region = sorted_predictions[0][0]
+        scores = similarity_scores
+        top_score = sorted_predictions[0][1]
+        logger.info(f"Using transcription-based prediction: {predicted_region} (score: {top_score:.4f})")
+
+        # Log top 3 predictions for debugging
+        if len(sorted_predictions) >= 3:
+            logger.info(f"Top 3 predictions: {[(r, f'{s:.4f}') for r, s in sorted_predictions[:3]]}")
+    elif embedding_scores and embedding_region != "Bilinmiyor" and max(embedding_scores.values()) > 0.01:
+        # Fallback to embedding-based
+        predicted_region = embedding_region
+        scores = embedding_scores
+        logger.info(f"Using embedding-based prediction: {predicted_region} (score: {max(embedding_scores.values()):.4f})")
+    else:
+        # Last resort: ensure we always return a region
+        if similarity_scores and sorted_predictions and len(sorted_predictions) > 0:
+            predicted_region = sorted_predictions[0][0]
+            scores = similarity_scores
+            logger.warning(f"Using transcription-based with low scores: {predicted_region} (score: {sorted_predictions[0][1]:.4f})")
+        elif similarity_scores:
+            # Use first region from scores even if sorted_predictions is empty
+            predicted_region = max(similarity_scores, key=similarity_scores.get)
+            scores = similarity_scores
+            logger.warning(f"Using first region from scores: {predicted_region}")
+        else:
+            # Absolute last resort: use first region from DIALECT_PROFILES
+            predicted_region = list(DIALECT_PROFILES.keys())[0] if DIALECT_PROFILES else "Bilinmiyor"
+            scores = {region: 0.1 for region in DIALECT_PROFILES.keys()} if DIALECT_PROFILES else {}
+            logger.error(f"All prediction methods failed, using fallback: {predicted_region}")
+
     reply_text = generate_reply_text(predicted_region)
     reply_audio_path = synthesize_elevenlabs(reply_text) or None
     heatmap_fig = plot_region_heatmap(scores, highlight_region=predicted_region if scores else None)
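Note on the new fallback cascade: the inner `if similarity_scores and sorted_predictions ...` under the final `else` can never be true there, since the outer `if` already claimed exactly that case, so the effective order is transcription scores, then embeddings, then a flat prior. A condensed sketch of the same decision logic; the function name and signature are illustrative, not from app.py:

def pick_region(similarity_scores, sorted_predictions, embedding_region, embedding_scores, profiles):
    # 1) Transcription-based scores win whenever they exist
    if sorted_predictions:
        return sorted_predictions[0][0], similarity_scores
    # 2) Embedding-based prediction, ignoring unknowns and near-zero scores
    if embedding_scores and embedding_region != "Bilinmiyor" and max(embedding_scores.values()) > 0.01:
        return embedding_region, embedding_scores
    # 3) Unsorted similarity scores: take the argmax directly
    if similarity_scores:
        return max(similarity_scores, key=similarity_scores.get), similarity_scores
    # 4) Flat prior over the known dialect profiles, so the UI always gets a region
    region = next(iter(profiles), "Bilinmiyor")
    return region, {r: 0.1 for r in profiles}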
@@ -912,33 +945,36 @@ body::before {
 }
 
 h1 {
-    font-weight: 800;
-    letter-spacing: -2.5px;
-    color: #1D1D1F;
-    margin: 0;
-    background: linear-gradient(135deg, #1D1D1F 0%, #4A5568 50%, #1D1D1F 100%);
-    background-size: 200% auto;
-    -webkit-background-clip: text;
-    -webkit-text-fill-color: …
-    background-clip: text;
-    animation: shimmer 3s linear infinite;
+    font-weight: 800 !important;
+    letter-spacing: -2.5px !important;
+    color: #1D1D1F !important;
+    margin: 0 !important;
+    background: linear-gradient(135deg, #1D1D1F 0%, #4A5568 50%, #1D1D1F 100%) !important;
+    background-size: 200% auto !important;
+    -webkit-background-clip: text !important;
+    -webkit-text-fill-color: #1D1D1F !important;
+    background-clip: text !important;
+    animation: shimmer 3s linear infinite !important;
+    opacity: 1 !important;
+    z-index: 10 !important;
+    position: relative !important;
+    visibility: visible !important;
 }
 
 .card {
-    background: rgba(255, 255, 255, 0.…
-    backdrop-filter: blur(…
-    -webkit-backdrop-filter: blur(…
-    padding: …
-    border-radius: …
-    border: 1px solid rgba(…
-    margin-bottom: …
+    background: rgba(255, 255, 255, 0.85) !important;
+    backdrop-filter: blur(30px) saturate(180%) !important;
+    -webkit-backdrop-filter: blur(30px) saturate(180%) !important;
+    padding: 28px !important;
+    border-radius: 20px !important;
+    border: 1px solid rgba(0, 0, 0, 0.08) !important;
+    margin-bottom: 20px !important;
     box-shadow:
-        0 …
-        0 …
-        0 2px 8px rgba(0, 0, 0, 0.…
-        inset 0 1px 0 rgba(255, 255, 255, 0.…
-    …
-    transition: all 0.4s cubic-bezier(0.34, 1.56, 0.64, 1) !important;
+        0 8px 32px rgba(0, 0, 0, 0.06),
+        0 4px 16px rgba(0, 0, 0, 0.04),
+        0 2px 8px rgba(0, 0, 0, 0.03),
+        inset 0 1px 0 rgba(255, 255, 255, 0.9) !important;
+    transition: all 0.3s cubic-bezier(0.4, 0, 0.2, 1) !important;
     position: relative;
     overflow: hidden;
 }
@@ -1082,19 +1118,20 @@ button.primary:active {
     color: #1D1D1F;
 }
 
-.markdown h1 {
-    color: #1D1D1F;
-    margin-bottom: …
-    font-size: 3.5rem;
-    font-weight: 800;
-    letter-spacing: -…
-    line-height: 1.…
-    …
-    …
-    …
-    …
-    …
-    …
+.markdown h1, .header-markdown h1, .main-title {
+    color: #1D1D1F !important;
+    margin-bottom: 16px !important;
+    font-size: 3.5rem !important;
+    font-weight: 800 !important;
+    letter-spacing: -2px !important;
+    line-height: 1.2 !important;
+    text-shadow: 0 2px 8px rgba(0,0,0,0.1) !important;
+    -webkit-text-fill-color: #1D1D1F !important;
+    background: none !important;
+    opacity: 1 !important;
+    z-index: 10 !important;
+    position: relative !important;
+    visibility: visible !important;
 }
 
 .markdown p {
@@ -1180,22 +1217,23 @@ def build_ui() -> gr.Blocks:
 
         gr.Markdown(
             """
-            <div style="text-align:center; margin:…
-            <h1 style="font-size:…
-            <span style="…
+            <div class="header-container" style="text-align:center; margin:50px 0 40px 0; padding: 0 20px; position: relative; z-index: 10;">
+                <h1 class="main-title" style="font-size:3.5rem; font-weight:800; letter-spacing:-2px; margin-bottom:16px; line-height:1.2; color: #1D1D1F !important; opacity: 1 !important; visibility: visible !important; position: relative; z-index: 10; text-shadow: 0 2px 8px rgba(0,0,0,0.1);">
+                    <span style="color: #1D1D1F; font-size: 3.5rem; display: inline-block;">🇹🇷</span> Dialect Intelligence Engine
                 </h1>
-            <p style="color: #6E6E73; font-size:1.…
+                <p style="color: #6E6E73; font-size:1.15rem; font-weight:400; letter-spacing:-0.2px; opacity:0.9; margin-top:8px;">
                     Powered by Meta Omnilingual ASR & Whisper Large-v3
                 </p>
-            …
-            """
+            </div>
+            """,
+            elem_classes="header-markdown"
         )
 
         gr.Markdown(
             """
             <div style="text-align:center; margin-top:-20px; margin-bottom:40px; color:#6E6E73;">
                 Mikrofona bas, doğal bir şekilde konuş. Sistem şiveni analiz edip seni haritada işaretlesin ve AI sesiyle cevap versin.
-            …
+            </div>
             """
         )
 
@@ -1238,6 +1276,7 @@ def build_ui() -> gr.Blocks:
                 label="Model Cevabı (Ses)",
                 type="filepath",
                 interactive=False,
+                autoplay=True,
                 elem_classes="card"
             )
 
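`autoplay` is a regular `gr.Audio` constructor argument in Gradio 4.x, so this flag alone usually covers playback of a freshly generated reply file. A minimal standalone check; the reply.mp3 path is a placeholder:

import gradio as gr

with gr.Blocks() as demo:
    # autoplay fires whenever the component value is updated by an event
    gr.Audio(value="reply.mp3", type="filepath", interactive=False, autoplay=True)

demo.launch()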
@@ -1246,8 +1285,14 @@ def build_ui() -> gr.Blocks:
                 elem_classes="card"
             )
 
+        def analyze_and_reply_with_autoplay(audio_path):
+            """Wrapper to ensure audio autoplays after generation"""
+            result = analyze_and_reply(audio_path)
+            # Return result - Gradio will handle autoplay if autoplay=True is set
+            return result
+
         audio_input.change(
-            fn=analyze_and_reply,
+            fn=analyze_and_reply_with_autoplay,
             inputs=audio_input,
             outputs=[
                 transcript_output,
@@ -1257,6 +1302,69 @@ def build_ui() -> gr.Blocks:
                 region_map
             ]
         )
+
+        # Add JavaScript for autoplay
+        demo.load(
+            fn=None,
+            js="""
+            function() {
+                // Auto-play audio when it's updated
+                const observer = new MutationObserver(function(mutations) {
+                    mutations.forEach(function(mutation) {
+                        mutation.addedNodes.forEach(function(node) {
+                            if (node.nodeType === 1) {
+                                const audio = node.querySelector('audio');
+                                if (audio && audio.src && !audio.hasAttribute('data-autoplayed')) {
+                                    audio.setAttribute('data-autoplayed', 'true');
+                                    audio.play().catch(e => console.log('Autoplay prevented:', e));
+                                }
+                            }
+                        });
+                    });
+                });
+
+                observer.observe(document.body, {
+                    childList: true,
+                    subtree: true
+                });
+            }
+            """
+        )
+
+        # Auto-play audio when it's generated using JavaScript callback
+        reply_audio_output.change(
+            fn=None,
+            inputs=None,
+            outputs=None,
+            js="""
+            function() {
+                setTimeout(function() {
+                    // Find the audio element by looking for the reply audio component
+                    const labels = Array.from(document.querySelectorAll('label'));
+                    const replyLabel = labels.find(label =>
+                        label.textContent && label.textContent.includes('Model Cevabı (Ses)')
+                    );
+
+                    if (replyLabel) {
+                        const audioContainer = replyLabel.closest('.card') || replyLabel.parentElement;
+                        const audioElement = audioContainer ? audioContainer.querySelector('audio') : null;
+
+                        if (audioElement && audioElement.src) {
+                            // Reset and play
+                            audioElement.currentTime = 0;
+                            const playPromise = audioElement.play();
+                            if (playPromise !== undefined) {
+                                playPromise.catch(function(error) {
+                                    console.log('Autoplay prevented by browser:', error);
+                                });
+                            }
+                        }
+                    }
+                }, 800); // Wait for audio to be fully loaded
+                return [];
+            }
+            """
+        )
 
         return demo
 
@@ -1275,4 +1383,3 @@ if __name__ == "__main__":
     )
 
 
-
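The commit title points at `synthesize_elevenlabs`, which analyze_and_reply calls above but whose body falls outside this diff. For orientation, a hedged sketch of what such a helper commonly looks like with the ElevenLabs Python SDK; the voice and model IDs are placeholders, and the real implementation in app.py may differ:

import os
import tempfile

from elevenlabs.client import ElevenLabs

def synthesize_elevenlabs(text: str):
    """Return a path to synthesized speech, or None when synthesis is unavailable."""
    api_key = os.environ.get("ELEVENLABS_API_KEY")
    if not api_key or not text:
        return None
    client = ElevenLabs(api_key=api_key)
    # convert() streams audio chunks; the multilingual model covers Turkish
    audio = client.text_to_speech.convert(
        voice_id="21m00Tcm4TlvDq8ikWAM",  # placeholder voice ID
        text=text,
        model_id="eleven_multilingual_v2",
    )
    with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as f:
        for chunk in audio:
            f.write(chunk)
        return f.name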