Upload 3 files

- api.py +2 -2
- audio_tools.py +43 -38
- character_detection.py +2 -2
api.py
CHANGED

@@ -244,8 +244,8 @@ def process_video_job(job_id: str):
         if voice_embeddings:
             try:
                 Xv = np.array(voice_embeddings)
-                v_eps =
-                v_min = 1
+                v_eps = float(epsilon)
+                v_min = max(1, int(min_cluster_size))
                 v_labels = DBSCAN(eps=v_eps, min_samples=v_min, metric='euclidean').fit(Xv).labels_.tolist()
             except Exception as _e:
                 print(f"[{job_id}] WARN - Voice clustering failed: {_e}")
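This change replaces a truncated constant and a hard-coded min_samples with values derived from configuration. As a rough standalone sketch of the clustering step (the epsilon and min_cluster_size values below are stand-ins, not from the original file): DBSCAN labels every embedding and uses -1 for noise points that belong to no dense cluster.

import numpy as np
from sklearn.cluster import DBSCAN

# Dummy voice embeddings; real ones come from the diarization pipeline.
voice_embeddings = [np.random.rand(192).tolist() for _ in range(10)]

epsilon, min_cluster_size = 0.7, 2        # placeholder config values
Xv = np.array(voice_embeddings)
v_eps = float(epsilon)                    # neighborhood radius in embedding space
v_min = max(1, int(min_cluster_size))     # guard against zero/negative min_samples
labels = DBSCAN(eps=v_eps, min_samples=v_min, metric='euclidean').fit(Xv).labels_
print(labels.tolist())                    # -1 entries are unclustered "noise" voices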
audio_tools.py
CHANGED

@@ -145,11 +145,15 @@ def diarize_audio(
     audio = AudioSegment.from_wav(wav_path)
     duration = len(audio) / 1000.0

-
-
-
-
-
+    diarization = None
+    try:
+        pipeline = Pipeline.from_pretrained(
+            "pyannote/speaker-diarization-3.1",
+            use_auth_token=(hf_token_env or os.getenv("HF_TOKEN"))
+        )
+        diarization = pipeline(wav_path)
+    except Exception as e:
+        log.warning(f"Diarization unavailable, using single full segment fallback: {e}")

     clips_dir = (base_dir / clips_folder)
     clips_dir.mkdir(parents=True, exist_ok=True)
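The new block loads the gated pyannote/speaker-diarization-3.1 pipeline and degrades gracefully when it is unavailable (no token, no network, missing dependency), leaving diarization as None so the fallback path runs. A minimal standalone sketch of the same call, assuming pyannote.audio 3.x and an HF_TOKEN with access to the gated model:

import os
from pyannote.audio import Pipeline

# Requires accepting the model's terms on Hugging Face and exporting HF_TOKEN.
pipeline = Pipeline.from_pretrained(
    "pyannote/speaker-diarization-3.1",
    use_auth_token=os.getenv("HF_TOKEN"),
)
diarization = pipeline("audio.wav")  # hypothetical input file

# itertracks(yield_label=True) yields (segment, track_id, speaker_label),
# which is exactly what the rewritten loop below consumes.
for turn, _, speaker in diarization.itertracks(yield_label=True):
    print(f"{speaker}: {turn.start:.2f}s -> {turn.end:.2f}s")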
@@ -158,42 +162,43 @@ def diarize_audio(
     spk_map: Dict[str, int] = {}
     prev_end = 0.0

-
-
-
-    start
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    if diarization is not None:
+        for i, (turn, _, speaker) in enumerate(diarization.itertracks(yield_label=True)):
+            start, end = max(0.0, float(turn.start)), min(duration, float(turn.end))
+            if start < prev_end:
+                start = prev_end
+            if end <= start:
+                continue
+
+            seg_dur = end - start
+            if seg_dur < min_segment_duration:
+                continue
+
+            if seg_dur > max_segment_duration:
+                n = int(math.ceil(seg_dur / max_segment_duration))
+                sub_d = seg_dur / n
+                for j in range(n):
+                    s = start + j * sub_d
+                    e = min(end, start + (j + 1) * sub_d)
+                    if e <= s:
+                        continue
+                    clip = audio[int(s * 1000):int(e * 1000)]
+                    cp = clips_dir / f"segment_{i:03d}_{j:02d}.wav"
+                    clip.export(cp, format="wav")
+                    if speaker not in spk_map:
+                        spk_map[speaker] = len(spk_map)
+                    segments.append({"start": s, "end": e, "speaker": f"SPEAKER_{spk_map[speaker]:02d}"})
+                    clip_paths.append(str(cp))
+                    prev_end = e
+            else:
+                clip = audio[int(start * 1000):int(end * 1000)]
+                cp = clips_dir / f"segment_{i:03d}.wav"
                 clip.export(cp, format="wav")
                 if speaker not in spk_map:
                     spk_map[speaker] = len(spk_map)
-                segments.append({"start":
+                segments.append({"start": start, "end": end, "speaker": f"SPEAKER_{spk_map[speaker]:02d}"})
                 clip_paths.append(str(cp))
-                prev_end =
-            else:
-                clip = audio[int(start * 1000):int(end * 1000)]
-                cp = clips_dir / f"segment_{i:03d}.wav"
-                clip.export(cp, format="wav")
-                if speaker not in spk_map:
-                    spk_map[speaker] = len(spk_map)
-                segments.append({"start": start, "end": end, "speaker": f"SPEAKER_{spk_map[speaker]:02d}"})
-                clip_paths.append(str(cp))
-                prev_end = end
+                prev_end = end

     if not segments:
         cp = clips_dir / "segment_000.wav"
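The rewritten loop clamps each diarized turn to the audio bounds, drops turns shorter than min_segment_duration, and splits turns longer than max_segment_duration into n equal sub-segments, so every exported clip stays under the cap. The splitting arithmetic in isolation (the values below are hypothetical):

import math

start, end = 12.0, 35.0              # hypothetical diarized turn
max_segment_duration = 10.0

seg_dur = end - start                # 23.0 s, over the cap
n = int(math.ceil(seg_dur / max_segment_duration))   # ceil(2.3) = 3 slices
sub_d = seg_dur / n                  # 23/3 ~ 7.67 s, guaranteed <= the cap
bounds = [(start + j * sub_d, min(end, start + (j + 1) * sub_d)) for j in range(n)]
print(bounds)                        # ~ [(12.0, 19.67), (19.67, 27.33), (27.33, 35.0)]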
@@ -441,7 +446,7 @@ def process_audio_for_video(
                 "speaker": speakers[i] if i < len(speakers) else seg.get("speaker", f"SPEAKER_{i:02d}"),
                 "text": trans[i] if i < len(trans) else "",
                 "voice_embedding": embeddings[i],
-                "clip_path": str(out_dir / "clips" / f"segment_{i:03d}.wav"),
+                "clip_path": clip_paths[i] if i < len(clip_paths) else str(out_dir / "clips" / f"segment_{i:03d}.wav"),
                 "lang": "ca",
                 "lang_prob": 1.0,
             }
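This fallback matters because split sub-segments are now written as segment_{i:03d}_{j:02d}.wav, a name the old reconstruction could never produce; preferring the recorded clip_paths[i] keeps clip_path pointing at a file that actually exists. A tiny sketch with made-up paths:

from pathlib import Path

out_dir = Path("/tmp/job")            # hypothetical output directory
clip_paths = ["/tmp/job/clips/segment_000_01.wav"]  # a split sub-segment

i = 0
clip_path = clip_paths[i] if i < len(clip_paths) else str(out_dir / "clips" / f"segment_{i:03d}.wav")
print(clip_path)                      # the recorded file, not a guessed name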
character_detection.py
CHANGED

@@ -55,8 +55,8 @@ class CharacterDetector:
             d.mkdir(parents=True, exist_ok=True)

     def extract_faces_embeddings(self, *, start_offset_sec: float = 3.0, extract_every_sec: float = 0.5,
-                                 detector_backend: str = '
-                                 enforce_detection: bool =
+                                 detector_backend: str = 'mtcnn', min_face_area: int = 400,
+                                 enforce_detection: bool = False) -> List[Dict[str, Any]]:
         """
         Extracts faces from the video and computes their embeddings using DeepFace directly.

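The new defaults fill in the previously truncated signature: MTCNN as the detection backend, enforce_detection=False so frames without faces do not raise, and a min_face_area threshold. A hedged sketch of what this looks like against the DeepFace.represent API; the area filter shown is one plausible reading of min_face_area, not the original implementation:

from deepface import DeepFace

results = DeepFace.represent(
    img_path="frame.jpg",            # hypothetical extracted video frame
    detector_backend="mtcnn",        # new default: MTCNN face detector
    enforce_detection=False,         # return gracefully if no face is found
)

min_face_area = 400
for r in results:
    # Each result carries an "embedding" and a "facial_area" dict (x, y, w, h).
    area = r["facial_area"]["w"] * r["facial_area"]["h"]
    if area >= min_face_area:        # drop tiny detections (assumed semantics)
        print(len(r["embedding"]), area)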