VeuReu committed · Commit a3d9bb2 · verified · Parent(s): 509742b

Upload 3 files

Files changed (3):
  1. api.py +2 -2
  2. audio_tools.py +43 -38
  3. character_detection.py +2 -2
api.py CHANGED
```diff
@@ -244,8 +244,8 @@ def process_video_job(job_id: str):
     if voice_embeddings:
         try:
             Xv = np.array(voice_embeddings)
-            v_eps = 1.3
-            v_min = 1
+            v_eps = float(epsilon)
+            v_min = max(1, int(min_cluster_size))
             v_labels = DBSCAN(eps=v_eps, min_samples=v_min, metric='euclidean').fit(Xv).labels_.tolist()
         except Exception as _e:
             print(f"[{job_id}] WARN - Voice clustering failed: {_e}")
```
audio_tools.py CHANGED
```diff
@@ -145,11 +145,15 @@ def diarize_audio(
     audio = AudioSegment.from_wav(wav_path)
     duration = len(audio) / 1000.0
 
-    pipeline = Pipeline.from_pretrained(
-        "pyannote/speaker-diarization-3.1",
-        use_auth_token=(hf_token_env or os.getenv("HF_TOKEN"))
-    )
-    diarization = pipeline(wav_path)
+    diarization = None
+    try:
+        pipeline = Pipeline.from_pretrained(
+            "pyannote/speaker-diarization-3.1",
+            use_auth_token=(hf_token_env or os.getenv("HF_TOKEN"))
+        )
+        diarization = pipeline(wav_path)
+    except Exception as e:
+        log.warning(f"Diarization unavailable, using single full segment fallback: {e}")
 
     clips_dir = (base_dir / clips_folder)
     clips_dir.mkdir(parents=True, exist_ok=True)
```
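The hunk wraps the pyannote model load in a try/except so that a missing HF token, a gated model, or a network failure degrades to the single-full-segment fallback further down instead of crashing the whole job. A minimal sketch of the same pattern in isolation; `log` is assumed here to be a module-level logger, which the diff implies but does not show:

```python
# Sketch of the guarded-load pattern, under the assumption that
# audio_tools.py defines a module-level logger named `log`.
import logging
import os

from pyannote.audio import Pipeline

log = logging.getLogger(__name__)  # assumed to exist in audio_tools.py

def try_diarize(wav_path, hf_token_env=None):
    """Return a pyannote diarization, or None so the caller can fall
    back to treating the whole file as one segment."""
    try:
        pipeline = Pipeline.from_pretrained(
            "pyannote/speaker-diarization-3.1",
            use_auth_token=(hf_token_env or os.getenv("HF_TOKEN")),
        )
        return pipeline(wav_path)
    except Exception as e:
        log.warning(f"Diarization unavailable, using single full segment fallback: {e}")
        return None
```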
 
```diff
@@ -158,42 +162,43 @@
     spk_map: Dict[str, int] = {}
     prev_end = 0.0
 
-    for i, (turn, _, speaker) in enumerate(diarization.itertracks(yield_label=True)):
-        start, end = max(0.0, float(turn.start)), min(duration, float(turn.end))
-        if start < prev_end:
-            start = prev_end
-        if end <= start:
-            continue
-
-        seg_dur = end - start
-        if seg_dur < min_segment_duration:
-            continue
-
-        if seg_dur > max_segment_duration:
-            n = int(math.ceil(seg_dur / max_segment_duration))
-            sub_d = seg_dur / n
-            for j in range(n):
-                s = start + j * sub_d
-                e = min(end, start + (j + 1) * sub_d)
-                if e <= s:
-                    continue
-                clip = audio[int(s * 1000):int(e * 1000)]
-                cp = clips_dir / f"segment_{i:03d}_{j:02d}.wav"
+    if diarization is not None:
+        for i, (turn, _, speaker) in enumerate(diarization.itertracks(yield_label=True)):
+            start, end = max(0.0, float(turn.start)), min(duration, float(turn.end))
+            if start < prev_end:
+                start = prev_end
+            if end <= start:
+                continue
+
+            seg_dur = end - start
+            if seg_dur < min_segment_duration:
+                continue
+
+            if seg_dur > max_segment_duration:
+                n = int(math.ceil(seg_dur / max_segment_duration))
+                sub_d = seg_dur / n
+                for j in range(n):
+                    s = start + j * sub_d
+                    e = min(end, start + (j + 1) * sub_d)
+                    if e <= s:
+                        continue
+                    clip = audio[int(s * 1000):int(e * 1000)]
+                    cp = clips_dir / f"segment_{i:03d}_{j:02d}.wav"
+                    clip.export(cp, format="wav")
+                    if speaker not in spk_map:
+                        spk_map[speaker] = len(spk_map)
+                    segments.append({"start": s, "end": e, "speaker": f"SPEAKER_{spk_map[speaker]:02d}"})
+                    clip_paths.append(str(cp))
+                    prev_end = e
+            else:
+                clip = audio[int(start * 1000):int(end * 1000)]
+                cp = clips_dir / f"segment_{i:03d}.wav"
                 clip.export(cp, format="wav")
                 if speaker not in spk_map:
                     spk_map[speaker] = len(spk_map)
-                segments.append({"start": s, "end": e, "speaker": f"SPEAKER_{spk_map[speaker]:02d}"})
+                segments.append({"start": start, "end": end, "speaker": f"SPEAKER_{spk_map[speaker]:02d}"})
                 clip_paths.append(str(cp))
-                prev_end = e
-        else:
-            clip = audio[int(start * 1000):int(end * 1000)]
-            cp = clips_dir / f"segment_{i:03d}.wav"
-            clip.export(cp, format="wav")
-            if speaker not in spk_map:
-                spk_map[speaker] = len(spk_map)
-            segments.append({"start": start, "end": end, "speaker": f"SPEAKER_{spk_map[speaker]:02d}"})
-            clip_paths.append(str(cp))
-            prev_end = end
+                prev_end = end
 
     if not segments:
         cp = clips_dir / "segment_000.wav"
```
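The loop logic itself is unchanged, only re-indented under the `if diarization is not None:` guard: turns are clipped to the audio bounds, overlap with the previous turn is trimmed, turns shorter than min_segment_duration are dropped, and turns longer than max_segment_duration are cut into n = ceil(dur / max) equal parts. A standalone sketch of just that split arithmetic (the 10 s cap and 25 s turn are illustrative values):

```python
# Illustrative sketch of the even-split rule applied to over-long turns.
import math

def split_turn(start, end, max_segment_duration=10.0):
    seg_dur = end - start
    n = int(math.ceil(seg_dur / max_segment_duration))  # number of pieces
    sub_d = seg_dur / n                                 # each piece <= cap
    return [(start + j * sub_d, min(end, start + (j + 1) * sub_d))
            for j in range(n)]

print(split_turn(0.0, 25.0))
# [(0.0, 8.33...), (8.33..., 16.66...), (16.66..., 25.0)]: three pieces,
# all under the 10 s cap, instead of one 25 s clip.
```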
 
```diff
@@ -441,7 +446,7 @@ def process_audio_for_video(
         "speaker": speakers[i] if i < len(speakers) else seg.get("speaker", f"SPEAKER_{i:02d}"),
         "text": trans[i] if i < len(trans) else "",
         "voice_embedding": embeddings[i],
-        "clip_path": str(out_dir / "clips" / f"segment_{i:03d}.wav"),
+        "clip_path": clip_paths[i] if i < len(clip_paths) else str(out_dir / "clips" / f"segment_{i:03d}.wav"),
         "lang": "ca",
         "lang_prob": 1.0,
     }
```
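This fix matters because diarize_audio can now emit sub-split clips named segment_{i:03d}_{j:02d}.wav, so rebuilding the name as segment_{i:03d}.wav would point at files that were never written; indexing into the clip_paths list returned by diarization keeps record and file aligned. A small illustration with made-up paths:

```python
# Made-up paths, showing why indexing clip_paths beats rebuilding names.
from pathlib import Path

out_dir = Path("/tmp/job0")
clip_paths = [
    str(out_dir / "clips" / "segment_000.wav"),
    str(out_dir / "clips" / "segment_001_00.wav"),  # sub-split clip
    str(out_dir / "clips" / "segment_001_01.wav"),
]

i = 1
rebuilt = str(out_dir / "clips" / f"segment_{i:03d}.wav")   # old behaviour
actual = clip_paths[i] if i < len(clip_paths) else rebuilt  # patched behaviour
print(rebuilt)  # /tmp/job0/clips/segment_001.wav  (never exported)
print(actual)   # /tmp/job0/clips/segment_001_00.wav (real file)
```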
character_detection.py CHANGED
```diff
@@ -55,8 +55,8 @@ class CharacterDetector:
         d.mkdir(parents=True, exist_ok=True)
 
     def extract_faces_embeddings(self, *, start_offset_sec: float = 3.0, extract_every_sec: float = 0.5,
-                                 detector_backend: str = 'retinaface', min_face_area: int = 900,
-                                 enforce_detection: bool = True) -> List[Dict[str, Any]]:
+                                 detector_backend: str = 'mtcnn', min_face_area: int = 400,
+                                 enforce_detection: bool = False) -> List[Dict[str, Any]]:
         """
         Extracts faces from the video and computes their embeddings using DeepFace directly.
```
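The new defaults trade precision for recall: mtcnn is a lighter detector than retinaface, a lower min_face_area keeps smaller faces, and enforce_detection=False makes DeepFace return quietly instead of raising on frames with no detectable face. A hedged sketch of how these parameters are typically passed to DeepFace; the method body is not part of this diff, and model_name "Facenet" is an assumption:

```python
# Sketch only: the real extract_faces_embeddings body is not in this diff.
from deepface import DeepFace

def frame_face_embeddings(frame_path: str,
                          detector_backend: str = 'mtcnn',
                          min_face_area: int = 400,
                          enforce_detection: bool = False):
    # enforce_detection=False keeps DeepFace from raising an exception
    # when a frame contains no detectable face.
    reps = DeepFace.represent(img_path=frame_path,
                              model_name="Facenet",  # assumed embedding model
                              detector_backend=detector_backend,
                              enforce_detection=enforce_detection)
    embeddings = []
    for r in reps:
        area = r.get("facial_area", {})
        if area.get("w", 0) * area.get("h", 0) >= min_face_area:  # drop tiny faces
            embeddings.append(r["embedding"])
    return embeddings
```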