Upload 3 files

- api.py +2 -2
- audio_tools.py +43 -38
- character_detection.py +2 -2
api.py
CHANGED

@@ -244,8 +244,8 @@ def process_video_job(job_id: str):
         if voice_embeddings:
             try:
                 Xv = np.array(voice_embeddings)
-                v_eps =
-                v_min = 1
+                v_eps = float(epsilon)
+                v_min = max(1, int(min_cluster_size))
                 v_labels = DBSCAN(eps=v_eps, min_samples=v_min, metric='euclidean').fit(Xv).labels_.tolist()
             except Exception as _e:
                 print(f"[{job_id}] WARN - Voice clustering failed: {_e}")
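This change replaces a truncated constant and a hard-coded min_samples with values derived from configuration. As a rough standalone sketch of the clustering step (the epsilon and min_cluster_size values below are stand-ins, not from the original file): DBSCAN labels every embedding and uses -1 for noise points that belong to no dense cluster.

import numpy as np
from sklearn.cluster import DBSCAN

# Dummy voice embeddings; real ones come from the diarization pipeline.
voice_embeddings = [np.random.rand(192).tolist() for _ in range(10)]

epsilon, min_cluster_size = 0.7, 2        # placeholder config values
Xv = np.array(voice_embeddings)
v_eps = float(epsilon)                    # neighborhood radius in embedding space
v_min = max(1, int(min_cluster_size))     # guard against zero/negative min_samples
labels = DBSCAN(eps=v_eps, min_samples=v_min, metric='euclidean').fit(Xv).labels_
print(labels.tolist())                    # -1 entries are unclustered "noise" voices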
audio_tools.py
CHANGED

@@ -145,11 +145,15 @@ def diarize_audio(
     audio = AudioSegment.from_wav(wav_path)
     duration = len(audio) / 1000.0

-
-
-
-
-
+    diarization = None
+    try:
+        pipeline = Pipeline.from_pretrained(
+            "pyannote/speaker-diarization-3.1",
+            use_auth_token=(hf_token_env or os.getenv("HF_TOKEN"))
+        )
+        diarization = pipeline(wav_path)
+    except Exception as e:
+        log.warning(f"Diarization unavailable, using single full segment fallback: {e}")

     clips_dir = (base_dir / clips_folder)
     clips_dir.mkdir(parents=True, exist_ok=True)
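The new block loads the gated pyannote/speaker-diarization-3.1 pipeline and degrades gracefully when it is unavailable (no token, no network, missing dependency), leaving diarization as None so the fallback path runs. A minimal standalone sketch of the same call, assuming pyannote.audio 3.x and an HF_TOKEN with access to the gated model:

import os
from pyannote.audio import Pipeline

# Requires accepting the model's terms on Hugging Face and exporting HF_TOKEN.
pipeline = Pipeline.from_pretrained(
    "pyannote/speaker-diarization-3.1",
    use_auth_token=os.getenv("HF_TOKEN"),
)
diarization = pipeline("audio.wav")  # hypothetical input file

# itertracks(yield_label=True) yields (segment, track_id, speaker_label),
# which is exactly what the rewritten loop below consumes.
for turn, _, speaker in diarization.itertracks(yield_label=True):
    print(f"{speaker}: {turn.start:.2f}s -> {turn.end:.2f}s")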
@@ -158,42 +162,43 @@ def diarize_audio(
     spk_map: Dict[str, int] = {}
     prev_end = 0.0

-
-
-
-    start
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    if diarization is not None:
+        for i, (turn, _, speaker) in enumerate(diarization.itertracks(yield_label=True)):
+            start, end = max(0.0, float(turn.start)), min(duration, float(turn.end))
+            if start < prev_end:
+                start = prev_end
+            if end <= start:
+                continue
+
+            seg_dur = end - start
+            if seg_dur < min_segment_duration:
+                continue
+
+            if seg_dur > max_segment_duration:
+                n = int(math.ceil(seg_dur / max_segment_duration))
+                sub_d = seg_dur / n
+                for j in range(n):
+                    s = start + j * sub_d
+                    e = min(end, start + (j + 1) * sub_d)
+                    if e <= s:
+                        continue
+                    clip = audio[int(s * 1000):int(e * 1000)]
+                    cp = clips_dir / f"segment_{i:03d}_{j:02d}.wav"
+                    clip.export(cp, format="wav")
+                    if speaker not in spk_map:
+                        spk_map[speaker] = len(spk_map)
+                    segments.append({"start": s, "end": e, "speaker": f"SPEAKER_{spk_map[speaker]:02d}"})
+                    clip_paths.append(str(cp))
+                    prev_end = e
+            else:
+                clip = audio[int(start * 1000):int(end * 1000)]
+                cp = clips_dir / f"segment_{i:03d}.wav"
                 clip.export(cp, format="wav")
                 if speaker not in spk_map:
                     spk_map[speaker] = len(spk_map)
-                segments.append({"start":
+                segments.append({"start": start, "end": end, "speaker": f"SPEAKER_{spk_map[speaker]:02d}"})
                 clip_paths.append(str(cp))
-                prev_end =
-            else:
-                clip = audio[int(start * 1000):int(end * 1000)]
-                cp = clips_dir / f"segment_{i:03d}.wav"
-                clip.export(cp, format="wav")
-                if speaker not in spk_map:
-                    spk_map[speaker] = len(spk_map)
-                segments.append({"start": start, "end": end, "speaker": f"SPEAKER_{spk_map[speaker]:02d}"})
-                clip_paths.append(str(cp))
-                prev_end = end
+                prev_end = end

     if not segments:
         cp = clips_dir / "segment_000.wav"
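The rewritten loop clamps each diarized turn to the audio bounds, drops turns shorter than min_segment_duration, and splits turns longer than max_segment_duration into n equal sub-segments, so every exported clip stays under the cap. The splitting arithmetic in isolation (the values below are hypothetical):

import math

start, end = 12.0, 35.0              # hypothetical diarized turn
max_segment_duration = 10.0

seg_dur = end - start                # 23.0 s, over the cap
n = int(math.ceil(seg_dur / max_segment_duration))   # ceil(2.3) = 3 slices
sub_d = seg_dur / n                  # 23/3 ~ 7.67 s, guaranteed <= the cap
bounds = [(start + j * sub_d, min(end, start + (j + 1) * sub_d)) for j in range(n)]
print(bounds)                        # ~ [(12.0, 19.67), (19.67, 27.33), (27.33, 35.0)]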
@@ -441,7 +446,7 @@ def process_audio_for_video(
                 "speaker": speakers[i] if i < len(speakers) else seg.get("speaker", f"SPEAKER_{i:02d}"),
                 "text": trans[i] if i < len(trans) else "",
                 "voice_embedding": embeddings[i],
-                "clip_path": str(out_dir / "clips" / f"segment_{i:03d}.wav"),
+                "clip_path": clip_paths[i] if i < len(clip_paths) else str(out_dir / "clips" / f"segment_{i:03d}.wav"),
                 "lang": "ca",
                 "lang_prob": 1.0,
             }
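This fallback matters because split sub-segments are now written as segment_{i:03d}_{j:02d}.wav, a name the old reconstruction could never produce; preferring the recorded clip_paths[i] keeps clip_path pointing at a file that actually exists. A tiny sketch with made-up paths:

from pathlib import Path

out_dir = Path("/tmp/job")            # hypothetical output directory
clip_paths = ["/tmp/job/clips/segment_000_01.wav"]  # a split sub-segment

i = 0
clip_path = clip_paths[i] if i < len(clip_paths) else str(out_dir / "clips" / f"segment_{i:03d}.wav")
print(clip_path)                      # the recorded file, not a guessed name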
character_detection.py
CHANGED

@@ -55,8 +55,8 @@ class CharacterDetector:
             d.mkdir(parents=True, exist_ok=True)

     def extract_faces_embeddings(self, *, start_offset_sec: float = 3.0, extract_every_sec: float = 0.5,
-                                 detector_backend: str = '
-                                 enforce_detection: bool =
+                                 detector_backend: str = 'mtcnn', min_face_area: int = 400,
+                                 enforce_detection: bool = False) -> List[Dict[str, Any]]:
         """
         Extracts faces from the video and computes their embeddings using DeepFace directly.

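The new defaults fill in the previously truncated signature: MTCNN as the detection backend, enforce_detection=False so frames without faces do not raise, and a min_face_area threshold. A hedged sketch of what this looks like against the DeepFace.represent API; the area filter shown is one plausible reading of min_face_area, not the original implementation:

from deepface import DeepFace

results = DeepFace.represent(
    img_path="frame.jpg",            # hypothetical extracted video frame
    detector_backend="mtcnn",        # new default: MTCNN face detector
    enforce_detection=False,         # return gracefully if no face is found
)

min_face_area = 400
for r in results:
    # Each result carries an "embedding" and a "facial_area" dict (x, y, w, h).
    area = r["facial_area"]["w"] * r["facial_area"]["h"]
    if area >= min_face_area:        # drop tiny detections (assumed semantics)
        print(len(r["embedding"]), area)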