Upload 3 files
Browse files
- asr_client.py +140 -132
- preprocessing_router.py +308 -6
- svision_client.py +123 -116
asr_client.py
CHANGED
@@ -1,132 +1,140 @@
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

from gradio_client import Client, handle_file
from typing import Any, Dict, List
from PIL import Image
import json

# Lazy initialization to avoid crash if Space is down at import time
_asr_client = None


def _get_asr_client():
    """Get or create the ASR client (lazy initialization)."""
    global _asr_client
    if _asr_client is None:
        _asr_client = Client("VeuReu/asr")
    return _asr_client


def extract_audio_from_video(video_path: str) -> str:
    """
    Call the /extract_audio_ffmpeg endpoint of the remote VeuReu/asr Space.

    This function uploads a video file to the remote ASR service and extracts its audio track.

    Parameters
    ----------
    video_path : str
        Path to the input video file from which audio will be extracted.

    Returns
    -------
    str
        Path or identifier of the extracted audio file returned by the remote service.
    """
    result = _get_asr_client().predict(
        video_file={"video": handle_file(video_path)},
        api_name="/extract_audio_ffmpeg"
    )
    return result


def diarize_audio(audio_path: str) -> str:
    """
    Call the /diaritzar_audio endpoint of the remote VeuReu/asr Space.

    This function performs speaker diarization, identifying segments of speech
    belonging to different speakers in the audio file.

    Parameters
    ----------
    audio_path : str
        Path to the audio file to be diarized.

    Returns
    -------
    str
        JSON-like diarization output containing speaker segments and timings.
    """
    result = _get_asr_client().predict(
        wav_archivo=handle_file(audio_path),
        api_name="/diaritzar_audio"
    )
    return result


def transcribe_long_audio(audio_path: str) -> str:
    """
    Call the /transcribe_long_audio endpoint of the remote VeuReu/asr Space.

    Designed for long audio recordings, this function sends the audio to the ASR model
    optimized for processing extended durations.

    Parameters
    ----------
    audio_path : str
        Path to the long audio file to be transcribed.

    Returns
    -------
    str
        Transcribed text returned by the remote ASR service.
    """
    result = _get_asr_client().predict(
        wav_path=handle_file(audio_path),
        api_name="/transcribe_long_audio"
    )
    return result


def transcribe_short_audio(audio_path: str) -> str:
    """
    Call the /transcribe_wav endpoint of the remote VeuReu/asr Space.

    This function is optimized for short-duration audio samples and produces fast transcriptions.

    Parameters
    ----------
    audio_path : str
        Path to the short audio file to be transcribed.

    Returns
    -------
    str
        Transcribed text returned by the remote service.
    """
    result = _get_asr_client().predict(
        wav_path=handle_file(audio_path),
        api_name="/transcribe_wav"
    )
    return result


def identificar_veu(clip_path: str, voice_col: List[Dict[str, Any]]):
    """
    Call the /identificar_veu endpoint of the remote VeuReu/asr Space.

    This function attempts to identify which known speaker (from a provided
    collection of voice profiles) appears in the given audio clip.

    Parameters
    ----------
    clip_path : str
        Path to the audio clip whose speaker is to be identified.
    voice_col : List[Dict[str, Any]]
        List of dictionaries containing metadata or embeddings for known voices.

    Returns
    -------
    Any
        Output returned by the remote speaker identification model.
    """
    voice_col_str = json.dumps(voice_col)
    result = _get_asr_client().predict(
        wav_archivo=handle_file(clip_path),
        voice_col=voice_col_str,
        api_name="/identificar_veu"
    )
    return result
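All of these wrappers funnel through the same lazily created Gradio client, so a caller only handles file paths and plain Python data. A minimal usage sketch, not part of the diff: the file names and the voice_profiles structure below are illustrative assumptions, not something this module defines.

# Illustrative only: paths and the shape of voice_profiles are assumptions.
from asr_client import extract_audio_from_video, diarize_audio, transcribe_long_audio, identificar_veu

audio_path = extract_audio_from_video("video.mp4")   # remote ffmpeg extraction
segments = diarize_audio(audio_path)                 # speaker segments as JSON-like text
transcript = transcribe_long_audio(audio_path)       # full transcription

# A previously built voice collection would normally be passed here.
voice_profiles = [{"name": "speaker_1", "embedding": [0.0] * 192}]
match = identificar_veu("clip.wav", voice_profiles)
print(transcript, match)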
preprocessing_router.py
CHANGED
@@ -5,10 +5,12 @@ from fastapi.responses import FileResponse
from pathlib import Path
from datetime import datetime
from enum import Enum
from typing import Dict, Any
import shutil
import os
import uuid
import numpy as np
import cv2

from video_processing import process_video_pipeline
from audio_tools import process_audio_for_video, extract_audio_ffmpeg, embed_voice_segments, VoiceEmbedder

@@ -40,6 +42,70 @@ class JobStatus(str, Enum):
jobs: Dict[str, dict] = {}


# ---------------------------------------------------------------------------
# Helper functions for face detection and clustering
# ---------------------------------------------------------------------------

def normalize_face_lighting(image):
    """Normalize face brightness using CLAHE and range normalization."""
    lab = cv2.cvtColor(image, cv2.COLOR_BGR2LAB)
    l, a, b = cv2.split(lab)
    clahe = cv2.createCLAHE(clipLimit=3.0, tileGridSize=(8, 8))
    l_clahe = clahe.apply(l)
    l_min, l_max = l_clahe.min(), l_clahe.max()
    if l_max > l_min:
        l_normalized = ((l_clahe - l_min) * 255.0 / (l_max - l_min)).astype(np.uint8)
    else:
        l_normalized = l_clahe
    l_normalized = cv2.GaussianBlur(l_normalized, (3, 3), 0)
    lab_normalized = cv2.merge([l_normalized, a, b])
    normalized = cv2.cvtColor(lab_normalized, cv2.COLOR_LAB2BGR)
    return normalized


def hierarchical_cluster_with_min_size(X, max_groups: int, min_cluster_size: int, sensitivity: float = 0.5) -> np.ndarray:
    """Hierarchical clustering with silhouette score and minimum cluster size."""
    from scipy.cluster.hierarchy import linkage, fcluster
    from sklearn.metrics import silhouette_score
    from collections import Counter

    if len(X) == 0:
        return np.array([])
    if len(X) < min_cluster_size:
        return np.full(len(X), -1, dtype=int)

    Z = linkage(X, method='average', metric='cosine')
    best_n_clusters = 2
    best_score = -1
    max_to_try = min(max_groups, len(X) - 1)

    if max_to_try >= 2:
        for n_clusters in range(2, max_to_try + 1):
            trial_labels = fcluster(Z, t=n_clusters, criterion='maxclust') - 1
            trial_counts = Counter(trial_labels)
            valid_clusters = sum(1 for count in trial_counts.values() if count >= min_cluster_size)
            if valid_clusters >= 2:
                try:
                    score = silhouette_score(X, trial_labels, metric='cosine')
                    # Penalize larger cluster counts; higher sensitivity lowers the penalty
                    penalty = 0.14 - (sensitivity * 0.13)
                    adjusted_score = score - (n_clusters * penalty)
                    if adjusted_score > best_score:
                        best_score = adjusted_score
                        best_n_clusters = n_clusters
                except Exception:
                    pass

    labels = fcluster(Z, t=best_n_clusters, criterion='maxclust') - 1
    label_counts = Counter(labels)
    filtered_labels = []
    for lbl in labels:
        if label_counts[lbl] >= min_cluster_size:
            filtered_labels.append(lbl)
        else:
            filtered_labels.append(-1)
    return np.array(filtered_labels, dtype=int)


router = APIRouter(tags=["Preprocessing Manager"])

@@ -346,9 +412,245 @@ async def detect_scenes(

def process_video_job(job_id: str):
    """Process video job in background: detect faces, cluster, validate."""
    try:
        job = jobs[job_id]
        print(f"[{job_id}] Starting processing...")

        job["status"] = JobStatus.PROCESSING

        video_path = job["video_path"]
        video_name = job["video_name"]
        max_groups = int(job.get("max_groups", 5))
        min_cluster_size = int(job.get("min_cluster_size", 3))
        face_sensitivity = float(job.get("face_sensitivity", 0.5))

        base = TEMP_ROOT / video_name
        base.mkdir(parents=True, exist_ok=True)
        print(f"[{job_id}] Base directory: {base}")

        try:
            print(f"[{job_id}] Starting character detection...")
            try:
                import face_recognition
                _use_fr = True
                print(f"[{job_id}] face_recognition available: CPU")
            except Exception:
                face_recognition = None
                _use_fr = False
                print(f"[{job_id}] face_recognition not available. Trying DeepFace fallback.")
                try:
                    from deepface import DeepFace
                except Exception:
                    DeepFace = None

            cap = cv2.VideoCapture(video_path)
            if not cap.isOpened():
                raise RuntimeError("Could not open the video for face extraction")
            fps = cap.get(cv2.CAP_PROP_FPS) or 25.0
            total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT) or 0)
            max_samples = job.get("max_frames", 100)

            # Sample frames evenly across the whole video
            if total_frames > 0:
                frame_indices = sorted(set(np.linspace(0, max(0, total_frames - 1), num=min(max_samples, max(1, total_frames)), dtype=int).tolist()))
            else:
                frame_indices = []
            print(f"[{job_id}] Total frames: {total_frames}, FPS: {fps:.2f}, sampling {len(frame_indices)} frames")

            faces_root = base / "faces_raw"
            faces_root.mkdir(parents=True, exist_ok=True)
            embeddings: list[list[float]] = []
            crops_meta: list[dict] = []

            saved_count = 0
            frames_processed = 0
            frames_with_faces = 0

            for frame_idx in frame_indices:
                cap.set(cv2.CAP_PROP_POS_FRAMES, int(frame_idx))
                ret2, frame = cap.read()
                if not ret2:
                    continue
                frames_processed += 1
                frame_normalized = normalize_face_lighting(frame)
                rgb = cv2.cvtColor(frame_normalized, cv2.COLOR_BGR2RGB)

                if _use_fr and face_recognition is not None:
                    boxes = face_recognition.face_locations(rgb, model="hog")
                    encs = face_recognition.face_encodings(rgb, boxes)
                    if boxes:
                        frames_with_faces += 1
                    for (top, right, bottom, left), e in zip(boxes, encs):
                        crop = frame_normalized[top:bottom, left:right]
                        if crop.size == 0:
                            continue
                        fn = f"face_{frame_idx:06d}_{saved_count:03d}.jpg"
                        cv2.imwrite(str(faces_root / fn), crop)
                        e = np.array(e, dtype=float)
                        e = e / (np.linalg.norm(e) + 1e-9)
                        embeddings.append(e.astype(float).tolist())
                        crops_meta.append({"file": fn, "frame": frame_idx, "box": [int(top), int(right), int(bottom), int(left)]})
                        saved_count += 1
                else:
                    # Fallback: Haar cascade detection + DeepFace embeddings
                    if DeepFace is not None:
                        try:
                            gray = cv2.cvtColor(frame_normalized, cv2.COLOR_BGR2GRAY)
                            haar_path = getattr(cv2.data, 'haarcascades', None) or ''
                            face_cascade = cv2.CascadeClassifier(os.path.join(haar_path, 'haarcascade_frontalface_default.xml'))
                            boxes_haar = []
                            if face_cascade is not None and not face_cascade.empty():
                                faces_haar = face_cascade.detectMultiScale(gray, scaleFactor=1.08, minNeighbors=5, minSize=(50, 50))
                                for (x, y, w, h) in faces_haar:
                                    top, left, bottom, right = max(0, y), max(0, x), min(frame.shape[0], y+h), min(frame.shape[1], x+w)
                                    boxes_haar.append((top, right, bottom, left))

                            if boxes_haar:
                                frames_with_faces += 1

                            for (top, right, bottom, left) in boxes_haar:
                                crop = frame_normalized[top:bottom, left:right]
                                if crop.size == 0:
                                    continue
                                fn = f"face_{frame_idx:06d}_{saved_count:03d}.jpg"
                                crop_path = faces_root / fn
                                cv2.imwrite(str(crop_path), crop)
                                reps = DeepFace.represent(img_path=str(crop_path), model_name="Facenet512", enforce_detection=False)
                                for r in (reps or []):
                                    emb = r.get("embedding") if isinstance(r, dict) else r
                                    if emb is None:
                                        continue
                                    emb = np.array(emb, dtype=float)
                                    emb = emb / (np.linalg.norm(emb) + 1e-9)
                                    embeddings.append(emb.astype(float).tolist())
                                    crops_meta.append({"file": fn, "frame": frame_idx, "box": [int(top), int(right), int(bottom), int(left)]})
                                    saved_count += 1
                        except Exception as _e_df:
                            print(f"[{job_id}] DeepFace fallback error: {_e_df}")
            cap.release()

            print(f"[{job_id}] ✓ Frames processed: {frames_processed}/{len(frame_indices)}")
            print(f"[{job_id}] ✓ Frames with faces: {frames_with_faces}")
            print(f"[{job_id}] ✓ Faces detected: {len(embeddings)}")

            # Clustering
            if embeddings:
                Xf = np.array(embeddings)
                labels = hierarchical_cluster_with_min_size(Xf, max_groups, min_cluster_size, face_sensitivity).tolist()
                print(f"[{job_id}] Clustering: {len(set([l for l in labels if l >= 0]))} clusters")
            else:
                labels = []

            # Build character folders with validation
            try:
                from face_classifier import validate_and_classify_face, FACE_CONFIDENCE_THRESHOLD
            except ImportError:
                validate_and_classify_face = None
                FACE_CONFIDENCE_THRESHOLD = 0.5

            characters_validated: list[dict[str, Any]] = []
            cluster_map: dict[int, list[int]] = {}
            for idx, lbl in enumerate(labels):
                if isinstance(lbl, int) and lbl >= 0:
                    cluster_map.setdefault(lbl, []).append(idx)

            chars_dir = base / "characters"
            chars_dir.mkdir(parents=True, exist_ok=True)

            for ci, idxs in sorted(cluster_map.items(), key=lambda x: x[0]):
                char_id = f"char_{ci:02d}"
                detections: list[dict[str, Any]] = []
                for j in idxs:
                    meta = crops_meta[j]
                    file_name = meta.get("file")
                    if not file_name:
                        continue
                    box = meta.get("box", [0, 0, 0, 0])
                    area = abs(box[1] - box[3]) * abs(box[2] - box[0]) if len(box) >= 4 else 0
                    detections.append({"index": j, "file": file_name, "score": area, "box": box})

                if not detections:
                    continue

                detections.sort(key=lambda d: d["score"], reverse=True)
                best_face = detections[0]
                best_face_path = faces_root / best_face["file"]

                # Validation (optional)
                validation = None
                if validate_and_classify_face is not None:
                    try:
                        validation = validate_and_classify_face(str(best_face_path))
                    except Exception:
                        validation = None

                if validation and not validation.get("is_valid_face", True):
                    if validation.get("face_confidence", 1.0) < FACE_CONFIDENCE_THRESHOLD:
                        continue

                out_dir = chars_dir / char_id
                out_dir.mkdir(parents=True, exist_ok=True)

                total_faces = len(detections)
                max_faces_to_show = (total_faces // 2) + 1
                selected = detections[:max_faces_to_show]

                files: list[str] = []
                file_urls: list[str] = []
                for det in selected:
                    fname = det["file"]
                    src = faces_root / fname
                    dst = out_dir / fname
                    try:
                        shutil.copy2(src, dst)
                        files.append(fname)
                        file_urls.append(f"/files/{video_name}/{char_id}/{fname}")
                    except Exception:
                        pass

                rep = files[0] if files else None
                if rep:
                    try:
                        shutil.copy2(out_dir / rep, out_dir / "representative.jpg")
                    except Exception:
                        pass

                cluster_number = int(char_id.split("_")[1]) + 1
                character_name = f"Cluster {cluster_number}"
                gender = validation.get("gender", "Neutral") if validation else "Neutral"

                characters_validated.append({
                    "id": char_id,
                    "name": character_name,
                    "gender": gender,
                    "folder": str(out_dir),
                    "num_faces": len(files),
                    "total_faces_detected": total_faces,
                    "image_url": f"/files/{video_name}/{char_id}/representative.jpg" if rep else "",
                    "face_files": file_urls,
                })
                print(f"[{job_id}] ✓ Cluster {char_id}: {len(files)} faces")

            print(f"[{job_id}] ✓ Total: {len(characters_validated)} valid characters")

            job["results"] = {
                "characters": characters_validated,
                "face_labels": labels,
                "video_name": video_name,
                "base_dir": str(base),
            }
            job["status"] = JobStatus.DONE
            print(f"[{job_id}] ✓ Processing finished")

        except Exception as face_error:
            print(f"[{job_id}] Face detection error: {face_error}")
            import traceback
            traceback.print_exc()
            job["results"] = {"characters": [], "face_labels": [], "video_name": video_name, "base_dir": str(base)}
            job["status"] = JobStatus.DONE

    except Exception as e:
        print(f"[{job_id}] General error: {e}")
        import traceback
        traceback.print_exc()
        job["status"] = JobStatus.FAILED
        job["error"] = str(e)
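As a quick sanity check of the hierarchical_cluster_with_min_size helper added above, the following sketch runs it on synthetic embeddings; the array shapes and the expected grouping are only assumptions about how the real face embeddings behave.

# Synthetic example, not part of the router: two tight groups of unit vectors
# plus one outlier, clustered with the helper defined in this file.
import numpy as np

rng = np.random.default_rng(0)
group_a = rng.normal(loc=1.0, scale=0.01, size=(5, 128))
group_b = rng.normal(loc=-1.0, scale=0.01, size=(5, 128))
outlier = rng.normal(size=(1, 128))
X = np.vstack([group_a, group_b, outlier])
X = X / np.linalg.norm(X, axis=1, keepdims=True)  # cosine-normalize, like the face embeddings

labels = hierarchical_cluster_with_min_size(X, max_groups=5, min_cluster_size=3, sensitivity=0.5)
print(labels)  # one label per row; clusters smaller than min_cluster_size are mapped to -1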
svision_client.py
CHANGED
@@ -1,116 +1,123 @@
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

from gradio_client import Client, handle_file
from typing import Any, Dict, List, Optional, Tuple, Union
import json

# Lazy initialization to avoid crash if Space is down at import time
_svision_client = None


def _get_svision_client():
    """Get or create the svision client (lazy initialization)."""
    global _svision_client
    if _svision_client is None:
        _svision_client = Client("VeuReu/svision")
    return _svision_client


def extract_scenes(video_path: str, threshold: float = 30.0, offset_frames: int = 5, crop_ratio: float = 0.1):
    """
    Call the /scenes_extraction endpoint of the remote Space VeuReu/svision.

    Parameters
    ----------
    video_path : str
        Path to the input video file.
    threshold : float, optional
        Scene change detection threshold; higher values make detection less sensitive.
    offset_frames : int, optional
        Number of frames to include before and after a detected scene boundary.
    crop_ratio : float, optional
        Ratio for cropping borders before performing scene detection.

    Returns
    -------
    Any
        Response returned by the remote /scenes_extraction endpoint.
    """
    result = _get_svision_client().predict(
        video_file={"video": handle_file(video_path)},
        threshold=threshold,
        offset_frames=offset_frames,
        crop_ratio=crop_ratio,
        api_name="/scenes_extraction"
    )
    return result


def keyframes_every_second_extraction(video_path: str):
    """
    Call the /keyframes_every_second_extraction endpoint of the remote Space VeuReu/svision.

    Parameters
    ----------
    video_path : str
        Path to the input video file.

    Returns
    -------
    Any
        Response returned by the remote /keyframes_every_second_extraction endpoint.
    """
    result = _get_svision_client().predict(
        video_path={"video": handle_file(video_path)},
        api_name="/keyframes_every_second_extraction"
    )
    return result


def add_ocr_and_faces(imagen_path: str, informacion_image: Dict[str, Any], face_col: List[Dict[str, Any]]) -> Dict[str, Any]:
    """
    Call the /add_ocr_and_faces endpoint of the remote Space VeuReu/svision.

    This function sends an image together with metadata and face collection data
    to perform OCR, face detection, and annotation enhancement.

    Parameters
    ----------
    imagen_path : str
        Path to the input image file.
    informacion_image : Dict[str, Any]
        Dictionary containing image-related metadata.
    face_col : List[Dict[str, Any]]
        List of dictionaries representing detected faces or face metadata.

    Returns
    -------
    Dict[str, Any]
        Processed output containing OCR results, face detection data, and annotations.
    """
    informacion_image_str = json.dumps(informacion_image)
    face_col_str = json.dumps(face_col)
    result = _get_svision_client().predict(
        image=handle_file(imagen_path),
        informacion_image=informacion_image_str,
        face_col=face_col_str,
        api_name="/add_ocr_and_faces"
    )
    return result


def extract_descripcion_escena(imagen_path: str) -> str:
    """
    Call the /describe_images endpoint of the remote Space VeuReu/svision.

    This function sends an image to receive a textual description of its visual content.

    Parameters
    ----------
    imagen_path : str
        Path to the input image file.

    Returns
    -------
    str
        Description generated for the given image.
    """
    result = _get_svision_client().predict(
        images=[{"image": handle_file(imagen_path)}],
        api_name="/describe_images"
    )
    return result