VeuReu committed
Commit c8c329a · verified · 1 Parent(s): c27f43c

Upload 3 files

Files changed (3):
  1. asr_client.py +140 -132
  2. preprocessing_router.py +308 -6
  3. svision_client.py +123 -116
asr_client.py CHANGED
@@ -1,132 +1,140 @@
- import os
- os.environ["CUDA_VISIBLE_DEVICES"] = "1"
-
- from gradio_client import Client, handle_file
- from typing import Any, Dict, List
- from PIL import Image
- import json
-
- # Connect to the remote Space
- asr_client = Client("VeuReu/asr")
-
-
- def extract_audio_from_video(video_path: str) -> str:
-     """
-     Call the /extract_audio_ffmpeg endpoint of the remote VeuReu/asr Space.
-
-     This function uploads a video file to the remote ASR service and extracts its audio track.
-
-     Parameters
-     ----------
-     video_path : str
-         Path to the input video file from which audio will be extracted.
-
-     Returns
-     -------
-     str
-         Path or identifier of the extracted audio file returned by the remote service.
-     """
-     result = asr_client.predict(
-         video_file={"video": handle_file(video_path)},
-         api_name="/extract_audio_ffmpeg"
-     )
-     return result
-
-
- def diarize_audio(audio_path: str) -> str:
-     """
-     Call the /diaritzar_audio endpoint of the remote VeuReu/asr Space.
-
-     This function performs speaker diarization, identifying segments of speech
-     belonging to different speakers in the audio file.
-
-     Parameters
-     ----------
-     audio_path : str
-         Path to the audio file to be diarized.
-
-     Returns
-     -------
-     str
-         JSON-like diarization output containing speaker segments and timings.
-     """
-     result = asr_client.predict(
-         wav_archivo=handle_file(audio_path),
-         api_name="/diaritzar_audio"
-     )
-     return result
-
-
- def transcribe_long_audio(audio_path: str) -> str:
-     """
-     Call the /transcribe_long_audio endpoint of the remote VeuReu/asr Space.
-
-     Designed for long audio recordings, this function sends the audio to the ASR model
-     optimized for processing extended durations.
-
-     Parameters
-     ----------
-     audio_path : str
-         Path to the long audio file to be transcribed.
-
-     Returns
-     -------
-     str
-         Transcribed text returned by the remote ASR service.
-     """
-     result = asr_client.predict(
-         wav_path=handle_file(audio_path),
-         api_name="/transcribe_long_audio"
-     )
-     return result
-
-
- def transcribe_short_audio(audio_path: str) -> str:
-     """
-     Call the /transcribe_wav endpoint of the remote VeuReu/asr Space.
-
-     This function is optimized for short-duration audio samples and produces fast transcriptions.
-
-     Parameters
-     ----------
-     audio_path : str
-         Path to the short audio file to be transcribed.
-
-     Returns
-     -------
-     str
-         Transcribed text returned by the remote service.
-     """
-     result = asr_client.predict(
-         wav_path=handle_file(audio_path),
-         api_name="/transcribe_wav"
-     )
-     return result
-
-
- def identificar_veu(clip_path: str, voice_col: List[Dict[str, Any]]):
-     """
-     Call the /identificar_veu endpoint of the remote VeuReu/asr Space.
-
-     This function attempts to identify which known speaker (from a provided
-     collection of voice profiles) appears in the given audio clip.
-
-     Parameters
-     ----------
-     clip_path : str
-         Path to the audio clip whose speaker is to be identified.
-     voice_col : List[Dict[str, Any]]
-         List of dictionaries containing metadata or embeddings for known voices.
-
-     Returns
-     -------
-     Any
-         Output returned by the remote speaker identification model.
-     """
-     voice_col_str = json.dumps(voice_col)
-     result = asr_client.predict(
-         wav_archivo=handle_file(clip_path),
-         voice_col=voice_col_str,
-         api_name="/identificar_veu"
-     )
-     return result
+ import os
+ os.environ["CUDA_VISIBLE_DEVICES"] = "1"
+
+ from gradio_client import Client, handle_file
+ from typing import Any, Dict, List
+ from PIL import Image
+ import json
+
+ # Lazy initialization to avoid crash if Space is down at import time
+ _asr_client = None
+
+
+ def _get_asr_client():
+     """Get or create the ASR client (lazy initialization)."""
+     global _asr_client
+     if _asr_client is None:
+         _asr_client = Client("VeuReu/asr")
+     return _asr_client
+
+
+ def extract_audio_from_video(video_path: str) -> str:
+     """
+     Call the /extract_audio_ffmpeg endpoint of the remote VeuReu/asr Space.
+
+     This function uploads a video file to the remote ASR service and extracts its audio track.
+
+     Parameters
+     ----------
+     video_path : str
+         Path to the input video file from which audio will be extracted.
+
+     Returns
+     -------
+     str
+         Path or identifier of the extracted audio file returned by the remote service.
+     """
+     result = _get_asr_client().predict(
+         video_file={"video": handle_file(video_path)},
+         api_name="/extract_audio_ffmpeg"
+     )
+     return result
+
+
+ def diarize_audio(audio_path: str) -> str:
+     """
+     Call the /diaritzar_audio endpoint of the remote VeuReu/asr Space.
+
+     This function performs speaker diarization, identifying segments of speech
+     belonging to different speakers in the audio file.
+
+     Parameters
+     ----------
+     audio_path : str
+         Path to the audio file to be diarized.
+
+     Returns
+     -------
+     str
+         JSON-like diarization output containing speaker segments and timings.
+     """
+     result = _get_asr_client().predict(
+         wav_archivo=handle_file(audio_path),
+         api_name="/diaritzar_audio"
+     )
+     return result
+
+
+ def transcribe_long_audio(audio_path: str) -> str:
+     """
+     Call the /transcribe_long_audio endpoint of the remote VeuReu/asr Space.
+
+     Designed for long audio recordings, this function sends the audio to the ASR model
+     optimized for processing extended durations.
+
+     Parameters
+     ----------
+     audio_path : str
+         Path to the long audio file to be transcribed.
+
+     Returns
+     -------
+     str
+         Transcribed text returned by the remote ASR service.
+     """
+     result = _get_asr_client().predict(
+         wav_path=handle_file(audio_path),
+         api_name="/transcribe_long_audio"
+     )
+     return result
+
+
+ def transcribe_short_audio(audio_path: str) -> str:
+     """
+     Call the /transcribe_wav endpoint of the remote VeuReu/asr Space.
+
+     This function is optimized for short-duration audio samples and produces fast transcriptions.
+
+     Parameters
+     ----------
+     audio_path : str
+         Path to the short audio file to be transcribed.
+
+     Returns
+     -------
+     str
+         Transcribed text returned by the remote service.
+     """
+     result = _get_asr_client().predict(
+         wav_path=handle_file(audio_path),
+         api_name="/transcribe_wav"
+     )
+     return result
+
+
+ def identificar_veu(clip_path: str, voice_col: List[Dict[str, Any]]):
+     """
+     Call the /identificar_veu endpoint of the remote VeuReu/asr Space.
+
+     This function attempts to identify which known speaker (from a provided
+     collection of voice profiles) appears in the given audio clip.
+
+     Parameters
+     ----------
+     clip_path : str
+         Path to the audio clip whose speaker is to be identified.
+     voice_col : List[Dict[str, Any]]
+         List of dictionaries containing metadata or embeddings for known voices.
+
+     Returns
+     -------
+     Any
+         Output returned by the remote speaker identification model.
+     """
+     voice_col_str = json.dumps(voice_col)
+     result = _get_asr_client().predict(
+         wav_archivo=handle_file(clip_path),
+         voice_col=voice_col_str,
+         api_name="/identificar_veu"
+     )
+     return result
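
For context, a minimal usage sketch of these wrappers after the lazy-initialization change (the file names are illustrative assumptions, not part of the commit; the first call is what opens the connection to the VeuReu/asr Space):

    from asr_client import extract_audio_from_video, diarize_audio, transcribe_long_audio

    # The first call triggers the lazy Client("VeuReu/asr") connection.
    audio_path = extract_audio_from_video("sample_video.mp4")   # hypothetical input path
    segments = diarize_audio(audio_path)        # speaker segments as returned by the Space
    transcript = transcribe_long_audio(audio_path)
    print(segments)
    print(transcript)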
preprocessing_router.py CHANGED
@@ -5,10 +5,12 @@ from fastapi.responses import FileResponse
  from pathlib import Path
  from datetime import datetime
  from enum import Enum
- from typing import Dict
+ from typing import Dict, Any
  import shutil
  import os
  import uuid
+ import numpy as np
+ import cv2

  from video_processing import process_video_pipeline
  from audio_tools import process_audio_for_video, extract_audio_ffmpeg, embed_voice_segments, VoiceEmbedder
@@ -40,6 +42,70 @@ class JobStatus(str, Enum):
  jobs: Dict[str, dict] = {}


+ # ---------------------------------------------------------------------------
+ # Helper functions for face detection and clustering
+ # ---------------------------------------------------------------------------
+
+ def normalize_face_lighting(image):
+     """Normalize face brightness using CLAHE and range normalization."""
+     lab = cv2.cvtColor(image, cv2.COLOR_BGR2LAB)
+     l, a, b = cv2.split(lab)
+     clahe = cv2.createCLAHE(clipLimit=3.0, tileGridSize=(8, 8))
+     l_clahe = clahe.apply(l)
+     l_min, l_max = l_clahe.min(), l_clahe.max()
+     if l_max > l_min:
+         l_normalized = ((l_clahe - l_min) * 255.0 / (l_max - l_min)).astype(np.uint8)
+     else:
+         l_normalized = l_clahe
+     l_normalized = cv2.GaussianBlur(l_normalized, (3, 3), 0)
+     lab_normalized = cv2.merge([l_normalized, a, b])
+     normalized = cv2.cvtColor(lab_normalized, cv2.COLOR_LAB2BGR)
+     return normalized
+
+
+ def hierarchical_cluster_with_min_size(X, max_groups: int, min_cluster_size: int, sensitivity: float = 0.5) -> np.ndarray:
+     """Hierarchical clustering with silhouette score and minimum cluster size."""
+     from scipy.cluster.hierarchy import linkage, fcluster
+     from sklearn.metrics import silhouette_score
+     from collections import Counter
+
+     if len(X) == 0:
+         return np.array([])
+     if len(X) < min_cluster_size:
+         return np.full(len(X), -1, dtype=int)
+
+     Z = linkage(X, method='average', metric='cosine')
+     best_n_clusters = 2
+     best_score = -1
+     max_to_try = min(max_groups, len(X) - 1)
+
+     if max_to_try >= 2:
+         for n_clusters in range(2, max_to_try + 1):
+             trial_labels = fcluster(Z, t=n_clusters, criterion='maxclust') - 1
+             trial_counts = Counter(trial_labels)
+             valid_clusters = sum(1 for count in trial_counts.values() if count >= min_cluster_size)
+             if valid_clusters >= 2:
+                 try:
+                     score = silhouette_score(X, trial_labels, metric='cosine')
+                     penalty = 0.14 - (sensitivity * 0.13)
+                     adjusted_score = score - (n_clusters * penalty)
+                     if adjusted_score > best_score:
+                         best_score = adjusted_score
+                         best_n_clusters = n_clusters
+                 except Exception:
+                     pass
+
+     labels = fcluster(Z, t=best_n_clusters, criterion='maxclust') - 1
+     label_counts = Counter(labels)
+     filtered_labels = []
+     for lbl in labels:
+         if label_counts[lbl] >= min_cluster_size:
+             filtered_labels.append(lbl)
+         else:
+             filtered_labels.append(-1)
+     return np.array(filtered_labels, dtype=int)
+
+
  router = APIRouter(tags=["Preprocessing Manager"])


@@ -346,9 +412,245 @@


  def process_video_job(job_id: str):
-     # Reutiliza exactamente la implementación actual de process_video_job
-     # que está hoy en engine/api.py. No la duplicamos completamente aquí
-     # por longitud, pero el contenido debe moverse tal cual a esta función.
-     from engine.api import process_video_job as _orig
-
-     return _orig(job_id)
+     """Process video job in background: detect faces, cluster, validate."""
+     try:
+         job = jobs[job_id]
+         print(f"[{job_id}] Iniciando procesamiento...")
+
+         job["status"] = JobStatus.PROCESSING
+
+         video_path = job["video_path"]
+         video_name = job["video_name"]
+         max_groups = int(job.get("max_groups", 5))
+         min_cluster_size = int(job.get("min_cluster_size", 3))
+         face_sensitivity = float(job.get("face_sensitivity", 0.5))
+
+         base = TEMP_ROOT / video_name
+         base.mkdir(parents=True, exist_ok=True)
+         print(f"[{job_id}] Directorio base: {base}")
+
+         try:
+             print(f"[{job_id}] Iniciando detección de personajes...")
+             try:
+                 import face_recognition
+                 _use_fr = True
+                 print(f"[{job_id}] face_recognition disponible: CPU")
+             except Exception:
+                 face_recognition = None
+                 _use_fr = False
+                 print(f"[{job_id}] face_recognition no disponible. Intentando DeepFace fallback.")
+             try:
+                 from deepface import DeepFace
+             except Exception:
+                 DeepFace = None
+
+             cap = cv2.VideoCapture(video_path)
+             if not cap.isOpened():
+                 raise RuntimeError("No se pudo abrir el vídeo para extracción de caras")
+             fps = cap.get(cv2.CAP_PROP_FPS) or 25.0
+             total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT) or 0)
+             max_samples = job.get("max_frames", 100)
+
+             if total_frames > 0:
+                 frame_indices = sorted(set(np.linspace(0, max(0, total_frames - 1), num=min(max_samples, max(1, total_frames)), dtype=int).tolist()))
+             else:
+                 frame_indices = []
+             print(f"[{job_id}] Total frames: {total_frames}, FPS: {fps:.2f}, Muestreando {len(frame_indices)} frames")
+
+             faces_root = base / "faces_raw"
+             faces_root.mkdir(parents=True, exist_ok=True)
+             embeddings: list[list[float]] = []
+             crops_meta: list[dict] = []
+
+             saved_count = 0
+             frames_processed = 0
+             frames_with_faces = 0
+
+             for frame_idx in frame_indices:
+                 cap.set(cv2.CAP_PROP_POS_FRAMES, int(frame_idx))
+                 ret2, frame = cap.read()
+                 if not ret2:
+                     continue
+                 frames_processed += 1
+                 frame_normalized = normalize_face_lighting(frame)
+                 rgb = cv2.cvtColor(frame_normalized, cv2.COLOR_BGR2RGB)
+
+                 if _use_fr and face_recognition is not None:
+                     boxes = face_recognition.face_locations(rgb, model="hog")
+                     encs = face_recognition.face_encodings(rgb, boxes)
+                     if boxes:
+                         frames_with_faces += 1
+                     for (top, right, bottom, left), e in zip(boxes, encs):
+                         crop = frame_normalized[top:bottom, left:right]
+                         if crop.size == 0:
+                             continue
+                         fn = f"face_{frame_idx:06d}_{saved_count:03d}.jpg"
+                         cv2.imwrite(str(faces_root / fn), crop)
+                         e = np.array(e, dtype=float)
+                         e = e / (np.linalg.norm(e) + 1e-9)
+                         embeddings.append(e.astype(float).tolist())
+                         crops_meta.append({"file": fn, "frame": frame_idx, "box": [int(top), int(right), int(bottom), int(left)]})
+                         saved_count += 1
+                 else:
+                     if DeepFace is not None:
+                         try:
+                             gray = cv2.cvtColor(frame_normalized, cv2.COLOR_BGR2GRAY)
+                             haar_path = getattr(cv2.data, 'haarcascades', None) or ''
+                             face_cascade = cv2.CascadeClassifier(os.path.join(haar_path, 'haarcascade_frontalface_default.xml'))
+                             boxes_haar = []
+                             if face_cascade is not None and not face_cascade.empty():
+                                 faces_haar = face_cascade.detectMultiScale(gray, scaleFactor=1.08, minNeighbors=5, minSize=(50, 50))
+                                 for (x, y, w, h) in faces_haar:
+                                     top, left, bottom, right = max(0, y), max(0, x), min(frame.shape[0], y+h), min(frame.shape[1], x+w)
+                                     boxes_haar.append((top, right, bottom, left))
+
+                             if boxes_haar:
+                                 frames_with_faces += 1
+
+                             for (top, right, bottom, left) in boxes_haar:
+                                 crop = frame_normalized[top:bottom, left:right]
+                                 if crop.size == 0:
+                                     continue
+                                 fn = f"face_{frame_idx:06d}_{saved_count:03d}.jpg"
+                                 crop_path = faces_root / fn
+                                 cv2.imwrite(str(crop_path), crop)
+                                 reps = DeepFace.represent(img_path=str(crop_path), model_name="Facenet512", enforce_detection=False)
+                                 for r in (reps or []):
+                                     emb = r.get("embedding") if isinstance(r, dict) else r
+                                     if emb is None:
+                                         continue
+                                     emb = np.array(emb, dtype=float)
+                                     emb = emb / (np.linalg.norm(emb) + 1e-9)
+                                     embeddings.append(emb.astype(float).tolist())
+                                     crops_meta.append({"file": fn, "frame": frame_idx, "box": [int(top), int(right), int(bottom), int(left)]})
+                                     saved_count += 1
+                         except Exception as _e_df:
+                             print(f"[{job_id}] DeepFace fallback error: {_e_df}")
+             cap.release()
+
+             print(f"[{job_id}] ✓ Frames procesados: {frames_processed}/{len(frame_indices)}")
+             print(f"[{job_id}] ✓ Frames con caras: {frames_with_faces}")
+             print(f"[{job_id}] ✓ Caras detectadas: {len(embeddings)}")
+
+             # Clustering
+             if embeddings:
+                 Xf = np.array(embeddings)
+                 labels = hierarchical_cluster_with_min_size(Xf, max_groups, min_cluster_size, face_sensitivity).tolist()
+                 print(f"[{job_id}] Clustering: {len(set([l for l in labels if l >= 0]))} clusters")
+             else:
+                 labels = []
+
+             # Build character folders with validation
+             try:
+                 from face_classifier import validate_and_classify_face, FACE_CONFIDENCE_THRESHOLD
+             except ImportError:
+                 validate_and_classify_face = None
+                 FACE_CONFIDENCE_THRESHOLD = 0.5
+
+             characters_validated: list[dict[str, Any]] = []
+             cluster_map: dict[int, list[int]] = {}
+             for idx, lbl in enumerate(labels):
+                 if isinstance(lbl, int) and lbl >= 0:
+                     cluster_map.setdefault(lbl, []).append(idx)
+
+             chars_dir = base / "characters"
+             chars_dir.mkdir(parents=True, exist_ok=True)
+
+             for ci, idxs in sorted(cluster_map.items(), key=lambda x: x[0]):
+                 char_id = f"char_{ci:02d}"
+                 detections: list[dict[str, Any]] = []
+                 for j in idxs:
+                     meta = crops_meta[j]
+                     file_name = meta.get("file")
+                     if not file_name:
+                         continue
+                     box = meta.get("box", [0, 0, 0, 0])
+                     area = abs(box[1] - box[3]) * abs(box[2] - box[0]) if len(box) >= 4 else 0
+                     detections.append({"index": j, "file": file_name, "score": area, "box": box})
+
+                 if not detections:
+                     continue
+
+                 detections.sort(key=lambda d: d["score"], reverse=True)
+                 best_face = detections[0]
+                 best_face_path = faces_root / best_face["file"]
+
+                 # Validation (optional)
+                 validation = None
+                 if validate_and_classify_face is not None:
+                     try:
+                         validation = validate_and_classify_face(str(best_face_path))
+                     except Exception:
+                         validation = None
+
+                 if validation and not validation.get("is_valid_face", True):
+                     if validation.get("face_confidence", 1.0) < FACE_CONFIDENCE_THRESHOLD:
+                         continue
+
+                 out_dir = chars_dir / char_id
+                 out_dir.mkdir(parents=True, exist_ok=True)
+
+                 total_faces = len(detections)
+                 max_faces_to_show = (total_faces // 2) + 1
+                 selected = detections[:max_faces_to_show]
+
+                 files: list[str] = []
+                 file_urls: list[str] = []
+                 for det in selected:
+                     fname = det["file"]
+                     src = faces_root / fname
+                     dst = out_dir / fname
+                     try:
+                         shutil.copy2(src, dst)
+                         files.append(fname)
+                         file_urls.append(f"/files/{video_name}/{char_id}/{fname}")
+                     except Exception:
+                         pass
+
+                 rep = files[0] if files else None
+                 if rep:
+                     try:
+                         shutil.copy2(out_dir / rep, out_dir / "representative.jpg")
+                     except Exception:
+                         pass
+
+                 cluster_number = int(char_id.split("_")[1]) + 1
+                 character_name = f"Cluster {cluster_number}"
+                 gender = validation.get("gender", "Neutral") if validation else "Neutral"
+
+                 characters_validated.append({
+                     "id": char_id,
+                     "name": character_name,
+                     "gender": gender,
+                     "folder": str(out_dir),
+                     "num_faces": len(files),
+                     "total_faces_detected": total_faces,
+                     "image_url": f"/files/{video_name}/{char_id}/representative.jpg" if rep else "",
+                     "face_files": file_urls,
+                 })
+                 print(f"[{job_id}] ✓ Cluster {char_id}: {len(files)} caras")
+
+             print(f"[{job_id}] ✓ Total: {len(characters_validated)} personajes válidos")
+
+             job["results"] = {
+                 "characters": characters_validated,
+                 "face_labels": labels,
+                 "video_name": video_name,
+                 "base_dir": str(base),
+             }
+             job["status"] = JobStatus.DONE
+             print(f"[{job_id}] ✓ Procesamiento completado")
+
+         except Exception as face_error:
+             print(f"[{job_id}] Error en detección de caras: {face_error}")
+             import traceback
+             traceback.print_exc()
+             job["results"] = {"characters": [], "face_labels": [], "video_name": video_name, "base_dir": str(base)}
+             job["status"] = JobStatus.DONE
+
+     except Exception as e:
+         print(f"[{job_id}] Error general: {e}")
+         import traceback
+         traceback.print_exc()
+         job["status"] = JobStatus.FAILED
+         job["error"] = str(e)
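
The new hierarchical_cluster_with_min_size helper is the core of the face-grouping step. A minimal sketch of how it behaves on synthetic embeddings (the data below is made up for illustration, and running it assumes the module's scipy/scikit-learn dependencies and that the helper is in scope):

    import numpy as np

    # Hypothetical data: two tight groups of near-unit embeddings plus a lone outlier.
    rng = np.random.default_rng(0)
    group_a = np.array([1.0, 0.0, 0.0, 0.0]) + rng.normal(scale=0.01, size=(5, 4))
    group_b = np.array([0.0, 1.0, 0.0, 0.0]) + rng.normal(scale=0.01, size=(5, 4))
    outlier = np.array([[0.0, 0.0, 1.0, 0.0]])
    X = np.vstack([group_a, group_b, outlier])

    # e.g. from preprocessing_router import hierarchical_cluster_with_min_size
    labels = hierarchical_cluster_with_min_size(X, max_groups=5, min_cluster_size=3)
    # One label per row; rows that land in clusters smaller than
    # min_cluster_size are returned as -1 (treated as noise).
    print(labels)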
svision_client.py CHANGED
@@ -1,116 +1,123 @@
- import os
- os.environ["CUDA_VISIBLE_DEVICES"] = "1"
-
- from gradio_client import Client, handle_file
- from typing import Any, Dict, List, Optional, Tuple, Union
- import json
-
- # Connect to the remote Space
- svision_client = Client("VeuReu/svision")
-
-
- def extract_scenes(video_path: str, threshold: float = 30.0, offset_frames: int = 5, crop_ratio: float = 0.1):
-     """
-     Call the /scenes_extraction endpoint of the remote Space VeuReu/svision.
-
-     Parameters
-     ----------
-     video_path : str
-         Path to the input video file.
-     threshold : float, optional
-         Scene change detection threshold; higher values make detection less sensitive.
-     offset_frames : int, optional
-         Number of frames to include before and after a detected scene boundary.
-     crop_ratio : float, optional
-         Ratio for cropping borders before performing scene detection.
-
-     Returns
-     -------
-     Any
-         Response returned by the remote /scenes_extraction endpoint.
-     """
-     result = svision_client.predict(
-         video_file={"video": handle_file(video_path)},
-         threshold=threshold,
-         offset_frames=offset_frames,
-         crop_ratio=crop_ratio,
-         api_name="/scenes_extraction"
-     )
-     return result
-
-
- def keyframes_every_second_extraction(video_path: str):
-     """
-     Call the /keyframes_every_second_extraction endpoint of the remote Space VeuReu/svision.
-
-     Parameters
-     ----------
-     video_path : str
-         Path to the input video file.
-
-     Returns
-     -------
-     Any
-         Response returned by the remote /keyframes_every_second_extraction endpoint.
-     """
-     result = svision_client.predict(
-         video_path={"video": handle_file(video_path)},
-         api_name="/keyframes_every_second_extraction"
-     )
-     return result
-
-
- def add_ocr_and_faces(imagen_path: str, informacion_image: Dict[str, Any], face_col: List[Dict[str, Any]]) -> Dict[str, Any]:
-     """
-     Call the /add_ocr_and_faces endpoint of the remote Space VeuReu/svision.
-
-     This function sends an image together with metadata and face collection data
-     to perform OCR, face detection, and annotation enhancement.
-
-     Parameters
-     ----------
-     imagen_path : str
-         Path to the input image file.
-     informacion_image : Dict[str, Any]
-         Dictionary containing image-related metadata.
-     face_col : List[Dict[str, Any]]
-         List of dictionaries representing detected faces or face metadata.
-
-     Returns
-     -------
-     Dict[str, Any]
-         Processed output containing OCR results, face detection data, and annotations.
-     """
-     informacion_image_str = json.dumps(informacion_image)
-     face_col_str = json.dumps(face_col)
-     result = svision_client.predict(
-         image=handle_file(imagen_path),
-         informacion_image=informacion_image_str,
-         face_col=face_col_str,
-         api_name="/add_ocr_and_faces"
-     )
-     return result
-
-
- def extract_descripcion_escena(imagen_path: str) -> str:
-     """
-     Call the /describe_images endpoint of the remote Space VeuReu/svision.
-
-     This function sends an image to receive a textual description of its visual content.
-
-     Parameters
-     ----------
-     imagen_path : str
-         Path to the input image file.
-
-     Returns
-     -------
-     str
-         Description generated for the given image.
-     """
-     print("Calling svision to describe the scene...")
-     result = svision_client.predict(
-         images=[{"image": handle_file(imagen_path)}],
-         api_name="/describe_images"
-     )
-     return result
+ import os
+ os.environ["CUDA_VISIBLE_DEVICES"] = "1"
+
+ from gradio_client import Client, handle_file
+ from typing import Any, Dict, List, Optional, Tuple, Union
+ import json
+
+ # Lazy initialization to avoid crash if Space is down at import time
+ _svision_client = None
+
+
+ def _get_svision_client():
+     """Get or create the svision client (lazy initialization)."""
+     global _svision_client
+     if _svision_client is None:
+         _svision_client = Client("VeuReu/svision")
+     return _svision_client
+
+
+ def extract_scenes(video_path: str, threshold: float = 30.0, offset_frames: int = 5, crop_ratio: float = 0.1):
+     """
+     Call the /scenes_extraction endpoint of the remote Space VeuReu/svision.
+
+     Parameters
+     ----------
+     video_path : str
+         Path to the input video file.
+     threshold : float, optional
+         Scene change detection threshold; higher values make detection less sensitive.
+     offset_frames : int, optional
+         Number of frames to include before and after a detected scene boundary.
+     crop_ratio : float, optional
+         Ratio for cropping borders before performing scene detection.
+
+     Returns
+     -------
+     Any
+         Response returned by the remote /scenes_extraction endpoint.
+     """
+     result = _get_svision_client().predict(
+         video_file={"video": handle_file(video_path)},
+         threshold=threshold,
+         offset_frames=offset_frames,
+         crop_ratio=crop_ratio,
+         api_name="/scenes_extraction"
+     )
+     return result
+
+
+ def keyframes_every_second_extraction(video_path: str):
+     """
+     Call the /keyframes_every_second_extraction endpoint of the remote Space VeuReu/svision.
+
+     Parameters
+     ----------
+     video_path : str
+         Path to the input video file.
+
+     Returns
+     -------
+     Any
+         Response returned by the remote /keyframes_every_second_extraction endpoint.
+     """
+     result = _get_svision_client().predict(
+         video_path={"video": handle_file(video_path)},
+         api_name="/keyframes_every_second_extraction"
+     )
+     return result
+
+
+ def add_ocr_and_faces(imagen_path: str, informacion_image: Dict[str, Any], face_col: List[Dict[str, Any]]) -> Dict[str, Any]:
+     """
+     Call the /add_ocr_and_faces endpoint of the remote Space VeuReu/svision.
+
+     This function sends an image together with metadata and face collection data
+     to perform OCR, face detection, and annotation enhancement.
+
+     Parameters
+     ----------
+     imagen_path : str
+         Path to the input image file.
+     informacion_image : Dict[str, Any]
+         Dictionary containing image-related metadata.
+     face_col : List[Dict[str, Any]]
+         List of dictionaries representing detected faces or face metadata.
+
+     Returns
+     -------
+     Dict[str, Any]
+         Processed output containing OCR results, face detection data, and annotations.
+     """
+     informacion_image_str = json.dumps(informacion_image)
+     face_col_str = json.dumps(face_col)
+     result = _get_svision_client().predict(
+         image=handle_file(imagen_path),
+         informacion_image=informacion_image_str,
+         face_col=face_col_str,
+         api_name="/add_ocr_and_faces"
+     )
+     return result
+
+
+ def extract_descripcion_escena(imagen_path: str) -> str:
+     """
+     Call the /describe_images endpoint of the remote Space VeuReu/svision.
+
+     This function sends an image to receive a textual description of its visual content.
+
+     Parameters
+     ----------
+     imagen_path : str
+         Path to the input image file.
+
+     Returns
+     -------
+     str
+         Description generated for the given image.
+     """
+     result = _get_svision_client().predict(
+         images=[{"image": handle_file(imagen_path)}],
+         api_name="/describe_images"
+     )
+     return result
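
As with the ASR client, a minimal usage sketch under the lazy-initialization change (the paths are illustrative assumptions, not part of the commit; the first call opens the connection to the VeuReu/svision Space):

    from svision_client import extract_scenes, extract_descripcion_escena

    scenes = extract_scenes("sample_video.mp4", threshold=30.0)   # hypothetical input path
    description = extract_descripcion_escena("keyframe_001.jpg")  # hypothetical keyframe
    print(scenes)
    print(description)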