Upload 3 files
Browse files
- asr_client.py +140 -132
- preprocessing_router.py +308 -6
- svision_client.py +123 -116
asr_client.py
CHANGED
@@ -1,132 +1,140 @@
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

from gradio_client import Client, handle_file
from typing import Any, Dict, List
from PIL import Image
import json

# Lazy initialization to avoid crash if Space is down at import time
_asr_client = None


def _get_asr_client():
    """Get or create the ASR client (lazy initialization)."""
    global _asr_client
    if _asr_client is None:
        _asr_client = Client("VeuReu/asr")
    return _asr_client


def extract_audio_from_video(video_path: str) -> str:
    """
    Call the /extract_audio_ffmpeg endpoint of the remote VeuReu/asr Space.

    This function uploads a video file to the remote ASR service and extracts its audio track.

    Parameters
    ----------
    video_path : str
        Path to the input video file from which audio will be extracted.

    Returns
    -------
    str
        Path or identifier of the extracted audio file returned by the remote service.
    """
    result = _get_asr_client().predict(
        video_file={"video": handle_file(video_path)},
        api_name="/extract_audio_ffmpeg"
    )
    return result


def diarize_audio(audio_path: str) -> str:
    """
    Call the /diaritzar_audio endpoint of the remote VeuReu/asr Space.

    This function performs speaker diarization, identifying segments of speech
    belonging to different speakers in the audio file.

    Parameters
    ----------
    audio_path : str
        Path to the audio file to be diarized.

    Returns
    -------
    str
        JSON-like diarization output containing speaker segments and timings.
    """
    result = _get_asr_client().predict(
        wav_archivo=handle_file(audio_path),
        api_name="/diaritzar_audio"
    )
    return result


def transcribe_long_audio(audio_path: str) -> str:
    """
    Call the /transcribe_long_audio endpoint of the remote VeuReu/asr Space.

    Designed for long audio recordings, this function sends the audio to the ASR model
    optimized for processing extended durations.

    Parameters
    ----------
    audio_path : str
        Path to the long audio file to be transcribed.

    Returns
    -------
    str
        Transcribed text returned by the remote ASR service.
    """
    result = _get_asr_client().predict(
        wav_path=handle_file(audio_path),
        api_name="/transcribe_long_audio"
    )
    return result


def transcribe_short_audio(audio_path: str) -> str:
    """
    Call the /transcribe_wav endpoint of the remote VeuReu/asr Space.

    This function is optimized for short-duration audio samples and produces fast transcriptions.

    Parameters
    ----------
    audio_path : str
        Path to the short audio file to be transcribed.

    Returns
    -------
    str
        Transcribed text returned by the remote service.
    """
    result = _get_asr_client().predict(
        wav_path=handle_file(audio_path),
        api_name="/transcribe_wav"
    )
    return result


def identificar_veu(clip_path: str, voice_col: List[Dict[str, Any]]):
    """
    Call the /identificar_veu endpoint of the remote VeuReu/asr Space.

    This function attempts to identify which known speaker (from a provided
    collection of voice profiles) appears in the given audio clip.

    Parameters
    ----------
    clip_path : str
        Path to the audio clip whose speaker is to be identified.
    voice_col : List[Dict[str, Any]]
        List of dictionaries containing metadata or embeddings for known voices.

    Returns
    -------
    Any
        Output returned by the remote speaker identification model.
    """
    voice_col_str = json.dumps(voice_col)
    result = _get_asr_client().predict(
        wav_archivo=handle_file(clip_path),
        voice_col=voice_col_str,
        api_name="/identificar_veu"
    )
    return result
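All of these wrappers funnel through the same lazily created Gradio client, so a caller only handles file paths and plain Python data. A minimal usage sketch, not part of the diff: the file names and the voice_profiles structure below are illustrative assumptions, not something this module defines.

# Illustrative only: paths and the shape of voice_profiles are assumptions.
from asr_client import extract_audio_from_video, diarize_audio, transcribe_long_audio, identificar_veu

audio_path = extract_audio_from_video("video.mp4")   # remote ffmpeg extraction
segments = diarize_audio(audio_path)                 # speaker segments as JSON-like text
transcript = transcribe_long_audio(audio_path)       # full transcription

# A previously built voice collection would normally be passed here.
voice_profiles = [{"name": "speaker_1", "embedding": [0.0] * 192}]
match = identificar_veu("clip.wav", voice_profiles)
print(transcript, match)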
preprocessing_router.py
CHANGED
@@ -5,10 +5,12 @@ from fastapi.responses import FileResponse
from pathlib import Path
from datetime import datetime
from enum import Enum
from typing import Dict, Any
import shutil
import os
import uuid
import numpy as np
import cv2

from video_processing import process_video_pipeline
from audio_tools import process_audio_for_video, extract_audio_ffmpeg, embed_voice_segments, VoiceEmbedder

@@ -40,6 +42,70 @@ class JobStatus(str, Enum):
jobs: Dict[str, dict] = {}


# ---------------------------------------------------------------------------
# Helper functions for face detection and clustering
# ---------------------------------------------------------------------------

def normalize_face_lighting(image):
    """Normalize face brightness using CLAHE and range normalization."""
    lab = cv2.cvtColor(image, cv2.COLOR_BGR2LAB)
    l, a, b = cv2.split(lab)
    clahe = cv2.createCLAHE(clipLimit=3.0, tileGridSize=(8, 8))
    l_clahe = clahe.apply(l)
    l_min, l_max = l_clahe.min(), l_clahe.max()
    if l_max > l_min:
        l_normalized = ((l_clahe - l_min) * 255.0 / (l_max - l_min)).astype(np.uint8)
    else:
        l_normalized = l_clahe
    l_normalized = cv2.GaussianBlur(l_normalized, (3, 3), 0)
    lab_normalized = cv2.merge([l_normalized, a, b])
    normalized = cv2.cvtColor(lab_normalized, cv2.COLOR_LAB2BGR)
    return normalized


def hierarchical_cluster_with_min_size(X, max_groups: int, min_cluster_size: int, sensitivity: float = 0.5) -> np.ndarray:
    """Hierarchical clustering with silhouette score and minimum cluster size."""
    from scipy.cluster.hierarchy import linkage, fcluster
    from sklearn.metrics import silhouette_score
    from collections import Counter

    if len(X) == 0:
        return np.array([])
    if len(X) < min_cluster_size:
        return np.full(len(X), -1, dtype=int)

    Z = linkage(X, method='average', metric='cosine')
    best_n_clusters = 2
    best_score = -1
    max_to_try = min(max_groups, len(X) - 1)

    if max_to_try >= 2:
        for n_clusters in range(2, max_to_try + 1):
            trial_labels = fcluster(Z, t=n_clusters, criterion='maxclust') - 1
            trial_counts = Counter(trial_labels)
            valid_clusters = sum(1 for count in trial_counts.values() if count >= min_cluster_size)
            if valid_clusters >= 2:
                try:
                    score = silhouette_score(X, trial_labels, metric='cosine')
                    # Penalize larger cluster counts; higher sensitivity lowers the penalty
                    penalty = 0.14 - (sensitivity * 0.13)
                    adjusted_score = score - (n_clusters * penalty)
                    if adjusted_score > best_score:
                        best_score = adjusted_score
                        best_n_clusters = n_clusters
                except Exception:
                    pass

    labels = fcluster(Z, t=best_n_clusters, criterion='maxclust') - 1
    label_counts = Counter(labels)
    filtered_labels = []
    for lbl in labels:
        if label_counts[lbl] >= min_cluster_size:
            filtered_labels.append(lbl)
        else:
            filtered_labels.append(-1)
    return np.array(filtered_labels, dtype=int)


router = APIRouter(tags=["Preprocessing Manager"])

@@ -346,9 +412,245 @@ async def detect_scenes(

def process_video_job(job_id: str):
    """Process video job in background: detect faces, cluster, validate."""
    try:
        job = jobs[job_id]
        print(f"[{job_id}] Starting processing...")

        job["status"] = JobStatus.PROCESSING

        video_path = job["video_path"]
        video_name = job["video_name"]
        max_groups = int(job.get("max_groups", 5))
        min_cluster_size = int(job.get("min_cluster_size", 3))
        face_sensitivity = float(job.get("face_sensitivity", 0.5))

        base = TEMP_ROOT / video_name
        base.mkdir(parents=True, exist_ok=True)
        print(f"[{job_id}] Base directory: {base}")

        try:
            print(f"[{job_id}] Starting character detection...")
            try:
                import face_recognition
                _use_fr = True
                print(f"[{job_id}] face_recognition available: CPU")
            except Exception:
                face_recognition = None
                _use_fr = False
                print(f"[{job_id}] face_recognition not available. Trying DeepFace fallback.")
                try:
                    from deepface import DeepFace
                except Exception:
                    DeepFace = None

            cap = cv2.VideoCapture(video_path)
            if not cap.isOpened():
                raise RuntimeError("Could not open the video for face extraction")
            fps = cap.get(cv2.CAP_PROP_FPS) or 25.0
            total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT) or 0)
            max_samples = job.get("max_frames", 100)

            # Sample frames evenly across the whole video
            if total_frames > 0:
                frame_indices = sorted(set(np.linspace(0, max(0, total_frames - 1), num=min(max_samples, max(1, total_frames)), dtype=int).tolist()))
            else:
                frame_indices = []
            print(f"[{job_id}] Total frames: {total_frames}, FPS: {fps:.2f}, sampling {len(frame_indices)} frames")

            faces_root = base / "faces_raw"
            faces_root.mkdir(parents=True, exist_ok=True)
            embeddings: list[list[float]] = []
            crops_meta: list[dict] = []

            saved_count = 0
            frames_processed = 0
            frames_with_faces = 0

            for frame_idx in frame_indices:
                cap.set(cv2.CAP_PROP_POS_FRAMES, int(frame_idx))
                ret2, frame = cap.read()
                if not ret2:
                    continue
                frames_processed += 1
                frame_normalized = normalize_face_lighting(frame)
                rgb = cv2.cvtColor(frame_normalized, cv2.COLOR_BGR2RGB)

                if _use_fr and face_recognition is not None:
                    boxes = face_recognition.face_locations(rgb, model="hog")
                    encs = face_recognition.face_encodings(rgb, boxes)
                    if boxes:
                        frames_with_faces += 1
                    for (top, right, bottom, left), e in zip(boxes, encs):
                        crop = frame_normalized[top:bottom, left:right]
                        if crop.size == 0:
                            continue
                        fn = f"face_{frame_idx:06d}_{saved_count:03d}.jpg"
                        cv2.imwrite(str(faces_root / fn), crop)
                        e = np.array(e, dtype=float)
                        e = e / (np.linalg.norm(e) + 1e-9)
                        embeddings.append(e.astype(float).tolist())
                        crops_meta.append({"file": fn, "frame": frame_idx, "box": [int(top), int(right), int(bottom), int(left)]})
                        saved_count += 1
                else:
                    # Fallback: Haar cascade detection + DeepFace embeddings
                    if DeepFace is not None:
                        try:
                            gray = cv2.cvtColor(frame_normalized, cv2.COLOR_BGR2GRAY)
                            haar_path = getattr(cv2.data, 'haarcascades', None) or ''
                            face_cascade = cv2.CascadeClassifier(os.path.join(haar_path, 'haarcascade_frontalface_default.xml'))
                            boxes_haar = []
                            if face_cascade is not None and not face_cascade.empty():
                                faces_haar = face_cascade.detectMultiScale(gray, scaleFactor=1.08, minNeighbors=5, minSize=(50, 50))
                                for (x, y, w, h) in faces_haar:
                                    top, left, bottom, right = max(0, y), max(0, x), min(frame.shape[0], y+h), min(frame.shape[1], x+w)
                                    boxes_haar.append((top, right, bottom, left))

                            if boxes_haar:
                                frames_with_faces += 1

                            for (top, right, bottom, left) in boxes_haar:
                                crop = frame_normalized[top:bottom, left:right]
                                if crop.size == 0:
                                    continue
                                fn = f"face_{frame_idx:06d}_{saved_count:03d}.jpg"
                                crop_path = faces_root / fn
                                cv2.imwrite(str(crop_path), crop)
                                reps = DeepFace.represent(img_path=str(crop_path), model_name="Facenet512", enforce_detection=False)
                                for r in (reps or []):
                                    emb = r.get("embedding") if isinstance(r, dict) else r
                                    if emb is None:
                                        continue
                                    emb = np.array(emb, dtype=float)
                                    emb = emb / (np.linalg.norm(emb) + 1e-9)
                                    embeddings.append(emb.astype(float).tolist())
                                    crops_meta.append({"file": fn, "frame": frame_idx, "box": [int(top), int(right), int(bottom), int(left)]})
                                    saved_count += 1
                        except Exception as _e_df:
                            print(f"[{job_id}] DeepFace fallback error: {_e_df}")
            cap.release()

            print(f"[{job_id}] ✓ Frames processed: {frames_processed}/{len(frame_indices)}")
            print(f"[{job_id}] ✓ Frames with faces: {frames_with_faces}")
            print(f"[{job_id}] ✓ Faces detected: {len(embeddings)}")

            # Clustering
            if embeddings:
                Xf = np.array(embeddings)
                labels = hierarchical_cluster_with_min_size(Xf, max_groups, min_cluster_size, face_sensitivity).tolist()
                print(f"[{job_id}] Clustering: {len(set([l for l in labels if l >= 0]))} clusters")
            else:
                labels = []

            # Build character folders with validation
            try:
                from face_classifier import validate_and_classify_face, FACE_CONFIDENCE_THRESHOLD
            except ImportError:
                validate_and_classify_face = None
                FACE_CONFIDENCE_THRESHOLD = 0.5

            characters_validated: list[dict[str, Any]] = []
            cluster_map: dict[int, list[int]] = {}
            for idx, lbl in enumerate(labels):
                if isinstance(lbl, int) and lbl >= 0:
                    cluster_map.setdefault(lbl, []).append(idx)

            chars_dir = base / "characters"
            chars_dir.mkdir(parents=True, exist_ok=True)

            for ci, idxs in sorted(cluster_map.items(), key=lambda x: x[0]):
                char_id = f"char_{ci:02d}"
                detections: list[dict[str, Any]] = []
                for j in idxs:
                    meta = crops_meta[j]
                    file_name = meta.get("file")
                    if not file_name:
                        continue
                    box = meta.get("box", [0, 0, 0, 0])
                    area = abs(box[1] - box[3]) * abs(box[2] - box[0]) if len(box) >= 4 else 0
                    detections.append({"index": j, "file": file_name, "score": area, "box": box})

                if not detections:
                    continue

                detections.sort(key=lambda d: d["score"], reverse=True)
                best_face = detections[0]
                best_face_path = faces_root / best_face["file"]

                # Validation (optional)
                validation = None
                if validate_and_classify_face is not None:
                    try:
                        validation = validate_and_classify_face(str(best_face_path))
                    except Exception:
                        validation = None

                if validation and not validation.get("is_valid_face", True):
                    if validation.get("face_confidence", 1.0) < FACE_CONFIDENCE_THRESHOLD:
                        continue

                out_dir = chars_dir / char_id
                out_dir.mkdir(parents=True, exist_ok=True)

                total_faces = len(detections)
                max_faces_to_show = (total_faces // 2) + 1
                selected = detections[:max_faces_to_show]

                files: list[str] = []
                file_urls: list[str] = []
                for det in selected:
                    fname = det["file"]
                    src = faces_root / fname
                    dst = out_dir / fname
                    try:
                        shutil.copy2(src, dst)
                        files.append(fname)
                        file_urls.append(f"/files/{video_name}/{char_id}/{fname}")
                    except Exception:
                        pass

                rep = files[0] if files else None
                if rep:
                    try:
                        shutil.copy2(out_dir / rep, out_dir / "representative.jpg")
                    except Exception:
                        pass

                cluster_number = int(char_id.split("_")[1]) + 1
                character_name = f"Cluster {cluster_number}"
                gender = validation.get("gender", "Neutral") if validation else "Neutral"

                characters_validated.append({
                    "id": char_id,
                    "name": character_name,
                    "gender": gender,
                    "folder": str(out_dir),
                    "num_faces": len(files),
                    "total_faces_detected": total_faces,
                    "image_url": f"/files/{video_name}/{char_id}/representative.jpg" if rep else "",
                    "face_files": file_urls,
                })
                print(f"[{job_id}] ✓ Cluster {char_id}: {len(files)} faces")

            print(f"[{job_id}] ✓ Total: {len(characters_validated)} valid characters")

            job["results"] = {
                "characters": characters_validated,
                "face_labels": labels,
                "video_name": video_name,
                "base_dir": str(base),
            }
            job["status"] = JobStatus.DONE
            print(f"[{job_id}] ✓ Processing finished")

        except Exception as face_error:
            print(f"[{job_id}] Face detection error: {face_error}")
            import traceback
            traceback.print_exc()
            job["results"] = {"characters": [], "face_labels": [], "video_name": video_name, "base_dir": str(base)}
            job["status"] = JobStatus.DONE

    except Exception as e:
        print(f"[{job_id}] General error: {e}")
        import traceback
        traceback.print_exc()
        job["status"] = JobStatus.FAILED
        job["error"] = str(e)
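As a quick sanity check of the hierarchical_cluster_with_min_size helper added above, the following sketch runs it on synthetic embeddings; the array shapes and the expected grouping are only assumptions about how the real face embeddings behave.

# Synthetic example, not part of the router: two tight groups of unit vectors
# plus one outlier, clustered with the helper defined in this file.
import numpy as np

rng = np.random.default_rng(0)
group_a = rng.normal(loc=1.0, scale=0.01, size=(5, 128))
group_b = rng.normal(loc=-1.0, scale=0.01, size=(5, 128))
outlier = rng.normal(size=(1, 128))
X = np.vstack([group_a, group_b, outlier])
X = X / np.linalg.norm(X, axis=1, keepdims=True)  # cosine-normalize, like the face embeddings

labels = hierarchical_cluster_with_min_size(X, max_groups=5, min_cluster_size=3, sensitivity=0.5)
print(labels)  # one label per row; clusters smaller than min_cluster_size are mapped to -1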
svision_client.py
CHANGED
@@ -1,116 +1,123 @@
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

from gradio_client import Client, handle_file
from typing import Any, Dict, List, Optional, Tuple, Union
import json

# Lazy initialization to avoid crash if Space is down at import time
_svision_client = None


def _get_svision_client():
    """Get or create the svision client (lazy initialization)."""
    global _svision_client
    if _svision_client is None:
        _svision_client = Client("VeuReu/svision")
    return _svision_client


def extract_scenes(video_path: str, threshold: float = 30.0, offset_frames: int = 5, crop_ratio: float = 0.1):
    """
    Call the /scenes_extraction endpoint of the remote Space VeuReu/svision.

    Parameters
    ----------
    video_path : str
        Path to the input video file.
    threshold : float, optional
        Scene change detection threshold; higher values make detection less sensitive.
    offset_frames : int, optional
        Number of frames to include before and after a detected scene boundary.
    crop_ratio : float, optional
        Ratio for cropping borders before performing scene detection.

    Returns
    -------
    Any
        Response returned by the remote /scenes_extraction endpoint.
    """
    result = _get_svision_client().predict(
        video_file={"video": handle_file(video_path)},
        threshold=threshold,
        offset_frames=offset_frames,
        crop_ratio=crop_ratio,
        api_name="/scenes_extraction"
    )
    return result


def keyframes_every_second_extraction(video_path: str):
    """
    Call the /keyframes_every_second_extraction endpoint of the remote Space VeuReu/svision.

    Parameters
    ----------
    video_path : str
        Path to the input video file.

    Returns
    -------
    Any
        Response returned by the remote /keyframes_every_second_extraction endpoint.
    """
    result = _get_svision_client().predict(
        video_path={"video": handle_file(video_path)},
        api_name="/keyframes_every_second_extraction"
    )
    return result


def add_ocr_and_faces(imagen_path: str, informacion_image: Dict[str, Any], face_col: List[Dict[str, Any]]) -> Dict[str, Any]:
    """
    Call the /add_ocr_and_faces endpoint of the remote Space VeuReu/svision.

    This function sends an image together with metadata and face collection data
    to perform OCR, face detection, and annotation enhancement.

    Parameters
    ----------
    imagen_path : str
        Path to the input image file.
    informacion_image : Dict[str, Any]
        Dictionary containing image-related metadata.
    face_col : List[Dict[str, Any]]
        List of dictionaries representing detected faces or face metadata.

    Returns
    -------
    Dict[str, Any]
        Processed output containing OCR results, face detection data, and annotations.
    """
    informacion_image_str = json.dumps(informacion_image)
    face_col_str = json.dumps(face_col)
    result = _get_svision_client().predict(
        image=handle_file(imagen_path),
        informacion_image=informacion_image_str,
        face_col=face_col_str,
        api_name="/add_ocr_and_faces"
    )
    return result


def extract_descripcion_escena(imagen_path: str) -> str:
    """
    Call the /describe_images endpoint of the remote Space VeuReu/svision.

    This function sends an image to receive a textual description of its visual content.

    Parameters
    ----------
    imagen_path : str
        Path to the input image file.

    Returns
    -------
    str
        Description generated for the given image.
    """
    result = _get_svision_client().predict(
        images=[{"image": handle_file(imagen_path)}],
        api_name="/describe_images"
    )
    return result