import os
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

from gradio_client import Client, handle_file
from typing import Any, Dict, List, Optional, Tuple, Union
import requests
import json

# Lazy initialization to avoid crash if Space is down at import time
_svision_client = None


def _get_svision_client():
    """Get or create the svision client (lazy initialization)."""
    global _svision_client
    if _svision_client is None:
        _svision_client = Client("VeuReu/svision")
    return _svision_client


def extract_scenes(video_path: str, threshold: float = 240, offset_frames: int = 5, crop_ratio: float = 0.1):
    """
    Call the /scenes_extraction endpoint of the remote Space VeuReu/svision.

    Parameters
    ----------
    video_path : str
        Path to the input video file.
    threshold : float, optional
        Scene change detection threshold; higher values make detection less sensitive.
    offset_frames : int, optional
        Number of frames to include before and after a detected scene boundary.
    crop_ratio : float, optional
        Ratio for cropping borders before performing scene detection.

    Returns
    -------
    Any
        Response returned by the remote /scenes_extraction endpoint.
    """
    result = _get_svision_client().predict(
        video_file={"video": handle_file(video_path)},
        threshold=threshold,
        offset_frames=offset_frames,
        crop_ratio=crop_ratio,
        api_name="/scenes_extraction"
    )
    return result
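
# Usage sketch (not executed on import): "video.mp4" is a hypothetical local path,
# and the shape of the returned value is defined by the remote /scenes_extraction endpoint.
#   scenes = extract_scenes("video.mp4", threshold=240, offset_frames=5, crop_ratio=0.1)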


def keyframes_every_second_extraction(video_path: str):
    """
    Call the /keyframes_every_second_extraction endpoint of the remote Space VeuReu/svision.

    Parameters
    ----------
    video_path : str
        Path to the input video file.

    Returns
    -------
    Any
        Response returned by the remote /keyframes_every_second_extraction endpoint.
    """
    result = _get_svision_client().predict(
        video_path={"video": handle_file(video_path)},
        api_name="/keyframes_every_second_extraction"
    )
    return result
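
# Usage sketch (hypothetical path; output format is defined by the remote endpoint):
#   keyframes = keyframes_every_second_extraction("video.mp4")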


def add_ocr_and_faces(imagen_path: str, informacion_image: Dict[str, Any], face_col: List[Dict[str, Any]]) -> Dict[str, Any]:
    """
    Call the /add_ocr_and_faces endpoint of the remote Space VeuReu/svision.

    This function sends an image together with metadata and face collection data
    to perform OCR, face detection, and annotation enhancement.

    Parameters
    ----------
    imagen_path : str
        Path to the input image file.
    informacion_image : Dict[str, Any]
        Dictionary containing image-related metadata.
    face_col : List[Dict[str, Any]]
        List of dictionaries representing detected faces or face metadata.

    Returns
    -------
    Dict[str, Any]
        Processed output containing OCR results, face detection data, and annotations.
    """
    informacion_image_str = json.dumps(informacion_image)
    face_col_str = json.dumps(face_col)
    result = _get_svision_client().predict(
        image=handle_file(imagen_path),
        informacion_image=informacion_image_str,
        face_col=face_col_str,
        api_name="/add_ocr_and_faces"
    )
    return result
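
# Usage sketch: the exact keys expected in informacion_image and face_col are defined by
# the remote Space; the values below are illustrative assumptions, not the real schema.
#   info = {"scene_id": 3, "timestamp": 12.0}
#   known_faces = [{"name": "person_01", "embedding": []}]
#   enriched = add_ocr_and_faces("frame_0001.jpg", info, known_faces)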


def extract_descripcion_escena(imagen_path: str) -> str:
    """
    Call the /describe_images endpoint of the remote Space VeuReu/svision.

    This function sends an image to receive a textual description of its visual content.

    Parameters
    ----------
    imagen_path : str
        Path to the input image file.

    Returns
    -------
    str
        Description generated for the given image.
    """
    result = _get_svision_client().predict(
        images=[{"image": handle_file(imagen_path)}],
        api_name="/describe_images"
    )
    return result
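
# Usage sketch (hypothetical image path; the description text comes from the remote model):
#   description = extract_descripcion_escena("keyframe_0001.jpg")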


def _extract_path_from_gradio_file(file_obj) -> Optional[str]:
    """Extract file path from Gradio file object (can be dict, str, tuple, or other).

    Gradio Gallery returns different formats depending on version:
    - List of tuples: [(path, caption), ...]
    - List of dicts: [{"name": path, "data": None, "is_file": True}, ...]
    - List of FileData: [FileData(path=..., url=...), ...]
    - List of paths: [path, ...]
    """
    if file_obj is None:
        return None
    # Handle tuple format: (path, caption)
    if isinstance(file_obj, tuple) and len(file_obj) >= 1:
        return _extract_path_from_gradio_file(file_obj[0])
    # Handle string path/URL
    if isinstance(file_obj, str):
        return file_obj
    # Handle dict format: {"path": "...", "url": "...", "name": "..."}
    if isinstance(file_obj, dict):
        return file_obj.get("path") or file_obj.get("url") or file_obj.get("name") or file_obj.get("image")
    # Handle FileData or similar object with attributes
    if hasattr(file_obj, "path") and file_obj.path:
        return file_obj.path
    if hasattr(file_obj, "url") and file_obj.url:
        return file_obj.url
    if hasattr(file_obj, "name") and file_obj.name:
        return file_obj.name
    # Last resort: convert to string
    return str(file_obj) if file_obj else None
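
# Illustrative inputs and the path each one resolves to (values are made up):
#   _extract_path_from_gradio_file(("/tmp/face_0.png", "face 0"))  -> "/tmp/face_0.png"
#   _extract_path_from_gradio_file({"path": "/tmp/face_0.png"})    -> "/tmp/face_0.png"
#   _extract_path_from_gradio_file("/tmp/face_0.png")              -> "/tmp/face_0.png"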


def get_face_embeddings_from_image(image_path: str) -> List[Dict[str, Any]]:
    """
    Call the /face_image_embedding_casting endpoint to detect faces and get embeddings.

    This replaces local DeepFace/face_recognition processing by delegating to the svision Space.

    Parameters
    ----------
    image_path : str
        Path to the input image file (a video frame).

    Returns
    -------
    List[Dict[str, Any]]
        List of dicts with 'embedding' (list of floats), 'face_crop_path' (image path
        string), and 'index' (position of the face in the endpoint's output).
        Returns an empty list if no faces are detected or on error.
    """
    try:
        # The endpoint returns a tuple: (face_crops: list of images/dicts, face_embeddings: list of dicts)
        result = _get_svision_client().predict(
            image=handle_file(image_path),
            api_name="/face_image_embedding_casting"
        )
        print(f"[svision_client] Raw result type: {type(result)}, len: {len(result) if result else 0}")
        # result is a tuple: (list of image paths/dicts, list of embedding dicts)
        if result and len(result) >= 2:
            face_crops_raw = result[0] if result[0] else []
            face_embeddings = result[1] if result[1] else []
            print(f"[svision_client] face_crops_raw type: {type(face_crops_raw)}, len: {len(face_crops_raw) if isinstance(face_crops_raw, list) else 'N/A'}")
            if face_crops_raw and len(face_crops_raw) > 0:
                print(f"[svision_client] First crop type: {type(face_crops_raw[0])}, value: {str(face_crops_raw[0])[:200]}")
            # Combine into a unified structure, extracting crop paths correctly
            faces = []
            for i, emb_dict in enumerate(face_embeddings):
                # Extract the path from the Gradio file object (might be a dict or a string)
                crop_path = None
                if i < len(face_crops_raw):
                    raw_crop = face_crops_raw[i]
                    crop_path = _extract_path_from_gradio_file(raw_crop)
                    if not crop_path:
                        print(f"[svision_client] Could not extract path from crop {i}: {type(raw_crop)} = {str(raw_crop)[:100]}")
                embedding = emb_dict.get("embedding", []) if isinstance(emb_dict, dict) else []
                faces.append({
                    "embedding": embedding,
                    "face_crop_path": crop_path,
                    "index": emb_dict.get("index", i) if isinstance(emb_dict, dict) else i,
                })
            print(f"[svision_client] Detected {len(faces)} faces from image")
            return faces
        return []
    except Exception as e:
        print(f"[svision_client] get_face_embeddings_from_image error: {e}")
        import traceback
        traceback.print_exc()
        return []
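
# Usage sketch ("frame_0001.jpg" is a hypothetical path; embedding size and crop paths
# depend on the remote Space):
#   for face in get_face_embeddings_from_image("frame_0001.jpg"):
#       print(face["index"], len(face["embedding"]), face["face_crop_path"])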


def get_face_embeddings_simple(image_path: str) -> List[List[float]]:
    """
    Call the /face_image_embedding endpoint to get face embeddings only.

    Parameters
    ----------
    image_path : str
        Path to the input image file.

    Returns
    -------
    List[List[float]]
        List of embedding vectors (one per detected face).
    """
    try:
        result = _get_svision_client().predict(
            image=handle_file(image_path),
            api_name="/face_image_embedding"
        )
        return result if result else []
    except Exception as e:
        print(f"[svision_client] get_face_embeddings_simple error: {e}")
        return []
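

if __name__ == "__main__":
    # Minimal smoke test, assuming a local image named "sample_frame.jpg" exists
    # (hypothetical path) and that the VeuReu/svision Space is reachable.
    sample_image = "sample_frame.jpg"
    if os.path.exists(sample_image):
        embeddings = get_face_embeddings_simple(sample_image)
        print(f"Detected {len(embeddings)} face embedding(s) in {sample_image}")
    else:
        print(f"Place an image at {sample_image} to run this smoke test.")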