"""Thin client wrappers for the remote VeuReu/asr Gradio Space."""

import os

# Pin the visible GPU before any CUDA-aware library is imported.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

import json
from typing import Any, Dict, List

from gradio_client import Client, handle_file

# Module-level singleton so every helper reuses one connection to the Space.
_asr_client = None


def _get_asr_client() -> Client:
    """Get or create the ASR client (lazy initialization)."""
    global _asr_client
    if _asr_client is None:
        _asr_client = Client("VeuReu/asr")
    return _asr_client


def extract_audio_from_video(video_path: str) -> str:
    """
    Call the /extract_audio_ffmpeg endpoint of the remote VeuReu/asr Space.

    This function uploads a video file to the remote ASR service and extracts
    its audio track.

    Parameters
    ----------
    video_path : str
        Path to the input video file from which audio will be extracted.

    Returns
    -------
    str
        Path or identifier of the extracted audio file returned by the remote
        service.
    """
    result = _get_asr_client().predict(
        video_file={"video": handle_file(video_path)},
        api_name="/extract_audio_ffmpeg"
    )
    return result
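
# Illustrative usage (requires network access to the VeuReu/asr Space; the
# path below is hypothetical):
#   audio_path = extract_audio_from_video("/path/to/interview.mp4")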


def diarize_audio(audio_path: str) -> Any:
    """
    Call the /diaritzar_audio endpoint of the remote VeuReu/asr Space.

    This function performs speaker diarization, identifying segments of speech
    belonging to different speakers in the audio file.

    Parameters
    ----------
    audio_path : str
        Path to the audio file to be diarized.

    Returns
    -------
    Any
        Diarization output containing speaker segments and timings; as
        consumed by ``extract_audio_and_diarize`` below, element 0 holds the
        per-speaker audio clip paths and element 1 the speaker segments.
    """
    result = _get_asr_client().predict(
        wav_file=handle_file(audio_path),
        api_name="/diaritzar_audio"
    )
    return result
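
# Illustrative unpacking of the diarization result, mirroring how
# extract_audio_and_diarize() consumes it below (the payload shape is an
# assumption based on that usage; the input path is hypothetical):
#   clips, segments = diarize_audio("/path/to/audio.wav")[:2]
#   for seg in segments:
#       print(seg)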


def transcribe_long_audio(audio_path: str) -> str:
    """
    Call the /transcribe_long_audio endpoint of the remote VeuReu/asr Space.

    Designed for long audio recordings, this function sends the audio to the
    ASR model optimized for processing extended durations.

    Parameters
    ----------
    audio_path : str
        Path to the long audio file to be transcribed.

    Returns
    -------
    str
        Transcribed text returned by the remote ASR service.
    """
    result = _get_asr_client().predict(
        wav_path=handle_file(audio_path),
        api_name="/transcribe_long_audio"
    )
    return result


def transcribe_short_audio(audio_path: str) -> str:
    """
    Call the /transcribe_wav endpoint of the remote VeuReu/asr Space.

    This function is optimized for short-duration audio samples and produces
    fast transcriptions.

    Parameters
    ----------
    audio_path : str
        Path to the short audio file to be transcribed.

    Returns
    -------
    str
        Transcribed text returned by the remote service.
    """
    result = _get_asr_client().predict(
        wav_path=handle_file(audio_path),
        api_name="/transcribe_wav"
    )
    return result
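
# A hedged convenience sketch (not part of the remote API): pick the long- or
# short-audio endpoint based on the clip duration. Assumes a plain WAV input;
# the 30-second threshold is an arbitrary illustration, not a documented limit.
def transcribe_auto(audio_path: str, long_threshold_s: float = 30.0) -> str:
    """Route to transcribe_long_audio or transcribe_short_audio by duration."""
    import contextlib
    import wave

    # Read the duration from the WAV header without loading the samples.
    with contextlib.closing(wave.open(audio_path, "rb")) as wf:
        duration_s = wf.getnframes() / float(wf.getframerate())
    if duration_s > long_threshold_s:
        return transcribe_long_audio(audio_path)
    return transcribe_short_audio(audio_path)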


def identificar_veu(clip_path: str, voice_col: List[Dict[str, Any]]) -> Any:
    """
    Call the /identificar_veu endpoint of the remote VeuReu/asr Space.

    This function attempts to identify which known speaker (from a provided
    collection of voice profiles) appears in the given audio clip. The voice
    collection is JSON-serialized before being sent to the endpoint.

    Parameters
    ----------
    clip_path : str
        Path to the audio clip whose speaker is to be identified.
    voice_col : List[Dict[str, Any]]
        List of dictionaries containing metadata or embeddings for known voices.

    Returns
    -------
    Any
        Output returned by the remote speaker identification model.
    """
    voice_col_str = json.dumps(voice_col)
    result = _get_asr_client().predict(
        wav_file=handle_file(clip_path),
        voice_col=voice_col_str,
        api_name="/identificar_veu"
    )
    return result
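
# Illustrative call; the schema of each voice-profile dict is an assumption
# (this module only JSON-serializes whatever list it is given), and the
# "name"/"embedding" keys plus the paths below are hypothetical:
#   profiles = [{"name": "anna", "embedding": get_voice_embedding("/path/to/anna.wav")}]
#   match = identificar_veu("/path/to/unknown_clip.wav", profiles)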


def get_voice_embedding(audio_path: str) -> List[float]:
    """
    Call the /voice_embedding endpoint to get a voice embedding vector.

    This replaces local SpeakerRecognition processing by delegating to the
    asr Space.

    Parameters
    ----------
    audio_path : str
        Path to the audio file (WAV format preferred).

    Returns
    -------
    List[float]
        Normalized embedding vector for the voice, or an empty list on error.
    """
    try:
        result = _get_asr_client().predict(
            wav_file=handle_file(audio_path),
            api_name="/voice_embedding"
        )
        return result if result else []
    except Exception as e:
        print(f"[asr_client] get_voice_embedding error: {e}")
        return []
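
# A minimal sketch comparing two voices by cosine similarity of their
# embeddings. The docstring above says the vectors are normalized, in which
# case the dot product alone would suffice; the explicit norms below make the
# sketch safe either way. The paths in the usage note are hypothetical.
def cosine_similarity(a: List[float], b: List[float]) -> float:
    """Cosine similarity of two equal-length embedding vectors."""
    import math

    dot = sum(x * y for x, y in zip(a, b))
    norm_a = math.sqrt(sum(x * x for x in a))
    norm_b = math.sqrt(sum(x * x for x in b))
    if norm_a == 0.0 or norm_b == 0.0:
        return 0.0
    return dot / (norm_a * norm_b)

# Usage:
#   emb1 = get_voice_embedding("/path/to/speaker1.wav")
#   emb2 = get_voice_embedding("/path/to/speaker2.wav")
#   print(cosine_similarity(emb1, emb2))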


def extract_audio_and_diarize(video_path: str) -> Dict[str, Any]:
    """
    Extract audio from video and perform diarization in one call.

    Parameters
    ----------
    video_path : str
        Path to the input video file.

    Returns
    -------
    Dict[str, Any]
        Dictionary with 'clips' (list of audio file paths) and 'segments'
        (diarization info).
    """
    try:
        # Step 1: pull the audio track out of the video via the remote Space.
        audio_path = extract_audio_from_video(video_path)
        if not audio_path:
            return {"clips": [], "segments": [], "error": "Audio extraction failed"}

        # Step 2: diarize the extracted audio.
        result = diarize_audio(audio_path)

        # The endpoint is expected to return at least (clips, segments).
        if result and len(result) >= 2:
            return {
                "clips": result[0] if result[0] else [],
                "segments": result[1] if result[1] else [],
                "audio_path": audio_path,
            }
        return {"clips": [], "segments": [], "audio_path": audio_path}
    except Exception as e:
        print(f"[asr_client] extract_audio_and_diarize error: {e}")
        return {"clips": [], "segments": [], "error": str(e)}
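
# A hedged smoke test, runnable only with network access to the VeuReu/asr
# Space; the video path is hypothetical.
if __name__ == "__main__":
    demo_video = "/path/to/demo.mp4"
    diarization = extract_audio_and_diarize(demo_video)
    print(f"clips: {len(diarization['clips'])}, segments: {len(diarization['segments'])}")
    if diarization.get("audio_path"):
        # Full-recording transcription via the long-audio endpoint.
        print(transcribe_long_audio(diarization["audio_path"]))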