Spaces:

VeuReu
/

engine

Running

App Files Community

VeuReu committed 11 days ago

Commit

413fec6

verified ·

1 Parent(s): ac342d0

Update asr_client.py

Browse files

Files changed (1) hide show

asr_client.py +202 -202

asr_client.py CHANGED Viewed

@@ -1,202 +1,202 @@
-import os
-os.environ["CUDA_VISIBLE_DEVICES"] = "1"
-from gradio_client import Client, handle_file
-from typing import Any, Dict, List
-from PIL import Image
-import json
-# Lazy initialization to avoid crash if Space is down at import time
-_asr_client = None
-def _get_asr_client():
-    """Get or create the ASR client (lazy initialization)."""
-    global _asr_client
-    if _asr_client is None:
-        _asr_client = Client("VeuReu/asr")
-    return _asr_client
-def extract_audio_from_video(video_path: str) -> str:
-    """
-    Call the /extract_audio_ffmpeg endpoint of the remote VeuReu/asr Space.
-    This function uploads a video file to the remote ASR service and extracts its audio track.
-    Parameters
-    ----------
-    video_path : str
-        Path to the input video file from which audio will be extracted.
-    Returns
-    -------
-    str
-        Path or identifier of the extracted audio file returned by the remote service.
-    """
-    result = _get_asr_client().predict(
-        video_file={"video": handle_file(video_path)},
-        api_name="/extract_audio_ffmpeg"
-    )
-    return result
-def diarize_audio(audio_path: str) -> str:
-    """
-    Call the /diaritzar_audio endpoint of the remote VeuReu/asr Space.
-    This function performs speaker diarization, identifying segments of speech
-    belonging to different speakers in the audio file.
-    Parameters
-    ----------
-    audio_path : str
-        Path to the audio file to be diarized.
-    Returns
-    -------
-    str
-        JSON-like diarization output containing speaker segments and timings.
-    """
-    result = _get_asr_client().predict(
-        wav_archivo=handle_file(audio_path),
-        api_name="/diaritzar_audio"
-    )
-    return result
-def transcribe_long_audio(audio_path: str) -> str:
-    """
-    Call the /transcribe_long_audio endpoint of the remote VeuReu/asr Space.
-    Designed for long audio recordings, this function sends the audio to the ASR model
-    optimized for processing extended durations.
-    Parameters
-    ----------
-    audio_path : str
-        Path to the long audio file to be transcribed.
-    Returns
-    -------
-    str
-        Transcribed text returned by the remote ASR service.
-    """
-    result = _get_asr_client().predict(
-        wav_path=handle_file(audio_path),
-        api_name="/transcribe_long_audio"
-    )
-    return result
-def transcribe_short_audio(audio_path: str) -> str:
-    """
-    Call the /transcribe_wav endpoint of the remote VeuReu/asr Space.
-    This function is optimized for short-duration audio samples and produces fast transcriptions.
-    Parameters
-    ----------
-    audio_path : str
-        Path to the short audio file to be transcribed.
-    Returns
-    -------
-    str
-        Transcribed text returned by the remote service.
-    """
-    result = _get_asr_client().predict(
-        wav_path=handle_file(audio_path),
-        api_name="/transcribe_wav"
-    )
-    return result
-def identificar_veu(clip_path: str, voice_col: List[Dict[str, Any]]):
-    """
-    Call the /identificar_veu endpoint of the remote VeuReu/asr Space.
-    This function attempts to identify which known speaker (from a provided
-    collection of voice profiles) appears in the given audio clip.
-    Parameters
-    ----------
-    clip_path : str
-        Path to the audio clip whose speaker is to be identified.
-    voice_col : List[Dict[str, Any]]
-        List of dictionaries containing metadata or embeddings for known voices.
-    Returns
-    -------
-    Any
-        Output returned by the remote speaker identification model.
-    """
-    voice_col_str = json.dumps(voice_col)
-    result = _get_asr_client().predict(
-        wav_archivo=handle_file(clip_path),
-        voice_col=voice_col_str,
-        api_name="/identificar_veu"
-    )
-    return result
-def get_voice_embedding(audio_path: str) -> List[float]:
-    """
-    Call the /voice_embedding endpoint to get a voice embedding vector.
-    This replaces local SpeakerRecognition processing by delegating to asr Space.
-    Parameters
-    ----------
-    audio_path : str
-        Path to the audio file (WAV format preferred).
-    Returns
-    -------
-    List[float]
-        Normalized embedding vector for the voice, or empty list on error.
-    """
-    try:
-        result = _get_asr_client().predict(
-            wav_archivo=handle_file(audio_path),
-            api_name="/voice_embedding"
-        )
-        return result if result else []
-    except Exception as e:
-        print(f"[asr_client] get_voice_embedding error: {e}")
-        return []
-def extract_audio_and_diarize(video_path: str) -> Dict[str, Any]:
-    """
-    Extract audio from video and perform diarization in one call.
-    Parameters
-    ----------
-    video_path : str
-        Path to the input video file.
-    Returns
-    -------
-    Dict[str, Any]
-        Dictionary with 'clips' (list of audio file paths) and 'segments' (diarization info).
-    """
-    try:
-        # First extract audio
-        audio_path = extract_audio_from_video(video_path)
-        if not audio_path:
-            return {"clips": [], "segments": [], "error": "Audio extraction failed"}
-        # Then diarize
-        result = diarize_audio(audio_path)
-        # result is tuple: (clips_paths, segments)
-        if result and len(result) >= 2:
-            return {
-                "clips": result[0] if result[0] else [],
-                "segments": result[1] if result[1] else [],
-                "audio_path": audio_path,
-            }
-        return {"clips": [], "segments": [], "audio_path": audio_path}
-    except Exception as e:
-        print(f"[asr_client] extract_audio_and_diarize error: {e}")
-        return {"clips": [], "segments": [], "error": str(e)}

+import os
+# NOTE(review): restricts CUDA device visibility for this process to index 1. It is set
+# before the remaining imports, presumably so it is in place before any of them
+# initializes CUDA — confirm this is intended on single-GPU hosts.
+os.environ["CUDA_VISIBLE_DEVICES"] = "1"
+from gradio_client import Client, handle_file
+from typing import Any, Dict, List
+from PIL import Image
+import json
+# Cached gradio client for the remote ASR Space. It stays None until _get_asr_client()
+# is first called, so importing this module does not fail when the Space is down.
+_asr_client = None
+def _get_asr_client():
+    """Return the shared client for the VeuReu/asr Space, creating it on first use."""
+    global _asr_client
+    if _asr_client is not None:
+        # A client was already built by an earlier call: reuse it.
+        return _asr_client
+    _asr_client = Client("VeuReu/asr")
+    return _asr_client
+def extract_audio_from_video(video_path: str) -> str:
+    """
+    Extract the audio track of a video through the remote VeuReu/asr Space.
+    The video is sent to the Space's /extract_audio_ffmpeg endpoint and the
+    endpoint's response is returned unchanged.
+    Parameters
+    ----------
+    video_path : str
+        Path to the input video file whose audio should be extracted.
+    Returns
+    -------
+    str
+        Path or identifier of the extracted audio file, as returned by the remote service.
+    """
+    client = _get_asr_client()
+    # The endpoint's video_file input is passed as a dict wrapping the file under "video".
+    video_payload = {"video": handle_file(video_path)}
+    return client.predict(video_file=video_payload, api_name="/extract_audio_ffmpeg")
+def diarize_audio(audio_path: str) -> Any:
+    """
+    Run speaker diarization on an audio file through the remote VeuReu/asr Space.
+    Calls the /diaritzar_audio endpoint and returns its response unchanged.
+    Parameters
+    ----------
+    audio_path : str
+        Path to the audio file to be diarized.
+    Returns
+    -------
+    Any
+        Raw endpoint output. extract_audio_and_diarize() in this module unpacks it as a
+        two-element sequence ``(clip_paths, segments)``, which is why it is not annotated
+        as ``str`` — TODO confirm the exact shape against the Space's API.
+    """
+    result = _get_asr_client().predict(
+        wav_file=handle_file(audio_path),
+        api_name="/diaritzar_audio"
+    )
+    return result
+def transcribe_long_audio(audio_path: str) -> str:
+    """
+    Transcribe a long recording through the remote VeuReu/asr Space.
+    Sends the audio to the /transcribe_long_audio endpoint, the Space's entry point
+    for extended-duration recordings, and returns the response unchanged.
+    Parameters
+    ----------
+    audio_path : str
+        Path to the long audio file to be transcribed.
+    Returns
+    -------
+    str
+        Transcribed text returned by the remote ASR service.
+    """
+    client = _get_asr_client()
+    audio_file = handle_file(audio_path)
+    return client.predict(wav_path=audio_file, api_name="/transcribe_long_audio")
+def transcribe_short_audio(audio_path: str) -> str:
+    """
+    Transcribe a short audio sample through the remote VeuReu/asr Space.
+    Sends the audio to the /transcribe_wav endpoint, the Space's entry point for
+    short-duration samples, and returns the response unchanged.
+    Parameters
+    ----------
+    audio_path : str
+        Path to the short audio file to be transcribed.
+    Returns
+    -------
+    str
+        Transcribed text returned by the remote service.
+    """
+    client = _get_asr_client()
+    audio_file = handle_file(audio_path)
+    return client.predict(wav_path=audio_file, api_name="/transcribe_wav")
+def identificar_veu(clip_path: str, voice_col: List[Dict[str, Any]]):
+    """
+    Identify the speaker of an audio clip through the remote VeuReu/asr Space.
+    The known-voice collection is serialized to a JSON string and sent, together with
+    the clip, to the /identificar_veu endpoint.
+    Parameters
+    ----------
+    clip_path : str
+        Path to the audio clip whose speaker is to be identified.
+    voice_col : List[Dict[str, Any]]
+        Known voice profiles (metadata or embeddings); must be JSON-serializable.
+    Returns
+    -------
+    Any
+        Output returned by the remote speaker identification endpoint.
+    """
+    # Serialize before touching the client so a non-serializable collection fails early.
+    serialized_voices = json.dumps(voice_col)
+    client = _get_asr_client()
+    return client.predict(
+        wav_file=handle_file(clip_path),
+        voice_col=serialized_voices,
+        api_name="/identificar_veu",
+    )
+def get_voice_embedding(audio_path: str) -> List[float]:
+    """
+    Fetch a voice embedding vector from the remote VeuReu/asr Space.
+    Delegates to the /voice_embedding endpoint instead of running speaker
+    recognition locally. This call is best-effort: any failure is printed and
+    reported to the caller as an empty list rather than an exception.
+    Parameters
+    ----------
+    audio_path : str
+        Path to the audio file (WAV format preferred).
+    Returns
+    -------
+    List[float]
+        Embedding vector returned by the service, or an empty list when the
+        service returns nothing or the call fails.
+    """
+    try:
+        client = _get_asr_client()
+        embedding = client.predict(
+            wav_file=handle_file(audio_path),
+            api_name="/voice_embedding",
+        )
+        # Normalize any falsy response (None, empty list) to an empty list.
+        return embedding or []
+    except Exception as e:
+        print(f"[asr_client] get_voice_embedding error: {e}")
+        return []
+def extract_audio_and_diarize(video_path: str) -> Dict[str, Any]:
+    """
+    Extract the audio track of a video and diarize it in one call.
+    Chains extract_audio_from_video() and diarize_audio(). Failures never propagate:
+    they are printed and reported through the 'error' key of the returned dictionary.
+    Parameters
+    ----------
+    video_path : str
+        Path to the input video file.
+    Returns
+    -------
+    Dict[str, Any]
+        Always contains 'clips' (list of audio file paths) and 'segments' (diarization
+        info), both empty when nothing usable was produced. 'audio_path' is present once
+        audio extraction succeeded; 'error' is present when extraction returned nothing
+        or an exception was raised.
+    """
+    try:
+        # First extract audio
+        audio_path = extract_audio_from_video(video_path)
+        if not audio_path:
+            return {"clips": [], "segments": [], "error": "Audio extraction failed"}
+        # Then diarize
+        result = diarize_audio(audio_path)
+        # Only unpack a real (clips_paths, segments) sequence. A plain string also has a
+        # length, and indexing it would report single characters as clips and segments.
+        if isinstance(result, (list, tuple)) and len(result) >= 2:
+            return {
+                "clips": result[0] or [],
+                "segments": result[1] or [],
+                "audio_path": audio_path,
+            }
+        return {"clips": [], "segments": [], "audio_path": audio_path}
+    except Exception as e:
+        print(f"[asr_client] extract_audio_and_diarize error: {e}")
+        return {"clips": [], "segments": [], "error": str(e)}