"""Thin client wrappers for the remote VeuReu/asr Gradio Space."""

import os

# Pin the visible GPU before any CUDA-aware library is imported.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

import json
from typing import Any, Dict, List

from gradio_client import Client, handle_file

# Module-level singleton so every helper reuses one connection to the Space.
_asr_client = None


def _get_asr_client() -> Client:
    """Get or create the ASR client (lazy initialization)."""
    global _asr_client
    if _asr_client is None:
        _asr_client = Client("VeuReu/asr")
    return _asr_client


def extract_audio_from_video(video_path: str) -> str:
    """
    Call the /extract_audio_ffmpeg endpoint of the remote VeuReu/asr Space.

    This function uploads a video file to the remote ASR service and extracts
    its audio track.

    Parameters
    ----------
    video_path : str
        Path to the input video file from which audio will be extracted.

    Returns
    -------
    str
        Path or identifier of the extracted audio file returned by the remote
        service.
    """
    result = _get_asr_client().predict(
        video_file={"video": handle_file(video_path)},
        api_name="/extract_audio_ffmpeg"
    )
    return result
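
# Illustrative usage (requires network access to the VeuReu/asr Space; the
# path below is hypothetical):
#   audio_path = extract_audio_from_video("/path/to/interview.mp4")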


def diarize_audio(audio_path: str) -> Any:
    """
    Call the /diaritzar_audio endpoint of the remote VeuReu/asr Space.

    This function performs speaker diarization, identifying segments of speech
    belonging to different speakers in the audio file.

    Parameters
    ----------
    audio_path : str
        Path to the audio file to be diarized.

    Returns
    -------
    Any
        Diarization output containing speaker segments and timings; as
        consumed by ``extract_audio_and_diarize`` below, element 0 holds the
        per-speaker audio clip paths and element 1 the speaker segments.
    """
    result = _get_asr_client().predict(
        wav_file=handle_file(audio_path),
        api_name="/diaritzar_audio"
    )
    return result
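
# Illustrative unpacking of the diarization result, mirroring how
# extract_audio_and_diarize() consumes it below (the payload shape is an
# assumption based on that usage; the input path is hypothetical):
#   clips, segments = diarize_audio("/path/to/audio.wav")[:2]
#   for seg in segments:
#       print(seg)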


def transcribe_long_audio(audio_path: str) -> str:
    """
    Call the /transcribe_long_audio endpoint of the remote VeuReu/asr Space.

    Designed for long audio recordings, this function sends the audio to the
    ASR model optimized for processing extended durations.

    Parameters
    ----------
    audio_path : str
        Path to the long audio file to be transcribed.

    Returns
    -------
    str
        Transcribed text returned by the remote ASR service.
    """
    result = _get_asr_client().predict(
        wav_path=handle_file(audio_path),
        api_name="/transcribe_long_audio"
    )
    return result


def transcribe_short_audio(audio_path: str) -> str:
    """
    Call the /transcribe_wav endpoint of the remote VeuReu/asr Space.

    This function is optimized for short-duration audio samples and produces
    fast transcriptions.

    Parameters
    ----------
    audio_path : str
        Path to the short audio file to be transcribed.

    Returns
    -------
    str
        Transcribed text returned by the remote service.
    """
    result = _get_asr_client().predict(
        wav_path=handle_file(audio_path),
        api_name="/transcribe_wav"
    )
    return result
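
# A hedged convenience sketch (not part of the remote API): pick the long- or
# short-audio endpoint based on the clip duration. Assumes a plain WAV input;
# the 30-second threshold is an arbitrary illustration, not a documented limit.
def transcribe_auto(audio_path: str, long_threshold_s: float = 30.0) -> str:
    """Route to transcribe_long_audio or transcribe_short_audio by duration."""
    import contextlib
    import wave

    # Read the duration from the WAV header without loading the samples.
    with contextlib.closing(wave.open(audio_path, "rb")) as wf:
        duration_s = wf.getnframes() / float(wf.getframerate())
    if duration_s > long_threshold_s:
        return transcribe_long_audio(audio_path)
    return transcribe_short_audio(audio_path)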


def identificar_veu(clip_path: str, voice_col: List[Dict[str, Any]]) -> Any:
    """
    Call the /identificar_veu endpoint of the remote VeuReu/asr Space.

    This function attempts to identify which known speaker (from a provided
    collection of voice profiles) appears in the given audio clip. The voice
    collection is JSON-serialized before being sent to the endpoint.

    Parameters
    ----------
    clip_path : str
        Path to the audio clip whose speaker is to be identified.
    voice_col : List[Dict[str, Any]]
        List of dictionaries containing metadata or embeddings for known voices.

    Returns
    -------
    Any
        Output returned by the remote speaker identification model.
    """
    voice_col_str = json.dumps(voice_col)
    result = _get_asr_client().predict(
        wav_file=handle_file(clip_path),
        voice_col=voice_col_str,
        api_name="/identificar_veu"
    )
    return result
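
# Illustrative call; the schema of each voice-profile dict is an assumption
# (this module only JSON-serializes whatever list it is given), and the
# "name"/"embedding" keys plus the paths below are hypothetical:
#   profiles = [{"name": "anna", "embedding": get_voice_embedding("/path/to/anna.wav")}]
#   match = identificar_veu("/path/to/unknown_clip.wav", profiles)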


def get_voice_embedding(audio_path: str) -> List[float]:
    """
    Call the /voice_embedding endpoint to get a voice embedding vector.

    This replaces local SpeakerRecognition processing by delegating to the
    asr Space.

    Parameters
    ----------
    audio_path : str
        Path to the audio file (WAV format preferred).

    Returns
    -------
    List[float]
        Normalized embedding vector for the voice, or an empty list on error.
    """
    try:
        result = _get_asr_client().predict(
            wav_file=handle_file(audio_path),
            api_name="/voice_embedding"
        )
        return result if result else []
    except Exception as e:
        print(f"[asr_client] get_voice_embedding error: {e}")
        return []
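
# A minimal sketch comparing two voices by cosine similarity of their
# embeddings. The docstring above says the vectors are normalized, in which
# case the dot product alone would suffice; the explicit norms below make the
# sketch safe either way. The paths in the usage note are hypothetical.
def cosine_similarity(a: List[float], b: List[float]) -> float:
    """Cosine similarity of two equal-length embedding vectors."""
    import math

    dot = sum(x * y for x, y in zip(a, b))
    norm_a = math.sqrt(sum(x * x for x in a))
    norm_b = math.sqrt(sum(x * x for x in b))
    if norm_a == 0.0 or norm_b == 0.0:
        return 0.0
    return dot / (norm_a * norm_b)

# Usage:
#   emb1 = get_voice_embedding("/path/to/speaker1.wav")
#   emb2 = get_voice_embedding("/path/to/speaker2.wav")
#   print(cosine_similarity(emb1, emb2))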


def extract_audio_and_diarize(video_path: str) -> Dict[str, Any]:
    """
    Extract audio from video and perform diarization in one call.

    Parameters
    ----------
    video_path : str
        Path to the input video file.

    Returns
    -------
    Dict[str, Any]
        Dictionary with 'clips' (list of audio file paths) and 'segments'
        (diarization info).
    """
    try:
        # Step 1: pull the audio track out of the video via the remote Space.
        audio_path = extract_audio_from_video(video_path)
        if not audio_path:
            return {"clips": [], "segments": [], "error": "Audio extraction failed"}

        # Step 2: diarize the extracted audio.
        result = diarize_audio(audio_path)

        # The endpoint is expected to return at least (clips, segments).
        if result and len(result) >= 2:
            return {
                "clips": result[0] if result[0] else [],
                "segments": result[1] if result[1] else [],
                "audio_path": audio_path,
            }
        return {"clips": [], "segments": [], "audio_path": audio_path}
    except Exception as e:
        print(f"[asr_client] extract_audio_and_diarize error: {e}")
        return {"clips": [], "segments": [], "error": str(e)}
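
# A hedged smoke test, runnable only with network access to the VeuReu/asr
# Space; the video path is hypothetical.
if __name__ == "__main__":
    demo_video = "/path/to/demo.mp4"
    diarization = extract_audio_and_diarize(demo_video)
    print(f"clips: {len(diarization['clips'])}, segments: {len(diarization['segments'])}")
    if diarization.get("audio_path"):
        # Full-recording transcription via the long-audio endpoint.
        print(transcribe_long_audio(diarization["audio_path"]))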