VeuReu commited on
Commit
413fec6
·
verified ·
1 Parent(s): ac342d0

Update asr_client.py

Browse files
Files changed (1) hide show
  1. asr_client.py +202 -202
asr_client.py CHANGED
@@ -1,202 +1,202 @@
1
- import os
2
- os.environ["CUDA_VISIBLE_DEVICES"] = "1"
3
-
4
- from gradio_client import Client, handle_file
5
- from typing import Any, Dict, List
6
- from PIL import Image
7
- import json
8
-
9
- # Lazy initialization to avoid crash if Space is down at import time
10
- _asr_client = None
11
-
12
-
13
- def _get_asr_client():
14
- """Get or create the ASR client (lazy initialization)."""
15
- global _asr_client
16
- if _asr_client is None:
17
- _asr_client = Client("VeuReu/asr")
18
- return _asr_client
19
-
20
-
21
- def extract_audio_from_video(video_path: str) -> str:
22
- """
23
- Call the /extract_audio_ffmpeg endpoint of the remote VeuReu/asr Space.
24
-
25
- This function uploads a video file to the remote ASR service and extracts its audio track.
26
-
27
- Parameters
28
- ----------
29
- video_path : str
30
- Path to the input video file from which audio will be extracted.
31
-
32
- Returns
33
- -------
34
- str
35
- Path or identifier of the extracted audio file returned by the remote service.
36
- """
37
- result = _get_asr_client().predict(
38
- video_file={"video": handle_file(video_path)},
39
- api_name="/extract_audio_ffmpeg"
40
- )
41
- return result
42
-
43
-
44
- def diarize_audio(audio_path: str) -> str:
45
- """
46
- Call the /diaritzar_audio endpoint of the remote VeuReu/asr Space.
47
-
48
- This function performs speaker diarization, identifying segments of speech
49
- belonging to different speakers in the audio file.
50
-
51
- Parameters
52
- ----------
53
- audio_path : str
54
- Path to the audio file to be diarized.
55
-
56
- Returns
57
- -------
58
- str
59
- JSON-like diarization output containing speaker segments and timings.
60
- """
61
- result = _get_asr_client().predict(
62
- wav_archivo=handle_file(audio_path),
63
- api_name="/diaritzar_audio"
64
- )
65
- return result
66
-
67
-
68
- def transcribe_long_audio(audio_path: str) -> str:
69
- """
70
- Call the /transcribe_long_audio endpoint of the remote VeuReu/asr Space.
71
-
72
- Designed for long audio recordings, this function sends the audio to the ASR model
73
- optimized for processing extended durations.
74
-
75
- Parameters
76
- ----------
77
- audio_path : str
78
- Path to the long audio file to be transcribed.
79
-
80
- Returns
81
- -------
82
- str
83
- Transcribed text returned by the remote ASR service.
84
- """
85
- result = _get_asr_client().predict(
86
- wav_path=handle_file(audio_path),
87
- api_name="/transcribe_long_audio"
88
- )
89
- return result
90
-
91
-
92
- def transcribe_short_audio(audio_path: str) -> str:
93
- """
94
- Call the /transcribe_wav endpoint of the remote VeuReu/asr Space.
95
-
96
- This function is optimized for short-duration audio samples and produces fast transcriptions.
97
-
98
- Parameters
99
- ----------
100
- audio_path : str
101
- Path to the short audio file to be transcribed.
102
-
103
- Returns
104
- -------
105
- str
106
- Transcribed text returned by the remote service.
107
- """
108
- result = _get_asr_client().predict(
109
- wav_path=handle_file(audio_path),
110
- api_name="/transcribe_wav"
111
- )
112
- return result
113
-
114
-
115
- def identificar_veu(clip_path: str, voice_col: List[Dict[str, Any]]):
116
- """
117
- Call the /identificar_veu endpoint of the remote VeuReu/asr Space.
118
-
119
- This function attempts to identify which known speaker (from a provided
120
- collection of voice profiles) appears in the given audio clip.
121
-
122
- Parameters
123
- ----------
124
- clip_path : str
125
- Path to the audio clip whose speaker is to be identified.
126
- voice_col : List[Dict[str, Any]]
127
- List of dictionaries containing metadata or embeddings for known voices.
128
-
129
- Returns
130
- -------
131
- Any
132
- Output returned by the remote speaker identification model.
133
- """
134
- voice_col_str = json.dumps(voice_col)
135
- result = _get_asr_client().predict(
136
- wav_archivo=handle_file(clip_path),
137
- voice_col=voice_col_str,
138
- api_name="/identificar_veu"
139
- )
140
- return result
141
-
142
-
143
- def get_voice_embedding(audio_path: str) -> List[float]:
144
- """
145
- Call the /voice_embedding endpoint to get a voice embedding vector.
146
-
147
- This replaces local SpeakerRecognition processing by delegating to asr Space.
148
-
149
- Parameters
150
- ----------
151
- audio_path : str
152
- Path to the audio file (WAV format preferred).
153
-
154
- Returns
155
- -------
156
- List[float]
157
- Normalized embedding vector for the voice, or empty list on error.
158
- """
159
- try:
160
- result = _get_asr_client().predict(
161
- wav_archivo=handle_file(audio_path),
162
- api_name="/voice_embedding"
163
- )
164
- return result if result else []
165
- except Exception as e:
166
- print(f"[asr_client] get_voice_embedding error: {e}")
167
- return []
168
-
169
-
170
- def extract_audio_and_diarize(video_path: str) -> Dict[str, Any]:
171
- """
172
- Extract audio from video and perform diarization in one call.
173
-
174
- Parameters
175
- ----------
176
- video_path : str
177
- Path to the input video file.
178
-
179
- Returns
180
- -------
181
- Dict[str, Any]
182
- Dictionary with 'clips' (list of audio file paths) and 'segments' (diarization info).
183
- """
184
- try:
185
- # First extract audio
186
- audio_path = extract_audio_from_video(video_path)
187
- if not audio_path:
188
- return {"clips": [], "segments": [], "error": "Audio extraction failed"}
189
-
190
- # Then diarize
191
- result = diarize_audio(audio_path)
192
- # result is tuple: (clips_paths, segments)
193
- if result and len(result) >= 2:
194
- return {
195
- "clips": result[0] if result[0] else [],
196
- "segments": result[1] if result[1] else [],
197
- "audio_path": audio_path,
198
- }
199
- return {"clips": [], "segments": [], "audio_path": audio_path}
200
- except Exception as e:
201
- print(f"[asr_client] extract_audio_and_diarize error: {e}")
202
- return {"clips": [], "segments": [], "error": str(e)}
 
1
+ import os
2
+ os.environ["CUDA_VISIBLE_DEVICES"] = "1"
3
+
4
+ from gradio_client import Client, handle_file
5
+ from typing import Any, Dict, List
6
+ from PIL import Image
7
+ import json
8
+
9
+ # Lazy initialization to avoid crash if Space is down at import time
10
+ _asr_client = None
11
+
12
+
13
def _get_asr_client():
    """Return the shared ASR client, creating it on first use.

    Lazy construction avoids crashing at import time when the remote
    VeuReu/asr Space is unreachable.
    """
    global _asr_client
    if _asr_client is not None:
        return _asr_client
    _asr_client = Client("VeuReu/asr")
    return _asr_client
19
+
20
+
21
def extract_audio_from_video(video_path: str) -> str:
    """Extract the audio track of a video via the remote VeuReu/asr Space.

    Uploads the video file to the /extract_audio_ffmpeg endpoint and
    returns whatever the remote service reports for the extracted audio.

    Parameters
    ----------
    video_path : str
        Path to the input video file from which audio will be extracted.

    Returns
    -------
    str
        Path or identifier of the extracted audio file returned by the
        remote service.
    """
    client = _get_asr_client()
    # The endpoint expects the upload wrapped in a {"video": ...} mapping.
    payload = {"video": handle_file(video_path)}
    return client.predict(
        video_file=payload,
        api_name="/extract_audio_ffmpeg",
    )
42
+
43
+
44
def diarize_audio(audio_path: str) -> Any:
    """
    Call the /diaritzar_audio endpoint of the remote VeuReu/asr Space.

    Performs speaker diarization, identifying segments of speech belonging
    to different speakers in the audio file.

    Parameters
    ----------
    audio_path : str
        Path to the audio file to be diarized.

    Returns
    -------
    Any
        Diarization output from the remote service. NOTE(review): the old
        ``-> str`` annotation was wrong — ``extract_audio_and_diarize``
        unpacks this result as a 2-tuple ``(clip_paths, segments)``, so the
        annotation is widened to ``Any`` here; confirm the exact shape
        against the Space's API.
    """
    result = _get_asr_client().predict(
        wav_file=handle_file(audio_path),
        api_name="/diaritzar_audio"
    )
    return result
66
+
67
+
68
def transcribe_long_audio(audio_path: str) -> str:
    """Transcribe a long audio recording via the remote VeuReu/asr Space.

    Sends the audio to the /transcribe_long_audio endpoint, which is the
    ASR entry point intended for extended recordings.

    Parameters
    ----------
    audio_path : str
        Path to the long audio file to be transcribed.

    Returns
    -------
    str
        Transcribed text returned by the remote ASR service.
    """
    return _get_asr_client().predict(
        wav_path=handle_file(audio_path),
        api_name="/transcribe_long_audio",
    )
90
+
91
+
92
def transcribe_short_audio(audio_path: str) -> str:
    """Transcribe a short audio clip via the remote VeuReu/asr Space.

    Sends the audio to the /transcribe_wav endpoint, the fast path for
    short-duration samples.

    Parameters
    ----------
    audio_path : str
        Path to the short audio file to be transcribed.

    Returns
    -------
    str
        Transcribed text returned by the remote service.
    """
    return _get_asr_client().predict(
        wav_path=handle_file(audio_path),
        api_name="/transcribe_wav",
    )
113
+
114
+
115
def identificar_veu(clip_path: str, voice_col: List[Dict[str, Any]]):
    """Identify the speaker in a clip via the remote VeuReu/asr Space.

    Sends the clip and a collection of known voice profiles to the
    /identificar_veu endpoint, which matches the clip against the profiles.

    Parameters
    ----------
    clip_path : str
        Path to the audio clip whose speaker is to be identified.
    voice_col : List[Dict[str, Any]]
        List of dictionaries containing metadata or embeddings for known
        voices; serialized to JSON before upload.

    Returns
    -------
    Any
        Output returned by the remote speaker identification model.
    """
    # The endpoint takes the voice collection as a JSON string, not a list.
    serialized_col = json.dumps(voice_col)
    client = _get_asr_client()
    return client.predict(
        wav_file=handle_file(clip_path),
        voice_col=serialized_col,
        api_name="/identificar_veu",
    )
141
+
142
+
143
def get_voice_embedding(audio_path: str) -> List[float]:
    """Fetch a voice embedding vector from the remote /voice_embedding endpoint.

    Delegates to the asr Space instead of running SpeakerRecognition
    locally. Best-effort: any failure is logged and yields an empty list.

    Parameters
    ----------
    audio_path : str
        Path to the audio file (WAV format preferred).

    Returns
    -------
    List[float]
        Normalized embedding vector for the voice, or an empty list on error.
    """
    try:
        # Client creation stays inside the try so a down Space is also
        # handled as a soft failure rather than a crash.
        embedding = _get_asr_client().predict(
            wav_file=handle_file(audio_path),
            api_name="/voice_embedding"
        )
    except Exception as e:
        print(f"[asr_client] get_voice_embedding error: {e}")
        return []
    return embedding if embedding else []
168
+
169
+
170
def extract_audio_and_diarize(video_path: str) -> Dict[str, Any]:
    """Extract audio from a video and diarize it in a single call.

    Chains :func:`extract_audio_from_video` and :func:`diarize_audio`,
    normalizing the result into a dictionary. Best-effort: any failure is
    logged and reported via an ``"error"`` key.

    Parameters
    ----------
    video_path : str
        Path to the input video file.

    Returns
    -------
    Dict[str, Any]
        Dictionary with 'clips' (list of audio file paths) and 'segments'
        (diarization info), plus 'audio_path' on success or 'error' on
        failure.
    """
    try:
        audio_path = extract_audio_from_video(video_path)
        if not audio_path:
            return {"clips": [], "segments": [], "error": "Audio extraction failed"}

        diarization = diarize_audio(audio_path)
        # The remote endpoint replies with a pair: (clip_paths, segments).
        if not diarization or len(diarization) < 2:
            return {"clips": [], "segments": [], "audio_path": audio_path}

        clips, segments = diarization[0], diarization[1]
        return {
            "clips": clips if clips else [],
            "segments": segments if segments else [],
            "audio_path": audio_path,
        }
    except Exception as e:
        print(f"[asr_client] extract_audio_and_diarize error: {e}")
        return {"clips": [], "segments": [], "error": str(e)}