Upload 2 files

audio_tools.py  (+17, -7)  CHANGED
@@ -115,7 +115,7 @@ def transcribe_audio_remote(audio_path: str | Path, cfg: Dict[str, Any]) -> Dict
     model_name = (cfg.get("models", {}).get("asr") or "whisper-catalan")
     params = {
         "language": "ca",
-
+        # remote ASR model is configured server-side; avoid 'model' to not clash with router arg
         "timestamps": True,
         "diarization": False,  # diarization stays local
     }
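For reference, the `or` in the model lookup means any falsy config value (missing key, empty string, None) falls back to the default name. A quick illustration with a made-up cfg dict:

```python
# Illustration of the fallback used above; the cfg contents here are made up.
cfg = {"models": {"asr": ""}}  # empty string is falsy
model_name = (cfg.get("models", {}).get("asr") or "whisper-catalan")
print(model_name)  # -> "whisper-catalan"

cfg = {}  # section missing entirely
model_name = (cfg.get("models", {}).get("asr") or "whisper-catalan")
print(model_name)  # -> "whisper-catalan"
```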
@@ -252,6 +252,12 @@ class VoiceEmbedder:
         self.model.eval()

     def embed(self, wav_path: str) -> List[float]:
+        # ensure we have a torch handle without creating a local var that shadows outer scope
+        try:
+            import torch as _torch  # local alias, avoids scoping issues
+        except Exception:
+            _torch = None  # type: ignore
+
         if HAS_TORCHAUDIO:
             waveform, sr = ta.load(wav_path)
             target_sr = 16000
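The local alias works around Python's scoping rule: any assignment inside a function body, including an import statement, makes that name local for the whole function, so a conditional import under the bare name torch can raise UnboundLocalError on the path that skips it. A minimal sketch of the pitfall and of the pattern used in the diff (hypothetical functions, not from audio_tools.py):

```python
# Sketch of the scoping pitfall the local '_torch' alias avoids.
def embed_broken(use_torch: bool):
    if use_torch:
        import torch  # binds 'torch' as a local name for the whole function body
    return torch.tensor([0.0])  # UnboundLocalError when use_torch is False

def embed_safe(use_torch: bool):
    try:
        import torch as _torch  # distinct local name, never shadows an outer 'torch'
    except Exception:
        _torch = None
    if _torch is None:
        raise RuntimeError("Torch not available")
    return _torch.tensor([0.0])
```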
@@ -262,9 +268,12 @@ class VoiceEmbedder:
             min_samples = int(0.2 * target_sr)
             if waveform.shape[1] < min_samples:
                 pad = min_samples - waveform.shape[1]
-
-
-
+                if _torch is None:
+                    raise RuntimeError("Torch not available for padding")
+                waveform = _torch.cat([waveform, _torch.zeros((1, pad))], dim=1)
+            if _torch is None:
+                raise RuntimeError("Torch not available for inference")
+            with _torch.no_grad():  # type: ignore
                 emb = self.model.encode_batch(waveform).squeeze().cpu().numpy().astype(float)
             return emb.tolist()
         else:
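The padding branch enforces a 0.2 s floor, i.e. int(0.2 * 16000) = 3200 samples at the target rate, by right-padding short clips with zeros before embedding; the no_grad context only disables gradient tracking for the forward pass. A standalone sketch of the same padding, assuming a mono (1, N) float tensor:

```python
# Standalone sketch of the 0.2 s zero-padding used in the torchaudio branch.
import torch

target_sr = 16000
waveform = torch.randn(1, 1000)      # ~62 ms at 16 kHz, below the floor
min_samples = int(0.2 * target_sr)   # 3200 samples

if waveform.shape[1] < min_samples:
    pad = min_samples - waveform.shape[1]
    waveform = torch.cat([waveform, torch.zeros((1, pad))], dim=1)

print(waveform.shape)  # torch.Size([1, 3200])
```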
@@ -272,9 +281,10 @@ class VoiceEmbedder:
             min_len = int(0.2 * 16000)
             if len(y) < min_len:
                 y = np.pad(y, (0, min_len - len(y)))
-
-
-
+            if _torch is None:
+                raise RuntimeError("Torch not available for inference")
+            w = _torch.from_numpy(y).unsqueeze(0).unsqueeze(0)
+            with _torch.no_grad():  # type: ignore
                 emb = self.model.encode_batch(w).squeeze().cpu().numpy().astype(float)
             return emb.tolist()

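The fallback branch applies the same 0.2 s floor with numpy, then reshapes the signal into a batch tensor via two unsqueeze calls before encode_batch. A sketch of that shaping, with y standing in for a mono signal loaded elsewhere (the values are made up):

```python
# Sketch of the numpy fallback shaping; y stands in for audio decoded by another library.
import numpy as np
import torch

y = np.random.randn(1000).astype(np.float32)  # shorter than 0.2 s at 16 kHz
min_len = int(0.2 * 16000)

if len(y) < min_len:
    y = np.pad(y, (0, min_len - len(y)))      # right-pad with zeros to 3200 samples

w = torch.from_numpy(y).unsqueeze(0).unsqueeze(0)
print(w.shape)  # torch.Size([1, 1, 3200])
```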