Upload 2 files

audio_tools.py  (+17, -7)  CHANGED
@@ -115,7 +115,7 @@ def transcribe_audio_remote(audio_path: str | Path, cfg: Dict[str, Any]) -> Dict
     model_name = (cfg.get("models", {}).get("asr") or "whisper-catalan")
     params = {
         "language": "ca",
-
+        # remote ASR model is configured server-side; avoid 'model' to not clash with router arg
         "timestamps": True,
         "diarization": False,  # diarization stays local
     }
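For reference, the `or` in the model lookup means any falsy config value (missing key, empty string, None) falls back to the default name. A quick illustration with a made-up cfg dict:

```python
# Illustration of the fallback used above; the cfg contents here are made up.
cfg = {"models": {"asr": ""}}  # empty string is falsy
model_name = (cfg.get("models", {}).get("asr") or "whisper-catalan")
print(model_name)  # -> "whisper-catalan"

cfg = {}  # section missing entirely
model_name = (cfg.get("models", {}).get("asr") or "whisper-catalan")
print(model_name)  # -> "whisper-catalan"
```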
@@ -252,6 +252,12 @@ class VoiceEmbedder:
         self.model.eval()

     def embed(self, wav_path: str) -> List[float]:
+        # ensure we have a torch handle without creating a local var that shadows outer scope
+        try:
+            import torch as _torch  # local alias, avoids scoping issues
+        except Exception:
+            _torch = None  # type: ignore
+
         if HAS_TORCHAUDIO:
             waveform, sr = ta.load(wav_path)
             target_sr = 16000
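The local alias works around Python's scoping rule: any assignment inside a function body, including an import statement, makes that name local for the whole function, so a conditional import under the bare name torch can raise UnboundLocalError on the path that skips it. A minimal sketch of the pitfall and of the pattern used in the diff (hypothetical functions, not from audio_tools.py):

```python
# Sketch of the scoping pitfall the local '_torch' alias avoids.
def embed_broken(use_torch: bool):
    if use_torch:
        import torch  # binds 'torch' as a local name for the whole function body
    return torch.tensor([0.0])  # UnboundLocalError when use_torch is False

def embed_safe(use_torch: bool):
    try:
        import torch as _torch  # distinct local name, never shadows an outer 'torch'
    except Exception:
        _torch = None
    if _torch is None:
        raise RuntimeError("Torch not available")
    return _torch.tensor([0.0])
```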
@@ -262,9 +268,12 @@ class VoiceEmbedder:
             min_samples = int(0.2 * target_sr)
             if waveform.shape[1] < min_samples:
                 pad = min_samples - waveform.shape[1]
-
-
-
+                if _torch is None:
+                    raise RuntimeError("Torch not available for padding")
+                waveform = _torch.cat([waveform, _torch.zeros((1, pad))], dim=1)
+            if _torch is None:
+                raise RuntimeError("Torch not available for inference")
+            with _torch.no_grad():  # type: ignore
                 emb = self.model.encode_batch(waveform).squeeze().cpu().numpy().astype(float)
             return emb.tolist()
         else:
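The padding branch enforces a 0.2 s floor, i.e. int(0.2 * 16000) = 3200 samples at the target rate, by right-padding short clips with zeros before embedding; the no_grad context only disables gradient tracking for the forward pass. A standalone sketch of the same padding, assuming a mono (1, N) float tensor:

```python
# Standalone sketch of the 0.2 s zero-padding used in the torchaudio branch.
import torch

target_sr = 16000
waveform = torch.randn(1, 1000)      # ~62 ms at 16 kHz, below the floor
min_samples = int(0.2 * target_sr)   # 3200 samples

if waveform.shape[1] < min_samples:
    pad = min_samples - waveform.shape[1]
    waveform = torch.cat([waveform, torch.zeros((1, pad))], dim=1)

print(waveform.shape)  # torch.Size([1, 3200])
```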
@@ -272,9 +281,10 @@ class VoiceEmbedder:
             min_len = int(0.2 * 16000)
             if len(y) < min_len:
                 y = np.pad(y, (0, min_len - len(y)))
-
-
-
+            if _torch is None:
+                raise RuntimeError("Torch not available for inference")
+            w = _torch.from_numpy(y).unsqueeze(0).unsqueeze(0)
+            with _torch.no_grad():  # type: ignore
                 emb = self.model.encode_batch(w).squeeze().cpu().numpy().astype(float)
             return emb.tolist()

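The fallback branch applies the same 0.2 s floor with numpy, then reshapes the signal into a batch tensor via two unsqueeze calls before encode_batch. A sketch of that shaping, with y standing in for a mono signal loaded elsewhere (the values are made up):

```python
# Sketch of the numpy fallback shaping; y stands in for audio decoded by another library.
import numpy as np
import torch

y = np.random.randn(1000).astype(np.float32)  # shorter than 0.2 s at 16 kHz
min_len = int(0.2 * 16000)

if len(y) < min_len:
    y = np.pad(y, (0, min_len - len(y)))      # right-pad with zeros to 3200 samples

w = torch.from_numpy(y).unsqueeze(0).unsqueeze(0)
print(w.shape)  # torch.Size([1, 1, 3200])
```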