fix: convert all audio to WAV 16kHz PCM before processing (#379)

Files changed (2) hide show

musetalk/data/dataset.py CHANGED Viewed

@@ -15,6 +15,7 @@ from decord.ndarray import cpu
 from musetalk.data.sample_method import get_src_idx, shift_landmarks_to_face_coordinates, resize_landmark
 from musetalk.data import audio
 syncnet_mel_step_size = math.ceil(16 / 5 * 16)  # latentsync
@@ -171,7 +172,8 @@ class FaceDataset(Dataset):
         """
         if not os.path.exists(wav_path):
             return None
-        audio_input_librosa, sampling_rate = librosa.load(wav_path, sr=16000)
         assert sampling_rate == 16000
         while start_index >= 25 * 30:
@@ -206,11 +208,12 @@ class FaceDataset(Dataset):
         if not os.path.exists(wav_path):
             return None
-        audio_input, sampling_rate = librosa.load(wav_path, sr=16000)
         assert sampling_rate == 16000
-        audio_input = self.mel_feature_extractor(audio_input)
-        return audio_input, start_index
     def mel_feature_extractor(self, audio_input):
         """Extract mel spectrogram features

 from musetalk.data.sample_method import get_src_idx, shift_landmarks_to_face_coordinates, resize_landmark
 from musetalk.data import audio
+from musetalk.utils.audio_utils import ensure_wav
 syncnet_mel_step_size = math.ceil(16 / 5 * 16)  # latentsync
         """
         if not os.path.exists(wav_path):
             return None
+        wav_path_converted = ensure_wav(wav_path)
+        audio_input_librosa, sampling_rate = librosa.load(wav_path_converted, sr=16000)
         assert sampling_rate == 16000
         while start_index >= 25 * 30:
         if not os.path.exists(wav_path):
             return None
+        wav_path_converted = ensure_wav(wav_path)
+        audio_input_librosa, sampling_rate = librosa.load(wav_path_converted, sr=16000)
         assert sampling_rate == 16000
+        audio_mel = self.mel_feature_extractor(audio_input_librosa)
+        return audio_mel, start_index
     def mel_feature_extractor(self, audio_input):
         """Extract mel spectrogram features

musetalk/utils/audio_utils.py ADDED Viewed

+import os, subprocess
+def ensure_wav(input_path: str, target_path: str | None = None) -> str:
+    """
+    Convert any audio (mp3/ogg/m4a/wav/…) to 16kHz mono PCM WAV via ffmpeg.
+    Returns path to the converted .wav (original if already correct).
+    """
+    if not isinstance(input_path, str) or not os.path.exists(input_path):
+        return input_path
+    base, ext = os.path.splitext(input_path)
+    ext = ext.lower()
+    if target_path is None:
+        target_path = base + "_16k.wav"
+    cmd = ["ffmpeg", "-y", "-i", input_path, "-ar", "16000", "-ac", "1", "-c:a", "pcm_s16le", target_path]
+    subprocess.run(cmd, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+    return target_path