Alexey
committed on
fix: convert all audio to WAV 16kHz PCM before processing (#379)
Browse files
- musetalk/data/dataset.py +7 -4
- musetalk/utils/audio_utils.py +17 -0
musetalk/data/dataset.py
CHANGED
|
@@ -15,6 +15,7 @@ from decord.ndarray import cpu
|
|
| 15 |
|
| 16 |
from musetalk.data.sample_method import get_src_idx, shift_landmarks_to_face_coordinates, resize_landmark
|
| 17 |
from musetalk.data import audio
|
|
|
|
| 18 |
|
| 19 |
syncnet_mel_step_size = math.ceil(16 / 5 * 16) # latentsync
|
| 20 |
|
|
@@ -171,7 +172,8 @@ class FaceDataset(Dataset):
|
|
| 171 |
"""
|
| 172 |
if not os.path.exists(wav_path):
|
| 173 |
return None
|
| 174 |
-
|
|
|
|
| 175 |
assert sampling_rate == 16000
|
| 176 |
|
| 177 |
while start_index >= 25 * 30:
|
|
@@ -206,11 +208,12 @@ class FaceDataset(Dataset):
|
|
| 206 |
if not os.path.exists(wav_path):
|
| 207 |
return None
|
| 208 |
|
| 209 |
-
|
|
|
|
| 210 |
assert sampling_rate == 16000
|
| 211 |
|
| 212 |
-
|
| 213 |
-
return
|
| 214 |
|
| 215 |
def mel_feature_extractor(self, audio_input):
|
| 216 |
"""Extract mel spectrogram features
|
|
|
|
| 15 |
|
| 16 |
from musetalk.data.sample_method import get_src_idx, shift_landmarks_to_face_coordinates, resize_landmark
|
| 17 |
from musetalk.data import audio
|
| 18 |
+
from musetalk.utils.audio_utils import ensure_wav
|
| 19 |
|
| 20 |
syncnet_mel_step_size = math.ceil(16 / 5 * 16) # latentsync
|
| 21 |
|
|
|
|
| 172 |
"""
|
| 173 |
if not os.path.exists(wav_path):
|
| 174 |
return None
|
| 175 |
+
wav_path_converted = ensure_wav(wav_path)
|
| 176 |
+
audio_input_librosa, sampling_rate = librosa.load(wav_path_converted, sr=16000)
|
| 177 |
assert sampling_rate == 16000
|
| 178 |
|
| 179 |
while start_index >= 25 * 30:
|
|
|
|
| 208 |
if not os.path.exists(wav_path):
|
| 209 |
return None
|
| 210 |
|
| 211 |
+
wav_path_converted = ensure_wav(wav_path)
|
| 212 |
+
audio_input_librosa, sampling_rate = librosa.load(wav_path_converted, sr=16000)
|
| 213 |
assert sampling_rate == 16000
|
| 214 |
|
| 215 |
+
audio_mel = self.mel_feature_extractor(audio_input_librosa)
|
| 216 |
+
return audio_mel, start_index
|
| 217 |
|
| 218 |
def mel_feature_extractor(self, audio_input):
|
| 219 |
"""Extract mel spectrogram features
|
musetalk/utils/audio_utils.py
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os, subprocess
|
| 2 |
+
|
| 3 |
+
def ensure_wav(input_path: str, target_path: str | None = None) -> str:
|
| 4 |
+
"""
|
| 5 |
+
Convert any audio (mp3/ogg/m4a/wav/…) to 16kHz mono PCM WAV via ffmpeg.
|
| 6 |
+
Returns path to the converted .wav (original if already correct).
|
| 7 |
+
"""
|
| 8 |
+
if not isinstance(input_path, str) or not os.path.exists(input_path):
|
| 9 |
+
return input_path
|
| 10 |
+
base, ext = os.path.splitext(input_path)
|
| 11 |
+
ext = ext.lower()
|
| 12 |
+
|
| 13 |
+
if target_path is None:
|
| 14 |
+
target_path = base + "_16k.wav"
|
| 15 |
+
cmd = ["ffmpeg", "-y", "-i", input_path, "-ar", "16000", "-ac", "1", "-c:a", "pcm_s16le", target_path]
|
| 16 |
+
subprocess.run(cmd, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
|
| 17 |
+
return target_path
|