Alexey committed on
Commit
7e359b9
·
unverified ·
1 Parent(s): 0f9ddb3

fix: convert all audio to WAV 16kHz PCM before processing (#379)

Browse files
musetalk/data/dataset.py CHANGED
@@ -15,6 +15,7 @@ from decord.ndarray import cpu
15
 
16
  from musetalk.data.sample_method import get_src_idx, shift_landmarks_to_face_coordinates, resize_landmark
17
  from musetalk.data import audio
 
18
 
19
  syncnet_mel_step_size = math.ceil(16 / 5 * 16) # latentsync
20
 
@@ -171,7 +172,8 @@ class FaceDataset(Dataset):
171
  """
172
  if not os.path.exists(wav_path):
173
  return None
174
- audio_input_librosa, sampling_rate = librosa.load(wav_path, sr=16000)
 
175
  assert sampling_rate == 16000
176
 
177
  while start_index >= 25 * 30:
@@ -206,11 +208,12 @@ class FaceDataset(Dataset):
206
  if not os.path.exists(wav_path):
207
  return None
208
 
209
- audio_input, sampling_rate = librosa.load(wav_path, sr=16000)
 
210
  assert sampling_rate == 16000
211
 
212
- audio_input = self.mel_feature_extractor(audio_input)
213
- return audio_input, start_index
214
 
215
  def mel_feature_extractor(self, audio_input):
216
  """Extract mel spectrogram features
 
15
 
16
  from musetalk.data.sample_method import get_src_idx, shift_landmarks_to_face_coordinates, resize_landmark
17
  from musetalk.data import audio
18
+ from musetalk.utils.audio_utils import ensure_wav
19
 
20
  syncnet_mel_step_size = math.ceil(16 / 5 * 16) # latentsync
21
 
 
172
  """
173
  if not os.path.exists(wav_path):
174
  return None
175
+ wav_path_converted = ensure_wav(wav_path)
176
+ audio_input_librosa, sampling_rate = librosa.load(wav_path_converted, sr=16000)
177
  assert sampling_rate == 16000
178
 
179
  while start_index >= 25 * 30:
 
208
  if not os.path.exists(wav_path):
209
  return None
210
 
211
+ wav_path_converted = ensure_wav(wav_path)
212
+ audio_input_librosa, sampling_rate = librosa.load(wav_path_converted, sr=16000)
213
  assert sampling_rate == 16000
214
 
215
+ audio_mel = self.mel_feature_extractor(audio_input_librosa)
216
+ return audio_mel, start_index
217
 
218
  def mel_feature_extractor(self, audio_input):
219
  """Extract mel spectrogram features
musetalk/utils/audio_utils.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os, subprocess
2
+
3
+ def ensure_wav(input_path: str, target_path: str | None = None) -> str:
4
+ """
5
+ Convert any audio (mp3/ogg/m4a/wav/…) to 16kHz mono PCM WAV via ffmpeg.
6
+ Returns path to the converted .wav (original if already correct).
7
+ """
8
+ if not isinstance(input_path, str) or not os.path.exists(input_path):
9
+ return input_path
10
+ base, ext = os.path.splitext(input_path)
11
+ ext = ext.lower()
12
+
13
+ if target_path is None:
14
+ target_path = base + "_16k.wav"
15
+ cmd = ["ffmpeg", "-y", "-i", input_path, "-ar", "16000", "-ac", "1", "-c:a", "pcm_s16le", target_path]
16
+ subprocess.run(cmd, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
17
+ return target_path