Spaces:
Sleeping
Sleeping
| import torch | |
| import torchaudio | |
| import torchvision | |
| resample_rate = 16000 | |
| def process_audio_data(waveform, sample_rate): | |
| try: | |
| waveform = waveform[0] # 使用左声道 | |
| waveform = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=resample_rate)(waveform) | |
| if waveform.size(0) < 3 * resample_rate: | |
| waveform = torch.nn.functional.pad(waveform, (0, 3 * resample_rate - waveform.size(0))) | |
| else: | |
| waveform = waveform[: 3 * resample_rate] | |
| mfcc = torchaudio.transforms.MFCC( | |
| sample_rate=resample_rate, | |
| n_mfcc=13, | |
| melkwargs={ | |
| "n_fft": 256, | |
| "win_length": 256, | |
| "hop_length": 128, | |
| "n_mels": 40, | |
| } | |
| )(waveform) | |
| return mfcc | |
| except Exception as e: | |
| print(f"ERR!: Error in audio processing: {e}") | |
| return None | |
| def process_image_data(image): | |
| try: | |
| image = torchvision.transforms.Resize((1080, 1080))(image) | |
| image = image / 255.0 | |
| image = torchvision.transforms.Normalize( | |
| mean=[0.485, 0.456, 0.406], | |
| std=[0.229, 0.224, 0.225] | |
| )(image) | |
| return image | |
| except Exception as e: | |
| print(f"ERR!: Error in image processing: {e}") | |
| return None | |