Spaces:

jasspier
/

asr_arena

Runtime error

App Files Community

jasspier committed on May 29, 2024

Commit

ca625d0

verified ·

1 Parent(s): 4181e49

Update app.py

Browse files

Files changed (1) hide show

app.py +7 -22

app.py CHANGED Viewed

@@ -2,18 +2,7 @@ import gradio as gr
 import torch
 import torchaudio
 from torchaudio.transforms import Resample
-# 定义一个假设的 ASR 模型结构
-class ASRModel(torch.nn.Module):
-    def __init__(self):
-        super(ASRModel, self).__init__()
-        self.lstm = torch.nn.LSTM(input_size=160, hidden_size=256, num_layers=3, batch_first=True)
-        self.linear = torch.nn.Linear(256, 29)  # 假设有 29 个输出类用于字符
-    def forward(self, x):
-        x, _ = self.lstm(x)
-        x = self.linear(x)
-        return x
 # 定义模型路径
 model_path = "https://huggingface.co/Tele-AI/TeleSpeech-ASR1.0/resolve/main/finetune_large_kespeech.pt"
@@ -23,8 +12,9 @@ print("Downloading model file...")
 torch.hub.download_url_to_file(model_path, 'large.pt')
 print("Model file downloaded.")
-# 初始化模型
-model = ASRModel()
 # 加载模型参数
 print("Loading model checkpoint...")
@@ -56,16 +46,11 @@ def transcribe(audio):
     waveform = resample(waveform).squeeze()
     # 将输入数据转换为符合模型预期的形状
-    num_frames = waveform.size(0)
-    if num_frames % 160 != 0:
-        # 如果样本数量不是160的倍数，则填充样本
-        num_frames_padded = ((num_frames // 160) + 1) * 160
-        padding = num_frames_padded - num_frames
-        waveform = torch.nn.functional.pad(waveform, (0, padding))
-    input_values = waveform.view(-1, 160).unsqueeze(0)  # 确保输入形状为 (batch_size, seq_len, input_size)
     with torch.no_grad():
-        logits = model(input_values)
     predicted_ids = torch.argmax(logits, dim=-1)
     transcription = ''.join([chr(i) for i in predicted_ids[0].tolist()])  # 解码预测到字符
     print("Transcription:", transcription)

 import torch
 import torchaudio
 from torchaudio.transforms import Resample
+from data2vec2 import Data2VecMultiModel, Data2VecMultiConfig, Modality
 # 定义模型路径
 model_path = "https://huggingface.co/Tele-AI/TeleSpeech-ASR1.0/resolve/main/finetune_large_kespeech.pt"
 torch.hub.download_url_to_file(model_path, 'large.pt')
 print("Model file downloaded.")
+# 加载模型配置和初始化模型
+config = Data2VecMultiConfig()
+model = Data2VecMultiModel(config, modalities=[Modality.AUDIO])
 # 加载模型参数
 print("Loading model checkpoint...")
     waveform = resample(waveform).squeeze()
     # 将输入数据转换为符合模型预期的形状
+    input_values = waveform.unsqueeze(0)  # (batch_size, seq_len)
     with torch.no_grad():
+        outputs = model.extract_features(input_values, mode='AUDIO')
+        logits = outputs["x"]
     predicted_ids = torch.argmax(logits, dim=-1)
     transcription = ''.join([chr(i) for i in predicted_ids[0].tolist()])  # 解码预测到字符
     print("Transcription:", transcription)