Warholt committed on
Commit
4001385
·
1 Parent(s): 9ce51ee

add zero gpu supported inference

Browse files
Files changed (2) hide show
  1. .gitattributes +1 -1
  2. app.py +136 -17
.gitattributes CHANGED
@@ -34,4 +34,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
  *.onnx filter=lfs diff=lfs merge=lfs -text
37
-
 
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
  *.onnx filter=lfs diff=lfs merge=lfs -text
37
+ *.pt2 filter=lfs diff=lfs merge=lfs -text
app.py CHANGED
@@ -1,13 +1,44 @@
1
  import gradio as gr
2
  import onnxruntime as ort
3
  import numpy as np
 
 
4
  from char_tokenizers import GermanCharsTokenizer
5
 
 
 
 
 
 
 
 
 
 
6
  # Initialize tokenizer
7
  TOKENIZER = GermanCharsTokenizer()
8
 
 
 
 
 
 
 
 
9
  # Model paths
10
- MODELS = {
 
 
 
 
 
 
 
 
 
 
 
 
 
11
  "Caro": {
12
  "fastpitch": "onnx/caro_fastpitch.onnx",
13
  "hifigan": "onnx/caro_hifigan.onnx",
@@ -18,21 +49,83 @@ MODELS = {
18
  },
19
  }
20
 
21
- # Load models
22
- print("Loading ONNX models...")
23
- sessions = {}
24
- for voice_name, paths in MODELS.items():
25
- print(f"Loading {voice_name}...")
26
- sessions[voice_name] = {
27
- "fastpitch": ort.InferenceSession(paths["fastpitch"]),
28
- "hifigan": ort.InferenceSession(paths["hifigan"]),
29
- }
30
- print("Models loaded successfully!")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
 
32
 
33
- def synthesize_speech(text: str, voice: str, pace: float = 1.0):
 
 
34
  """
35
- Synthesize speech from text using the selected voice.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36
 
37
  Args:
38
  text: Input text to synthesize
@@ -59,11 +152,11 @@ def synthesize_speech(text: str, voice: str, pace: float = 1.0):
59
  }
60
 
61
  # Generate spectrogram with FastPitch
62
- fastpitch_session = sessions[voice]["fastpitch"]
63
  spec = fastpitch_session.run(None, inputs)[0]
64
 
65
  # Generate audio with HiFiGAN
66
- hifigan_session = sessions[voice]["hifigan"]
67
  gan_inputs = {"spec": spec}
68
  audio = hifigan_session.run(None, gan_inputs)[0]
69
 
@@ -74,14 +167,40 @@ def synthesize_speech(text: str, voice: str, pace: float = 1.0):
74
  return (sample_rate, audio_array)
75
 
76
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77
  # Create Gradio interface
78
  with gr.Blocks(title="German TTS - Caro & Karlsson") as demo:
79
  gr.Markdown(
80
- """
81
  # 🎙️ German Text-to-Speech
82
 
83
  Generate German speech using two different voices: **Caro** and **Karlsson**.
84
 
 
 
85
  Enter your German text below and select a voice to synthesize speech.
86
  """
87
  )
@@ -96,7 +215,7 @@ with gr.Blocks(title="German TTS - Caro & Karlsson") as demo:
96
  )
97
 
98
  voice_dropdown = gr.Dropdown(
99
- choices=list(MODELS.keys()), label="Voice", value="Karlsson"
100
  )
101
 
102
  pace_slider = gr.Slider(
 
1
  import gradio as gr
2
  import onnxruntime as ort
3
  import numpy as np
4
+ import torch
5
+ import torch._inductor
6
  from char_tokenizers import GermanCharsTokenizer
7
 
8
# Detect the optional HuggingFace `spaces` package, which provides the
# Zero GPU decorator (`spaces.GPU`). When it is missing we simply run
# without Zero GPU support instead of failing at import time.
HAS_SPACES = True
try:
    import spaces
except ImportError:
    HAS_SPACES = False
    print("spaces not available, running without Zero GPU support")
16
+
17
# Shared tokenizer instance used by every synthesis path.
TOKENIZER = GermanCharsTokenizer()

# Resolve the execution device once at import time.
# NOTE(review): on HF Zero GPU this assumes the `spaces` package makes
# torch report CUDA as available in the main process — confirm.
USE_GPU = torch.cuda.is_available()
if USE_GPU:
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

print(f"Using device: {DEVICE}")
print(f"Zero GPU support: {HAS_SPACES}")
26
+
27
  # Model paths
28
# Paths to the AOT-compiled model artifacts (.pt2 packages), one
# encoder/decoder/vocoder triple per voice. Used on the GPU path.
AOT_MODELS = {
    voice: {
        "encoder": f"aot_package/{stem}_fastpitch_encoder.pt2",
        "decoder": f"aot_package/{stem}_fastpitch_decoder.pt2",
        "vocoder": f"aot_package/{stem}_hifigan.pt2",
    }
    for voice, stem in (("Caro", "caro"), ("Karlsson", "karlsson"))
}
40
+
41
+ ONNX_MODELS = {
42
  "Caro": {
43
  "fastpitch": "onnx/caro_fastpitch.onnx",
44
  "hifigan": "onnx/caro_hifigan.onnx",
 
49
  },
50
  }
51
 
52
# Choose an inference backend once at import time: AOT-compiled packages
# when a GPU is visible, ONNX Runtime sessions otherwise. Exactly one of
# `aot_sessions` / `onnx_sessions` is populated; the other stays None.
aot_sessions = None
onnx_sessions = None

if USE_GPU:
    print("Loading AOT models for GPU...")
    aot_sessions = {}
    for voice_name, parts in AOT_MODELS.items():
        print(f"Loading {voice_name} AOT models...")
        # Each voice maps role -> loaded AOTInductor callable.
        aot_sessions[voice_name] = {
            role: torch._inductor.aoti_load_package(path)
            for role, path in parts.items()
        }
    print("AOT models loaded successfully!")
else:
    print("Loading ONNX models for CPU...")
    onnx_sessions = {}
    for voice_name, parts in ONNX_MODELS.items():
        print(f"Loading {voice_name} ONNX models...")
        # Each voice maps role -> ONNX Runtime session.
        onnx_sessions[voice_name] = {
            role: ort.InferenceSession(path) for role, path in parts.items()
        }
    print("ONNX models loaded successfully!")
76
 
77
 
78
def synthesize_speech_aot(
    text: str, voice: str, pace: float = 1.0, pitch_shift: float = 0.0
):
    """Synthesize German speech with the AOT-compiled (GPU) models.

    Args:
        text: Text to speak; blank/whitespace-only input yields None.
        voice: Key into ``aot_sessions`` ("Caro" or "Karlsson").
        pace: Speaking-rate multiplier (1.0 = normal speed).
        pitch_shift: Constant pitch offset applied to every token
            (0.0 = unmodified).

    Returns:
        ``(sample_rate, audio_array)`` tuple, or ``None`` for empty input.
    """
    # Guard clause: nothing to synthesize.
    if not text.strip():
        return None

    # Character token ids, shaped (1, n_tokens), on the target device.
    token_ids = torch.tensor([TOKENIZER.encode(text)], dtype=torch.int64).to(DEVICE)

    # Per-token control tensors: a constant pitch offset and a constant pace.
    pitch = torch.full_like(token_ids, pitch_shift, dtype=torch.float32)
    pace_t = torch.full_like(token_ids, pace, dtype=torch.float32)

    models = aot_sessions[voice]
    with torch.inference_mode():
        # FastPitch encoder: length-regulated features + lengths + speaker emb.
        regulated, out_lens, speaker = models["encoder"](token_ids, pitch, pace_t)
        # FastPitch decoder: mel-spectrogram.
        mel = models["decoder"](regulated, out_lens, speaker)
        # HiFi-GAN vocoder: waveform.
        wave = models["vocoder"](mel)

    # NOTE(review): 44100 Hz assumed to match the models' training rate — confirm.
    sample_rate = 44100
    return (sample_rate, wave.squeeze().cpu().numpy())
124
+
125
+
126
+ def synthesize_speech_onnx(text: str, voice: str, pace: float = 1.0):
127
+ """
128
+ Synthesize speech using ONNX models (CPU).
129
 
130
  Args:
131
  text: Input text to synthesize
 
152
  }
153
 
154
  # Generate spectrogram with FastPitch
155
+ fastpitch_session = onnx_sessions[voice]["fastpitch"]
156
  spec = fastpitch_session.run(None, inputs)[0]
157
 
158
  # Generate audio with HiFiGAN
159
+ hifigan_session = onnx_sessions[voice]["hifigan"]
160
  gan_inputs = {"spec": spec}
161
  audio = hifigan_session.run(None, gan_inputs)[0]
162
 
 
167
  return (sample_rate, audio_array)
168
 
169
 
170
def synthesize_speech(
    text: str, voice: str, pace: float = 1.0, pitch_shift: float = 0.0
):
    """
    Synthesize speech from text using the selected voice.
    Dispatches to the AOT backend on GPU or the ONNX backend on CPU.

    Args:
        text: Input text to synthesize
        voice: Voice to use (Caro or Karlsson)
        pace: Speaking rate (1.0 is normal, <1.0 is slower, >1.0 is faster)
        pitch_shift: Constant pitch offset (0.0 = no change); only honoured
            by the AOT (GPU) backend — the ONNX path does not support it.

    Returns:
        Tuple of (sample_rate, audio_array), or None for empty input
    """
    if USE_GPU:
        # Forward the pitch control that the AOT models already expose;
        # previously it was silently dropped by this dispatcher.
        return synthesize_speech_aot(text, voice, pace, pitch_shift)
    return synthesize_speech_onnx(text, voice, pace)


# Wrap with the Zero GPU decorator so HF Spaces attaches a GPU per call.
# NOTE(review): this relies on `spaces` making CUDA report available at
# import time on Zero GPU hardware, otherwise USE_GPU is False and the
# decorator is never applied — confirm on the target Space.
if HAS_SPACES and USE_GPU:
    synthesize_speech = spaces.GPU(synthesize_speech)
192
+
193
+
194
  # Create Gradio interface
195
  with gr.Blocks(title="German TTS - Caro & Karlsson") as demo:
196
  gr.Markdown(
197
+ f"""
198
  # 🎙️ German Text-to-Speech
199
 
200
  Generate German speech using two different voices: **Caro** and **Karlsson**.
201
 
202
+ **Running on:** {DEVICE.upper()} {"(AOT models)" if USE_GPU else "(ONNX models)"}
203
+
204
  Enter your German text below and select a voice to synthesize speech.
205
  """
206
  )
 
215
  )
216
 
217
  voice_dropdown = gr.Dropdown(
218
+ choices=["Caro", "Karlsson"], label="Voice", value="Karlsson"
219
  )
220
 
221
  pace_slider = gr.Slider(