Spaces:

Warholt
/

CaroTTS-DE

Running on Zero

App Files Files Community

Warholt committed 18 days ago

Commit

4001385

1 Parent(s): 9ce51ee

add zero gpu supported inference

Browse files

Files changed (2) hide show

.gitattributes +1 -1
app.py +136 -17

.gitattributes CHANGED Viewed

@@ -34,4 +34,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
 *.onnx filter=lfs diff=lfs merge=lfs -text

 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
 *.onnx filter=lfs diff=lfs merge=lfs -text
+*.pt2 filter=lfs diff=lfs merge=lfs -text

app.py CHANGED Viewed

@@ -1,13 +1,44 @@
 import gradio as gr
 import onnxruntime as ort
 import numpy as np
 from char_tokenizers import GermanCharsTokenizer
 # Initialize tokenizer
 TOKENIZER = GermanCharsTokenizer()
 # Model paths
-MODELS = {
     "Caro": {
         "fastpitch": "onnx/caro_fastpitch.onnx",
         "hifigan": "onnx/caro_hifigan.onnx",
@@ -18,21 +49,83 @@ MODELS = {
     },
 }
-# Load models
-print("Loading ONNX models...")
-sessions = {}
-for voice_name, paths in MODELS.items():
-    print(f"Loading {voice_name}...")
-    sessions[voice_name] = {
-        "fastpitch": ort.InferenceSession(paths["fastpitch"]),
-        "hifigan": ort.InferenceSession(paths["hifigan"]),
-    }
-print("Models loaded successfully!")
-def synthesize_speech(text: str, voice: str, pace: float = 1.0):
     """
-    Synthesize speech from text using the selected voice.
     Args:
         text: Input text to synthesize
@@ -59,11 +152,11 @@ def synthesize_speech(text: str, voice: str, pace: float = 1.0):
     }
     # Generate spectrogram with FastPitch
-    fastpitch_session = sessions[voice]["fastpitch"]
     spec = fastpitch_session.run(None, inputs)[0]
     # Generate audio with HiFiGAN
-    hifigan_session = sessions[voice]["hifigan"]
     gan_inputs = {"spec": spec}
     audio = hifigan_session.run(None, gan_inputs)[0]
@@ -74,14 +167,40 @@ def synthesize_speech(text: str, voice: str, pace: float = 1.0):
     return (sample_rate, audio_array)
 # Create Gradio interface
 with gr.Blocks(title="German TTS - Caro & Karlsson") as demo:
     gr.Markdown(
-        """
         # 🎙️ German Text-to-Speech
         Generate German speech using two different voices: **Caro** and **Karlsson**.
         Enter your German text below and select a voice to synthesize speech.
         """
     )
@@ -96,7 +215,7 @@ with gr.Blocks(title="German TTS - Caro & Karlsson") as demo:
             )
             voice_dropdown = gr.Dropdown(
-                choices=list(MODELS.keys()), label="Voice", value="Karlsson"
             )
             pace_slider = gr.Slider(

 import gradio as gr
 import onnxruntime as ort
 import numpy as np
+import torch
+import torch._inductor
 from char_tokenizers import GermanCharsTokenizer
+# Try to import spaces for Zero GPU support.
+# HAS_SPACES records whether the optional package is importable; it is used
+# later to decide whether the synthesis entry point gets wrapped in spaces.GPU.
+# NOTE(review): torch is imported earlier in this file, before spaces. The
+# Hugging Face ZeroGPU guidance reportedly asks for spaces to be imported ahead
+# of CUDA-touching packages - confirm the import order is acceptable.
+try:
+    import spaces
+    HAS_SPACES = True
+except ImportError:
+    # Optional dependency: fall back to plain execution instead of failing.
+    HAS_SPACES = False
+    print("spaces not available, running without Zero GPU support")
 # Initialize tokenizer
 TOKENIZER = GermanCharsTokenizer()
+# Check if CUDA is available.
+# USE_GPU is evaluated once at import time and selects both the model format
+# that gets loaded and the synthesis backend used for every request.
+USE_GPU = torch.cuda.is_available()
+DEVICE = "cuda" if USE_GPU else "cpu"
+print(f"Using device: {DEVICE}")
+print(f"Zero GPU support: {HAS_SPACES}")
 # Model paths
+# Per-voice paths to the AOT-compiled packages used on the GPU path.
+# Each voice has three stages (encoder, decoder, vocoder); every path is
+# handed to torch._inductor.aoti_load_package when USE_GPU is true.
+AOT_MODELS = {
+    "Caro": {
+        "encoder": "aot_package/caro_fastpitch_encoder.pt2",
+        "decoder": "aot_package/caro_fastpitch_decoder.pt2",
+        "vocoder": "aot_package/caro_hifigan.pt2",
+    },
+    "Karlsson": {
+        "encoder": "aot_package/karlsson_fastpitch_encoder.pt2",
+        "decoder": "aot_package/karlsson_fastpitch_decoder.pt2",
+        "vocoder": "aot_package/karlsson_hifigan.pt2",
+    },
+}
+ONNX_MODELS = {
     "Caro": {
         "fastpitch": "onnx/caro_fastpitch.onnx",
         "hifigan": "onnx/caro_hifigan.onnx",
     },
 }
+# Load exactly one model family at import time: AOT packages when CUDA was
+# detected, ONNX sessions otherwise. The handle for the unused family stays None.
+aot_sessions = None
+onnx_sessions = None
+if USE_GPU:
+    print("Loading AOT models for GPU...")
+    aot_sessions = {}
+    for voice_name, paths in AOT_MODELS.items():
+        print(f"Loading {voice_name} AOT models...")
+        # Stages are loaded in pipeline order: encoder, decoder, vocoder.
+        aot_sessions[voice_name] = {
+            stage: torch._inductor.aoti_load_package(paths[stage])
+            for stage in ("encoder", "decoder", "vocoder")
+        }
+    print("AOT models loaded successfully!")
+else:
+    print("Loading ONNX models for CPU...")
+    onnx_sessions = {}
+    for voice_name, paths in ONNX_MODELS.items():
+        print(f"Loading {voice_name} ONNX models...")
+        # One inference session per model file, FastPitch first, then HiFiGAN.
+        onnx_sessions[voice_name] = {
+            stage: ort.InferenceSession(paths[stage])
+            for stage in ("fastpitch", "hifigan")
+        }
+    print("ONNX models loaded successfully!")
+def synthesize_speech_aot(
+    text: str, voice: str, pace: float = 1.0, pitch_shift: float = 0.0
+):
+    """
+    Synthesize speech using AOT compiled models (GPU).
+
+    Runs the three stages stored in ``aot_sessions[voice]`` in order:
+    encoder, decoder, vocoder.
+
+    Args:
+        text: Input text to synthesize
+        voice: Voice to use (Caro or Karlsson)
+        pace: Speaking rate (1.0 is normal, <1.0 is slower, >1.0 is faster)
+        pitch_shift: Pitch adjustment (0.0 = no change)
+    Returns:
+        Tuple of (sample_rate, audio_array), or None when ``text`` is None,
+        empty, or whitespace-only.
+    Raises:
+        KeyError: If ``voice`` has no entry in ``aot_sessions``.
+    """
+    # Treat a missing value like blank input instead of failing on None.strip().
+    if text is None or not text.strip():
+        return None
+    # Tokenize text and build a batch of one directly on the target device
+    tokens = TOKENIZER.encode(text)
+    tokens_tensor = torch.tensor([tokens], dtype=torch.int64, device=DEVICE)
+    # Prepare control parameters: one float32 value per token position
+    pitch_tensor = torch.full_like(tokens_tensor, pitch_shift, dtype=torch.float32)
+    pace_tensor = torch.full_like(tokens_tensor, pace, dtype=torch.float32)
+    stages = aot_sessions[voice]
+    with torch.inference_mode():
+        # Run encoder to get latent representation and length
+        len_regulated, dec_lens, spk_emb = stages["encoder"](
+            tokens_tensor, pitch_tensor, pace_tensor
+        )
+        # Run decoder to get mel-spectrogram
+        spec = stages["decoder"](len_regulated, dec_lens, spk_emb)
+        # Run vocoder to generate audio waveform
+        audio = stages["vocoder"](spec)
+    # Convert to numpy and return.
+    # NOTE(review): 44100 is hard-coded; presumably the vocoder's native
+    # sample rate - confirm against the exported model.
+    sample_rate = 44100
+    audio_array = audio.squeeze().cpu().numpy()
+    return (sample_rate, audio_array)
+def synthesize_speech_onnx(text: str, voice: str, pace: float = 1.0):
+    """
+    Synthesize speech using ONNX models (CPU).
     Args:
         text: Input text to synthesize
     }
     # Generate spectrogram with FastPitch
+    fastpitch_session = onnx_sessions[voice]["fastpitch"]
     spec = fastpitch_session.run(None, inputs)[0]
     # Generate audio with HiFiGAN
+    hifigan_session = onnx_sessions[voice]["hifigan"]
     gan_inputs = {"spec": spec}
     audio = hifigan_session.run(None, gan_inputs)[0]
     return (sample_rate, audio_array)
+def synthesize_speech(text: str, voice: str, pace: float = 1.0):
+    """
+    Route a synthesis request to the backend that matches the active device.
+
+    The AOT pipeline handles the request when ``USE_GPU`` is true; otherwise
+    the ONNX pipeline does. Pitch is left at the AOT backend's default.
+
+    Args:
+        text: Input text to synthesize
+        voice: Voice to use (Caro or Karlsson)
+        pace: Speaking rate (1.0 is normal, <1.0 is slower, >1.0 is faster)
+    Returns:
+        Tuple of (sample_rate, audio_array)
+    """
+    backend = synthesize_speech_aot if USE_GPU else synthesize_speech_onnx
+    return backend(text, voice, pace)
+# Wrap the entry point with the Zero GPU decorator, but only when CUDA was
+# detected and the spaces package imported successfully.
+if USE_GPU and HAS_SPACES:
+    synthesize_speech = spaces.GPU(synthesize_speech)
 # Create Gradio interface
 with gr.Blocks(title="German TTS - Caro & Karlsson") as demo:
     gr.Markdown(
+        f"""
         # 🎙️ German Text-to-Speech
         Generate German speech using two different voices: **Caro** and **Karlsson**.
+        **Running on:** {DEVICE.upper()} {"(AOT models)" if USE_GPU else "(ONNX models)"}
         Enter your German text below and select a voice to synthesize speech.
         """
     )
             )
             voice_dropdown = gr.Dropdown(
+                choices=["Caro", "Karlsson"], label="Voice", value="Karlsson"
             )
             pace_slider = gr.Slider(