# CaroTTS-DE / app.py
# Warholt's picture
# Add app, track models via lfs
# a1e382b
# raw
# history blame
# 4.26 kB
import gradio as gr
import onnxruntime as ort
import numpy as np
from char_tokenizers import GermanCharsTokenizer
# Module-wide tokenizer instance, constructed once at import time.
# synthesize_speech() calls its encode() method on the input text.
TOKENIZER = GermanCharsTokenizer()
# ONNX file paths, keyed first by voice name and then by model role
# ("fastpitch" or "hifigan"). The voice names double as the dropdown choices.
MODELS = dict(
    Caro=dict(
        fastpitch="onnx/caro_fastpitch.onnx",
        hifigan="onnx/caro_hifigan.onnx",
    ),
    Karlsson=dict(
        fastpitch="onnx/karlsson_fastpitch.onnx",
        hifigan="onnx/karlsson_hifigan.onnx",
    ),
)
# Create every ONNX Runtime session once, at import time, so that
# synthesize_speech() can look them up by voice instead of loading per call.
print("Loading ONNX models...")
sessions = {}
for voice_name, paths in MODELS.items():
    print(f"Loading {voice_name}...")
    # One session per model role; FastPitch is loaded before HiFiGAN.
    sessions[voice_name] = {
        stage: ort.InferenceSession(paths[stage])
        for stage in ("fastpitch", "hifigan")
    }
print("Models loaded successfully!")
def synthesize_speech(text: str, voice: str, pace: float = 1.0):
    """
    Synthesize speech from text using the selected voice.

    The text is tokenized, the tokens are run through the voice's FastPitch
    session to obtain a spectrogram, and the spectrogram is run through the
    voice's HiFiGAN session to obtain the waveform.

    Args:
        text: Input text to synthesize
        voice: Voice to use (Caro or Karlsson); must be a key of ``sessions``
        pace: Speaking rate (1.0 is normal, <1.0 is slower, >1.0 is faster)

    Returns:
        Tuple of (sample_rate, audio_array), or None when there is nothing to
        synthesize: the text is None, empty or whitespace-only, or the
        tokenizer yields no tokens for it.

    Raises:
        KeyError: If ``voice`` is not a key of ``sessions``.
    """
    # Nothing to say: also covers a missing (None) value, which would
    # otherwise fail on .strip().
    if not text or not text.strip():
        return None

    # Tokenize text
    tokens = TOKENIZER.encode(text)
    num_tokens = len(tokens)
    if num_tokens == 0:
        # Do not hand a zero-length sequence to the models.
        return None

    # Prepare inputs for FastPitch: a batch of one sequence, with one pace
    # value and one pitch value per token.
    inputs = {
        "text": np.array([tokens], dtype=np.int64),
        "pace": np.full((1, num_tokens), pace, dtype=np.float32),
        "pitch": np.zeros((1, num_tokens), dtype=np.float32),  # Keep pitch at 0.0
    }

    voice_sessions = sessions[voice]

    # Generate spectrogram with FastPitch (first output of the session)
    spec = voice_sessions["fastpitch"].run(None, inputs)[0]

    # Generate audio with HiFiGAN (first output of the session)
    audio = voice_sessions["hifigan"].run(None, {"spec": spec})[0]

    # Hard-coded output rate; NOTE(review): assumed to match the rate the
    # HiFiGAN models were exported with -- confirm.
    sample_rate = 44100

    # Drop the singleton dimensions so only the sample axis remains.
    audio_array = audio.squeeze()
    return (sample_rate, audio_array)
# Build the Gradio UI: inputs and the trigger button in one column, the audio
# player in the other, followed by clickable examples.
with gr.Blocks(title="German TTS - Caro & Karlsson") as demo:
    # Page header, assembled line by line.
    gr.Markdown(
        "\n"
        "# 🎙️ German Text-to-Speech\n"
        "Generate German speech using two different voices: **Caro** and **Karlsson**.\n"
        "Enter your German text below and select a voice to synthesize speech.\n"
    )

    with gr.Row():
        # First column: everything the user can set, plus the button.
        with gr.Column():
            text_input = gr.Textbox(
                value="Hallo! Willkommen zur deutschen Sprachsynthese.",
                placeholder="Geben Sie hier Ihren deutschen Text ein...",
                label="Text to synthesize",
                lines=5,
            )
            voice_dropdown = gr.Dropdown(
                label="Voice",
                choices=list(MODELS),
                value="Karlsson",
            )
            pace_slider = gr.Slider(
                label="Speaking Rate",
                info="1.0 is normal speed",
                minimum=0.5,
                maximum=2.0,
                step=0.1,
                value=1.0,
            )
            generate_btn = gr.Button("Generate Speech 🔊", variant="primary")

        # Second column: the synthesized result.
        with gr.Column():
            audio_output = gr.Audio(label="Generated Audio", type="numpy")

    # Each example row is [text, voice, pace], matching the order of `inputs`.
    # Example outputs are not cached (cache_examples=False).
    gr.Examples(
        fn=synthesize_speech,
        inputs=[text_input, voice_dropdown, pace_slider],
        outputs=audio_output,
        cache_examples=False,
        examples=[
            ["Guten Tag! Wie geht es Ihnen heute?", "Caro", 1.0],
            [
                "Die Wissenschaft hat in den letzten Jahren große Fortschritte gemacht.",
                "Karlsson",
                1.0,
            ],
            [
                "Es war einmal ein kleines Mädchen, das durch den Wald spazierte.",
                "Caro",
                0.9,
            ],
            [
                "Berlin ist die Hauptstadt und zugleich ein Land der Bundesrepublik Deutschland.",
                "Karlsson",
                1.0,
            ],
        ],
    )

    # Clicking the button runs synthesis with the current input values and
    # sends the result to the audio player.
    generate_btn.click(
        fn=synthesize_speech,
        inputs=[text_input, voice_dropdown, pace_slider],
        outputs=audio_output,
    )
# Start the Gradio server only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    demo.launch()