import gradio as gr
import onnxruntime as ort
import numpy as np

from char_tokenizers import GermanCharsTokenizer

# Initialize tokenizer
TOKENIZER = GermanCharsTokenizer()

# Model paths
MODELS = {
    "Caro": {
        "fastpitch": "onnx/caro_fastpitch.onnx",
        "hifigan": "onnx/caro_hifigan.onnx",
    },
    "Karlsson": {
        "fastpitch": "onnx/karlsson_fastpitch.onnx",
        "hifigan": "onnx/karlsson_hifigan.onnx",
    },
}

# Load models
print("Loading ONNX models...")
sessions = {}
for voice_name, paths in MODELS.items():
    print(f"Loading {voice_name}...")
    sessions[voice_name] = {
        "fastpitch": ort.InferenceSession(paths["fastpitch"]),
        "hifigan": ort.InferenceSession(paths["hifigan"]),
    }
print("Models loaded successfully!")

def synthesize_speech(text: str, voice: str, pace: float = 1.0):
    """
    Synthesize speech from text using the selected voice.

    Args:
        text: Input text to synthesize
        voice: Voice to use (Caro or Karlsson)
        pace: Speaking rate (1.0 is normal, <1.0 is slower, >1.0 is faster)

    Returns:
        Tuple of (sample_rate, audio_array)
    """
    if not text.strip():
        return None

    # Tokenize text
    tokens = TOKENIZER.encode(text)

    # Prepare inputs for FastPitch
    paces = np.zeros(len(tokens), dtype=np.float32) + pace
    pitches = np.zeros(len(tokens), dtype=np.float32)  # Keep pitch at 0.0

    inputs = {
        "text": np.array([tokens], dtype=np.int64),
        "pace": np.array([paces], dtype=np.float32),
        "pitch": np.array([pitches], dtype=np.float32),
    }

    # Generate spectrogram with FastPitch
    fastpitch_session = sessions[voice]["fastpitch"]
    spec = fastpitch_session.run(None, inputs)[0]

    # Generate audio with HiFiGAN
    hifigan_session = sessions[voice]["hifigan"]
    gan_inputs = {"spec": spec}
    audio = hifigan_session.run(None, gan_inputs)[0]

    # Return sample rate and audio
    sample_rate = 44100
    audio_array = audio.squeeze()
    return (sample_rate, audio_array)

# Create Gradio interface
with gr.Blocks(title="German TTS - Caro & Karlsson") as demo:
    gr.Markdown(
        """
        # 🎙️ German Text-to-Speech

        Generate German speech using two different voices: **Caro** and **Karlsson**.
        Enter your German text below and select a voice to synthesize speech.
        """
    )

    with gr.Row():
        with gr.Column():
            text_input = gr.Textbox(
                label="Text to synthesize",
                placeholder="Geben Sie hier Ihren deutschen Text ein...",
                lines=5,
                value="Hallo! Willkommen zur deutschen Sprachsynthese.",
            )
            voice_dropdown = gr.Dropdown(
                choices=list(MODELS.keys()), label="Voice", value="Karlsson"
            )
            pace_slider = gr.Slider(
                minimum=0.5,
                maximum=2.0,
                value=1.0,
                step=0.1,
                label="Speaking Rate",
                info="1.0 is normal speed",
            )
            generate_btn = gr.Button("Generate Speech 🔊", variant="primary")

        with gr.Column():
            audio_output = gr.Audio(label="Generated Audio", type="numpy")
    gr.Examples(
        examples=[
            ["Guten Tag! Wie geht es Ihnen heute?", "Caro", 1.0],
            [
                "Die Wissenschaft hat in den letzten Jahren große Fortschritte gemacht.",
                "Karlsson",
                1.0,
            ],
            [
                "Es war einmal ein kleines Mädchen, das durch den Wald spazierte.",
                "Caro",
                0.9,
            ],
            [
                "Berlin ist die Hauptstadt und zugleich ein Land der Bundesrepublik Deutschland.",
                "Karlsson",
                1.0,
            ],
        ],
        inputs=[text_input, voice_dropdown, pace_slider],
        outputs=audio_output,
        fn=synthesize_speech,
        cache_examples=False,
    )
    generate_btn.click(
        fn=synthesize_speech,
        inputs=[text_input, voice_dropdown, pace_slider],
        outputs=audio_output,
    )

if __name__ == "__main__":
    demo.launch()
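# Quick local test (sketch): the same FastPitch + HiFiGAN pipeline can be
# exercised without the Gradio UI. This assumes the file is saved as app.py,
# the ONNX models listed above are present, and scipy is installed (scipy is
# not otherwise required by the app):
#
#     from app import synthesize_speech
#     from scipy.io import wavfile
#
#     rate, audio = synthesize_speech("Guten Tag! Wie geht es Ihnen heute?", "Karlsson", pace=1.0)
#     wavfile.write("test.wav", rate, audio)  # float32 samples are written as a float WAV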