Spaces:

Warholt
/

CaroTTS-DE

Running on Zero

App Files Files Community

CaroTTS-DE / app.py

Warholt

Add app, track models via lfs

a1e382b 24 days ago

raw

history blame

4.26 kB

	import gradio as gr
	import onnxruntime as ort
	import numpy as np
	from char_tokenizers import GermanCharsTokenizer

	# Single tokenizer instance shared by every synthesis request.
	TOKENIZER = GermanCharsTokenizer()

	# ONNX model files, keyed first by voice name and then by pipeline stage.
	MODELS = {
	    "Caro": {
	        "fastpitch": "onnx/caro_fastpitch.onnx",
	        "hifigan": "onnx/caro_hifigan.onnx",
	    },
	    "Karlsson": {
	        "fastpitch": "onnx/karlsson_fastpitch.onnx",
	        "hifigan": "onnx/karlsson_hifigan.onnx",
	    },
	}

	# Create the inference sessions once at import time so that requests only
	# pay for inference. `sessions` mirrors the layout of MODELS:
	# sessions[voice][stage] -> ort.InferenceSession.
	print("Loading ONNX models...")
	sessions = {}
	for voice_name, paths in MODELS.items():
	    print(f"Loading {voice_name}...")
	    # The tuple fixes the load order: FastPitch first, then HiFiGAN.
	    sessions[voice_name] = {
	        stage: ort.InferenceSession(paths[stage])
	        for stage in ("fastpitch", "hifigan")
	    }
	print("Models loaded successfully!")


	def synthesize_speech(text: str, voice: str, pace: float = 1.0):
	    """
	    Synthesize speech from text using the selected voice.

	    The text is tokenized with ``TOKENIZER``, converted to a spectrogram by
	    the voice's FastPitch session, and then converted to a waveform by the
	    voice's HiFiGAN session.

	    Args:
	        text: Input text to synthesize. ``None``, empty, or whitespace-only
	            text produces no audio.
	        voice: Voice to use (Caro or Karlsson); must be a key of ``sessions``.
	        pace: Speaking rate (1.0 is normal, <1.0 is slower, >1.0 is faster).
	            Must be greater than zero.

	    Returns:
	        Tuple of (sample_rate, audio_array), or None when there is nothing
	        to synthesize.

	    Raises:
	        gr.Error: If ``pace`` is not a positive number or ``voice`` is not
	            one of the loaded voices.
	    """
	    # Callers outside the UI may hand over None instead of "".
	    if not text or not text.strip():
	        return None

	    # `not pace > 0` also rejects NaN, which `pace <= 0` would let through.
	    if pace is None or not pace > 0:
	        raise gr.Error("Speaking rate must be greater than 0.")

	    # Report an unknown voice readably instead of leaking a bare KeyError.
	    voice_sessions = sessions.get(voice)
	    if voice_sessions is None:
	        known = ", ".join(sessions)
	        raise gr.Error(f"Unknown voice {voice!r}. Available voices: {known}")

	    # Tokenize text
	    tokens = TOKENIZER.encode(text)
	    n_tokens = len(tokens)
	    if n_tokens == 0:
	        # Defensive: nothing to feed the model if the tokenizer yields no
	        # tokens for this text.
	        return None

	    # Prepare inputs for FastPitch: one batch row, one pace and one pitch
	    # value per token. Pitch is kept at 0.0 for every token.
	    inputs = {
	        "text": np.array([tokens], dtype=np.int64),
	        "pace": np.full((1, n_tokens), pace, dtype=np.float32),
	        "pitch": np.zeros((1, n_tokens), dtype=np.float32),
	    }

	    # Generate spectrogram with FastPitch (first model output).
	    spec = voice_sessions["fastpitch"].run(None, inputs)[0]

	    # Generate audio with HiFiGAN (first model output).
	    audio = voice_sessions["hifigan"].run(None, {"spec": spec})[0]

	    # Sample rate reported alongside the audio.
	    # NOTE(review): presumably the rate the vocoder was trained at - confirm.
	    sample_rate = 44100

	    # Drop singleton dimensions so only the waveform samples remain.
	    audio_array = audio.squeeze()

	    return (sample_rate, audio_array)


	# Example prompts shown under the form; each row is
	# [text, voice, speaking rate], matching the order of the input components.
	example_rows = [
	    ["Guten Tag! Wie geht es Ihnen heute?", "Caro", 1.0],
	    [
	        "Die Wissenschaft hat in den letzten Jahren große Fortschritte gemacht.",
	        "Karlsson",
	        1.0,
	    ],
	    [
	        "Es war einmal ein kleines Mädchen, das durch den Wald spazierte.",
	        "Caro",
	        0.9,
	    ],
	    [
	        "Berlin ist die Hauptstadt und zugleich ein Land der Bundesrepublik Deutschland.",
	        "Karlsson",
	        1.0,
	    ],
	]

	# Assemble the Gradio UI: inputs in the left column, audio player on the right.
	with gr.Blocks(title="German TTS - Caro & Karlsson") as demo:
	    gr.Markdown(
	        """
	# 🎙️ German Text-to-Speech

	Generate German speech using two different voices: Caro and Karlsson.

	Enter your German text below and select a voice to synthesize speech.
	"""
	    )

	    with gr.Row():
	        # Left column: everything the user fills in before generating.
	        with gr.Column():
	            text_input = gr.Textbox(
	                value="Hallo! Willkommen zur deutschen Sprachsynthese.",
	                label="Text to synthesize",
	                placeholder="Geben Sie hier Ihren deutschen Text ein...",
	                lines=5,
	            )

	            # The selectable voices are exactly the voices listed in MODELS.
	            voice_dropdown = gr.Dropdown(
	                label="Voice",
	                choices=list(MODELS),
	                value="Karlsson",
	            )

	            pace_slider = gr.Slider(
	                label="Speaking Rate",
	                info="1.0 is normal speed",
	                minimum=0.5,
	                maximum=2.0,
	                step=0.1,
	                value=1.0,
	            )

	            generate_btn = gr.Button("Generate Speech 🔊", variant="primary")

	        # Right column: playback of the synthesized result.
	        with gr.Column():
	            audio_output = gr.Audio(label="Generated Audio", type="numpy")

	    # Examples are not pre-computed (cache_examples=False).
	    gr.Examples(
	        examples=example_rows,
	        fn=synthesize_speech,
	        inputs=[text_input, voice_dropdown, pace_slider],
	        outputs=audio_output,
	        cache_examples=False,
	    )

	    # Run synthesis with the current form values when the button is pressed.
	    generate_btn.click(
	        fn=synthesize_speech,
	        inputs=[text_input, voice_dropdown, pace_slider],
	        outputs=audio_output,
	    )

	# Start the app only when this file is executed as a script.
	if __name__ == "__main__":
	    demo.launch()