import gradio as gr
import torch
from soprano import SopranoTTS
import numpy as np
import socket
import time
import spaces
# Select the compute device once at import time: CUDA when PyTorch reports a
# usable GPU, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

# Module-level slot for the SopranoTTS instance; it stays None until
# load_model() constructs the model.
model = None
# Lazily initialize the shared model
@spaces.GPU
def load_model():
    """Return the shared SopranoTTS instance, constructing it on first use.

    The instance is kept in the module-level ``model`` variable, so a later
    call that finds it already set returns the existing object instead of
    building a new one.
    """
    global model
    if model is not None:
        return model

    # Construction settings for the TTS engine, gathered in one place.
    settings = {
        "backend": "auto",
        "device": DEVICE,
        "cache_size_mb": 10000,
        "decoder_batch_size": 8,
    }
    model = SopranoTTS(**settings)
    return model
# Sample rate in Hz: returned alongside the generated audio and used to
# convert the sample count into a duration.
SAMPLE_RATE = 32_000
@spaces.GPU
def generate_speech(
    text: str,
    temperature: float = 0.3,
    top_p: float = 0.95,
    repetition_penalty: float = 1.2,
):
    """
    Runs Soprano text-to-speech model with the given input text and sampling parameters.

    This is a generator: it yields ``(None, status)`` progress updates while the
    model is loaded and run, followed by one final result (or an error status).

    Args:
        text: The text to synthesize. Empty or whitespace-only input is rejected with a status message.
        temperature: Sampling temperature forwarded to the model.
        top_p: Top-p sampling value forwarded to the model.
        repetition_penalty: Repetition penalty forwarded to the model.

    Returns:
        ((sr, audio), status) where sr is the sample rate (default 32000), audio is the output audio as an np.ndarray, and status is the displayed output text.
    """
    # Guard against None as well as blank input so the caller always gets a
    # friendly status instead of an AttributeError.
    if not text or not text.strip():
        yield None, "Please enter some text to generate speech."
        return

    # Best-effort server-side log of the first line of the request. A console
    # that cannot encode the text, or a closed/broken stdout, must not abort
    # synthesis, but nothing else (e.g. KeyboardInterrupt) should be swallowed.
    try:
        print(text.split('\n')[0])
    except (OSError, ValueError):
        pass

    try:
        yield None, "⏳ Loading model..."
        # Local name differs from the module-level ``model`` to avoid shadowing it.
        tts = load_model()

        yield None, "⏳ Generating audio..."
        start_time = time.perf_counter()
        audio = tts.infer(
            text,
            temperature=temperature,
            top_p=top_p,
            repetition_penalty=repetition_penalty,
        )
        gen_time = time.perf_counter() - start_time

        # detach() and float() are defensive: Tensor.numpy() refuses tensors that
        # require grad and dtypes NumPy cannot represent (e.g. bfloat16).
        audio_np = audio.detach().cpu().float().numpy()
        # Clamp to [-1, 1] before scaling so out-of-range samples saturate
        # instead of overflowing int16 during the cast.
        audio_int16 = (np.clip(audio_np, -1.0, 1.0) * 32767).astype(np.int16)

        audio_seconds = len(audio_np) / SAMPLE_RATE
        # Realtime factor: seconds of audio produced per second of compute.
        rtf = audio_seconds / gen_time if gen_time > 0 else float("inf")
        status = (
            f"✓ Generated {audio_seconds:.2f} s audio | "
            f"Generation time: {gen_time:.3f} s "
            f"({rtf:.2f}x realtime)"
        )
        yield (SAMPLE_RATE, audio_int16), status
    except Exception as e:
        # UI boundary: report any failure in the status box instead of raising.
        yield None, f"✗ Error: {e}"
# Create Gradio interface
# NOTE(review): SOURCE indentation was lost; the nesting below is reconstructed
# from the order of the components — confirm against the intended layout.
with gr.Blocks(title="Soprano TTS") as demo:
    # Page header: the device in use plus project links.
    gr.Markdown(
        f"""
# 🗣️ Soprano TTS
**Running on: {DEVICE.upper()}**
**GitHub:** https://github.com/ekwek1/soprano
**Model Weights:** https://huggingface.co/ekwek/Soprano-1.1-80M
**Model Demo:** https://huggingface.co/spaces/ekwek/Soprano-TTS
"""
    )

    with gr.Row():
        # Left column: text entry, sampling controls and the trigger button.
        with gr.Column(scale=2):
            text_input = gr.Textbox(
                value="Soprano is an extremely lightweight text to speech model designed to produce highly realistic speech at unprecedented speed.",
                label="Text to Synthesize",
                placeholder="Enter text here...",
                lines=5,
                max_lines=10,
            )
            with gr.Accordion("Advanced Settings", open=False):
                temperature = gr.Slider(
                    label="Temperature",
                    value=0.0,
                    minimum=0.0,
                    maximum=1.0,
                    step=0.05,
                )
                top_p = gr.Slider(
                    label="Top P",
                    value=0.95,
                    minimum=0.5,
                    maximum=1.0,
                    step=0.05,
                )
                repetition_penalty = gr.Slider(
                    label="Repetition Penalty",
                    value=1.2,
                    minimum=1.0,
                    maximum=2.0,
                    step=0.1,
                )
            generate_btn = gr.Button("Generate Speech", variant="primary", size="lg")

        # Right column: audio player and status read-out.
        with gr.Column(scale=1):
            audio_output = gr.Audio(
                label="Generated Speech",
                type="numpy",
                autoplay=True,
            )
            status_output = gr.Textbox(
                label="Status",
                interactive=False,
                lines=3,
                max_lines=10,
            )

    # The example table and the button feed the same four components, in the
    # order generate_speech expects its arguments.
    sampling_inputs = [text_input, temperature, top_p, repetition_penalty]

    # Each row: text, temperature, top_p, repetition_penalty.
    example_rows = [
        ["Soprano is an extremely lightweight text to speech model.", 0.0, 0.95, 1.2],
        ["Artificial intelligence is transforming the world.", 0.0, 0.90, 1.2],
        ["I'm so excited, I can't even wait!", 0.0, 0.95, 1.2],
        ["Why don't you go ahead and try it?", 0.0, 0.95, 1.2],
    ]
    gr.Examples(
        examples=example_rows,
        inputs=sampling_inputs,
        label="Example Prompts",
    )

    generate_btn.click(
        fn=generate_speech,
        inputs=sampling_inputs,
        outputs=[audio_output, status_output],
    )

    # Static usage guidance shown below the controls.
    gr.Markdown(
        """
### Usage tips:
- Note: Soprano is currently **English-only**. Other languages are not guaranteed to work.
- Soprano works best when each sentence is between 2 and 30 seconds long.
- Although Soprano recognizes numbers and some special characters, it occasionally mispronounces them.
Best results can be achieved by converting these into their phonetic form.
(1+1 -> one plus one, etc)
- If Soprano produces unsatisfactory results, you can easily regenerate it for a new, potentially better generation.
You may also change the sampling settings for more varied results.
- Avoid improper grammar such as not using contractions, multiple spaces, etc.
"""
    )
def main():
    """Launch the Gradio app with the MCP server enabled, a green Soft theme and custom link colors."""
    # Link colors follow the theme's primary palette.
    link_css = """
a {
color: var(--primary-600);
}
a:hover {
color: var(--primary-700);
}
"""
    # Start Gradio interface
    demo.launch(
        mcp_server=True,
        theme=gr.themes.Soft(primary_hue="green"),
        css=link_css,
    )


# Only start the server when this file is executed as a script.
if __name__ == "__main__":
    main()