import gradio as gr
import torch
from soprano import SopranoTTS
import numpy as np
import socket
import time
import spaces

# Detect device
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

model = None


# Initialize model
def load_model():
    global model
    if model is None:
        model = SopranoTTS(
            backend="auto",
            device=DEVICE,
            cache_size_mb=10000,
            decoder_batch_size=8,
        )
    return model
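
# Sample rate of the generated audio (Soprano outputs 32 kHz)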
SAMPLE_RATE = 32000
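

# ZeroGPU Spaces need @spaces.GPU on the GPU-using function; the otherwise unused
# `spaces` import suggests that is the intent here, so the decorator is assumed below.
@spaces.GPU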
def generate_speech(
    text: str,
    temperature: float = 0.3,
    top_p: float = 0.95,
    repetition_penalty: float = 1.2,
):
    """
    Runs the Soprano text-to-speech model on the given input text with the given sampling parameters.

    Returns:
        ((sr, audio), status) where sr is the sample rate (default 32000), audio is the
        output audio as an np.ndarray, and status is the status text displayed in the UI.
    """
    if not text.strip():
        yield None, "Please enter some text to generate speech."
        return

    # Log the first line of the prompt; never let logging break generation.
    try:
        print(text.split("\n")[0])
    except Exception:
        pass

    try:
        yield None, "⏳ Loading model..."
        model = load_model()

        yield None, "⏳ Generating audio..."
        start_time = time.perf_counter()
        audio = model.infer(
            text,
            temperature=temperature,
            top_p=top_p,
            repetition_penalty=repetition_penalty,
        )
        gen_time = time.perf_counter() - start_time

        audio_np = audio.cpu().numpy()
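        # Convert the float waveform in [-1, 1] to 16-bit PCM for the gr.Audio "numpy" output.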
        audio_int16 = (audio_np * 32767).astype(np.int16)

        audio_seconds = len(audio_np) / SAMPLE_RATE
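        # Realtime factor: seconds of audio produced per second of generation time.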
        rtf = audio_seconds / gen_time if gen_time > 0 else float("inf")

        status = (
            f"✓ Generated {audio_seconds:.2f} s audio | "
            f"Generation time: {gen_time:.3f} s "
            f"({rtf:.2f}x realtime)"
        )
        yield (SAMPLE_RATE, audio_int16), status
        return
    except Exception as e:
        yield None, f"✗ Error: {str(e)}"


# Create Gradio interface (theme and the custom link CSS belong on gr.Blocks, not launch())
with gr.Blocks(
    title="Soprano TTS",
    theme=gr.themes.Soft(primary_hue="green"),
    css="""
    a { color: var(--primary-600); }
    a:hover { color: var(--primary-700); }
    """,
) as demo:
    gr.Markdown(
        f"""
        # 🗣️ Soprano TTS

        <div align="center">
            <img width="300" height="300" alt="soprano-github" src="https://github.com/user-attachments/assets/4d612eac-23b8-44e6-8c59-d7ac14ebafd1" />
        </div>

        **Running on: {DEVICE.upper()}**

        **GitHub:** https://github.com/ekwek1/soprano

        **Model Weights:** https://huggingface.co/ekwek/Soprano-1.1-80M

        **Model Demo:** https://huggingface.co/spaces/ekwek/Soprano-TTS
        """
    )

    with gr.Row():
        with gr.Column(scale=2):
            text_input = gr.Textbox(
                label="Text to Synthesize",
                placeholder="Enter text here...",
                value="Soprano is an extremely lightweight text to speech model designed to produce highly realistic speech at unprecedented speed.",
                lines=5,
                max_lines=10,
            )

            with gr.Accordion("Advanced Settings", open=False):
                temperature = gr.Slider(
                    minimum=0.0,
                    maximum=1.0,
                    value=0.0,
                    step=0.05,
                    label="Temperature",
                )
                top_p = gr.Slider(
                    minimum=0.5,
                    maximum=1.0,
                    value=0.95,
                    step=0.05,
                    label="Top P",
                )
                repetition_penalty = gr.Slider(
                    minimum=1.0,
                    maximum=2.0,
                    value=1.2,
                    step=0.1,
                    label="Repetition Penalty",
                )

            generate_btn = gr.Button("Generate Speech", variant="primary", size="lg")

        with gr.Column(scale=1):
            audio_output = gr.Audio(
                label="Generated Speech",
                type="numpy",
                autoplay=True,
            )
            status_output = gr.Textbox(
                label="Status",
                interactive=False,
                lines=3,
                max_lines=10,
            )

    gr.Examples(
        examples=[
            ["Soprano is an extremely lightweight text to speech model.", 0.0, 0.95, 1.2],
            ["Artificial intelligence is transforming the world.", 0.0, 0.90, 1.2],
            ["I'm so excited, I can't even wait!", 0.0, 0.95, 1.2],
            ["Why don't you go ahead and try it?", 0.0, 0.95, 1.2],
        ],
        inputs=[text_input, temperature, top_p, repetition_penalty],
        label="Example Prompts",
    )
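
    # The click handler streams the generator's (audio, status) updates into the two outputs.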
    generate_btn.click(
        fn=generate_speech,
        inputs=[text_input, temperature, top_p, repetition_penalty],
        outputs=[audio_output, status_output],
    )

    gr.Markdown(
        """
        ### Usage tips:
        - Note: Soprano is currently **English-only**. Other languages are not guaranteed to work.
        - Soprano works best when each sentence is between 2 and 30 seconds long.
        - Although Soprano recognizes numbers and some special characters, it occasionally mispronounces them.
          For best results, convert these into their written-out form (e.g. 1+1 -> one plus one).
        - If a generation is unsatisfactory, simply regenerate for a new, potentially better result.
          You can also adjust the sampling settings for more varied output.
        - Avoid improper grammar such as missing contractions, multiple spaces, etc.
        """
    )


def main():
    # Start the Gradio interface; mcp_server=True also exposes the app's functions as MCP tools.
    demo.launch(mcp_server=True)


if __name__ == "__main__":
    main()