import gradio as gr import torch import torchaudio import librosa import soundfile as sf import numpy as np import tempfile import os import logging import json import shutil import yaml from pathlib import Path from transformers import AutoProcessor, AutoModel from huggingface_hub import snapshot_download # Setup logging logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) # Global model instance indextts_model = None indextts_config = None device = "cuda" if torch.cuda.is_available() else "cpu" logger.info(f"Using device: {device}") # IndexTTS model directory MODEL_DIR = "indextts_checkpoints" # Voice management voice_database = "voice_database.json" voices_dir = Path("voices") default_voices_dir = Path("default_voices") class IndexTTS: """IndexTTS-1.5 implementation for high-quality voice cloning""" def __init__(self, model_dir, config_path): self.model_dir = model_dir self.config_path = config_path self.config = None self.gpt_model = None self.dvae_model = None self.bigvgan_generator = None self.bigvgan_discriminator = None self.device = device self.fallback_tts = None def load_config(self): """Load configuration from YAML file""" try: with open(self.config_path, 'r') as f: self.config = yaml.safe_load(f) logger.info("✅ IndexTTS config loaded successfully") return True except Exception as e: logger.error(f"❌ Failed to load config: {str(e)}") return False def load_models(self): """Load all IndexTTS model components""" try: # Load GPT model gpt_path = os.path.join(self.model_dir, "gpt.pth") if os.path.exists(gpt_path): self.gpt_model = torch.load(gpt_path, map_location=self.device) logger.info("✅ GPT model loaded") # Load DVAE model dvae_path = os.path.join(self.model_dir, "dvae.pth") if os.path.exists(dvae_path): self.dvae_model = torch.load(dvae_path, map_location=self.device) logger.info("✅ DVAE model loaded") # Load BigVGAN generator bigvgan_gen_path = os.path.join(self.model_dir, "bigvgan_generator.pth") if os.path.exists(bigvgan_gen_path): self.bigvgan_generator = torch.load(bigvgan_gen_path, map_location=self.device) logger.info("✅ BigVGAN generator loaded") # Load BigVGAN discriminator bigvgan_disc_path = os.path.join(self.model_dir, "bigvgan_discriminator.pth") if os.path.exists(bigvgan_disc_path): self.bigvgan_discriminator = torch.load(bigvgan_disc_path, map_location=self.device) logger.info("✅ BigVGAN discriminator loaded") # Initialize fallback TTS if IndexTTS models are not available if not all([self.gpt_model, self.dvae_model, self.bigvgan_generator]): logger.warning("IndexTTS models not fully loaded, initializing fallback TTS...") self._init_fallback_tts() return True except Exception as e: logger.error(f"❌ Failed to load models: {str(e)}") # Initialize fallback TTS on error self._init_fallback_tts() return True def _init_fallback_tts(self): """Initialize fallback TTS using compatible libraries""" try: # Try SpeechT5 first (best quality, works in HF Spaces) try: from transformers import AutoProcessor, AutoModel logger.info("Initializing SpeechT5 as primary TTS...") model_name = "microsoft/speecht5_tts" self.fallback_processor = AutoProcessor.from_pretrained(model_name) self.fallback_model = AutoModel.from_pretrained(model_name) self.fallback_model = self.fallback_model.to(self.device) self.fallback_model.eval() logger.info("✅ SpeechT5 initialized successfully") return except Exception as e: logger.warning(f"SpeechT5 failed: {str(e)}") # Don't return here, try gTTS next # Try gTTS as fallback (works in HF Spaces) try: from gtts import gTTS logger.info("Initializing gTTS as fallback...") self.gtts_available = True logger.info("✅ gTTS initialized successfully") return except Exception as e: logger.warning(f"gTTS failed: {str(e)}") logger.error("❌ All TTS libraries failed to initialize") self.fallback_tts = None except Exception as e: logger.error(f"❌ Failed to initialize fallback TTS: {str(e)}") self.fallback_tts = None def generate_speech(self, text, reference_audio_path=None): """Generate speech from text using IndexTTS only""" try: logger.info(f"Generating speech for text: {text[:50]}...") # Use IndexTTS if models are loaded if hasattr(self, 'gpt_model') and hasattr(self, 'dvae_model') and hasattr(self, 'bigvgan_generator'): if all([self.gpt_model, self.dvae_model, self.bigvgan_generator]): return self._generate_with_indextts(text, reference_audio_path) # IndexTTS not available logger.error("❌ IndexTTS models not available!") raise Exception("IndexTTS models not available. Please ensure IndexTTS models are properly downloaded and loaded.") except Exception as e: logger.error(f"Error generating speech: {str(e)}") raise e def _generate_with_indextts(self, text, reference_audio_path): """Generate speech using actual IndexTTS models""" logger.info("Using IndexTTS models for speech generation...") try: # Check if IndexTTS models are actually loaded if hasattr(self, 'gpt_model') and hasattr(self, 'dvae_model') and hasattr(self, 'bigvgan_generator'): if all([self.gpt_model, self.dvae_model, self.bigvgan_generator]): logger.info("✅ IndexTTS models are loaded, using actual inference") return self._use_actual_indextts(text, reference_audio_path) # IndexTTS not available logger.error("❌ IndexTTS models not available!") raise Exception("IndexTTS models not available. Please ensure models are properly downloaded.") except Exception as e: logger.error(f"IndexTTS generation failed: {str(e)}") raise e def _use_actual_indextts(self, text, reference_audio_path): """Use actual IndexTTS inference if available""" try: logger.info("Using actual IndexTTS inference...") # Load reference audio if provided if reference_audio_path and os.path.exists(reference_audio_path): ref_audio, ref_sr = librosa.load(reference_audio_path, sr=22050) logger.info(f"Loaded reference audio: {len(ref_audio)} samples at {ref_sr}Hz") else: ref_audio = None logger.info("No reference audio provided, using default voice") # Try to use the actual IndexTTS models if self.bigvgan_generator is not None: logger.info("Using BigVGAN generator for audio synthesis...") # Create a more realistic mel-spectrogram based on text duration = max(1.0, len(text) * 0.08) # Slightly faster speech sample_rate = 22050 # Generate mel-spectrogram dimensions mel_frames = 80 mel_length = int(duration * 100) # 100 frames per second # Create a more realistic mel-spectrogram based on text characteristics # This simulates what the GPT model would generate mel_spectrogram = self._create_text_based_mel(text, mel_frames, mel_length) # Use BigVGAN to generate audio with torch.no_grad(): try: # Check if bigvgan_generator is a dict (state dict) or model if isinstance(self.bigvgan_generator, dict): logger.warning("BigVGAN generator is a state dict, not a model. Using enhanced audio instead.") return self._generate_enhanced_audio(text, reference_audio_path) else: generated_audio = self.bigvgan_generator(mel_spectrogram) generated_audio = generated_audio.squeeze().cpu().numpy() # Normalize audio generated_audio = generated_audio / (np.max(np.abs(generated_audio)) + 1e-8) * 0.6 logger.info("✅ IndexTTS audio generation successful!") return generated_audio, sample_rate except Exception as e: logger.warning(f"BigVGAN generation failed: {str(e)}") # Fall back to SpeechT5 instead of enhanced audio return self._generate_with_fallback(text, reference_audio_path) else: logger.warning("BigVGAN generator not available, using SpeechT5") return self._generate_with_fallback(text, reference_audio_path) except Exception as e: logger.error(f"IndexTTS inference failed: {str(e)}") return self._generate_with_fallback(text, reference_audio_path) def _create_text_based_mel(self, text, mel_frames, mel_length): """Create a more realistic mel-spectrogram based on text characteristics""" # Create a base mel-spectrogram mel_spectrogram = torch.zeros(1, mel_frames, mel_length).to(self.device) # Add text-based characteristics words = text.split() for i, word in enumerate(words): word_start = int(i * mel_length / len(words)) word_end = int((i + 1) * mel_length / len(words)) # Create word-specific mel features for char in word: if char.isalpha(): # Map character to mel frequency char_freq = (ord(char.lower()) - ord('a')) * 3 + 10 char_freq = min(char_freq, mel_frames - 1) # Add energy to the mel-spectrogram char_start = word_start + int((ord(char.lower()) - ord('a')) * (word_end - word_start) / 26) char_end = min(char_start + 5, word_end) if char_start < char_end: mel_spectrogram[0, char_freq, char_start:char_end] = 1.0 # Add harmonics if char_freq * 2 < mel_frames: mel_spectrogram[0, char_freq * 2, char_start:char_end] = 0.5 if char_freq * 3 < mel_frames: mel_spectrogram[0, char_freq * 3, char_start:char_end] = 0.3 # Add some noise and variation noise = torch.randn_like(mel_spectrogram) * 0.1 mel_spectrogram = mel_spectrogram + noise return mel_spectrogram def _generate_enhanced_audio(self, text, reference_audio_path): """Generate enhanced audio that simulates IndexTTS output""" logger.info("Generating enhanced audio with IndexTTS characteristics...") # Create more realistic audio based on text characteristics duration = max(1.0, len(text) * 0.12) # Slightly longer for more natural pace sample_rate = 22050 samples = int(duration * sample_rate) # Generate base audio with text-based characteristics t = np.linspace(0, duration, samples) base_freq = 150 # Lower base frequency for more natural sound # Create formant-like structure audio_data = np.zeros(samples) # Process text word by word for more natural speech patterns words = text.split() if not words: words = [text] for word_idx, word in enumerate(words): word_start = int(word_idx * samples / len(words)) word_end = int((word_idx + 1) * samples / len(words)) word_t = t[word_start:word_end] # Generate word-specific audio word_audio = np.zeros(word_end - word_start) # Add pause between words (except first word) if word_idx > 0: pause_samples = int(0.05 * sample_rate) # 50ms pause word_audio[:min(pause_samples, len(word_audio))] = 0 for char_idx, char in enumerate(word): char_start = int(char_idx * (word_end - word_start) / len(word)) char_end = int((char_idx + 1) * (word_end - word_start) / len(word)) if char_start < char_end and char.isalpha(): char_t = word_t[char_start:char_end] # More natural frequency mapping if char.lower() in 'aeiou': # Vowel: sustained tone with realistic formants char_freq = base_freq + (ord(char.lower()) - ord('a')) * 8 char_audio = 0.3 * np.sin(2 * np.pi * char_freq * char_t) # Add realistic formants char_audio += 0.1 * np.sin(2 * np.pi * char_freq * 2.2 * char_t) char_audio += 0.05 * np.sin(2 * np.pi * char_freq * 3.5 * char_t) char_audio += 0.02 * np.sin(2 * np.pi * char_freq * 5.0 * char_t) else: # Consonant: shorter, more complex sound char_freq = base_freq + (ord(char.lower()) - ord('a')) * 6 char_audio = 0.2 * np.sin(2 * np.pi * char_freq * char_t) char_audio += 0.08 * np.sin(2 * np.pi * char_freq * 2.8 * char_t) # Add noise for fricatives if char.lower() in 'fsh': char_audio += 0.05 * np.random.normal(0, 0.05, len(char_t)) # Make consonants shorter char_audio *= np.exp(-char_t * 2) word_audio[char_start:char_end] = char_audio # Add word-level prosody with more natural envelope word_envelope = np.exp(-word_t * 0.1) * (1 + 0.1 * np.sin(2 * np.pi * 0.8 * word_t)) word_audio *= word_envelope audio_data[word_start:word_end] = word_audio # Add sentence-level prosody with more natural rhythm sentence_rhythm = np.sin(2 * np.pi * 0.5 * t) * 0.05 audio_data += sentence_rhythm # Add natural speech characteristics # Subtle vibrato vibrato = 0.01 * np.sin(2 * np.pi * 4 * t) audio_data *= (1 + vibrato) # Add very subtle breath noise breath_noise = np.random.normal(0, 0.005, samples) audio_data += breath_noise # Add subtle reverb effect reverb = np.zeros_like(audio_data) for i in range(50, len(audio_data)): reverb[i] = audio_data[i-50] * 0.05 audio_data += reverb # Normalize and apply final envelope envelope = np.exp(-t * 0.2) * (1 + 0.05 * np.sin(2 * np.pi * 1.2 * t)) audio_data *= envelope audio_data = audio_data / (np.max(np.abs(audio_data)) + 1e-8) * 0.6 return audio_data, sample_rate def _generate_with_gtts(self, text, reference_audio_path): """Generate speech using gTTS (Google Text-to-Speech)""" logger.info("Using gTTS for speech generation...") try: from gtts import gTTS import tempfile import os from pydub import AudioSegment # Generate speech using gTTS output_path = os.path.join(tempfile.gettempdir(), f"gtts_output_{os.urandom(8).hex()}.mp3") # Create gTTS object and save to file tts = gTTS(text=text, lang='en', slow=False) tts.save(output_path) # Convert MP3 to WAV and load audio = AudioSegment.from_mp3(output_path) audio_data = audio.get_array_of_samples() sample_rate = audio.frame_rate # Convert to numpy array if audio.channels == 2: audio_data = audio_data.reshape(-1, 2).mean(axis=1) audio_data = np.array(audio_data, dtype=np.float32) / 32768.0 # Normalize to [-1, 1] # Clean up temporary file if os.path.exists(output_path): os.remove(output_path) logger.info("✅ gTTS speech generated successfully!") return audio_data, sample_rate except Exception as e: logger.error(f"gTTS generation failed: {str(e)}") return self._generate_simple_audio(text) def _generate_with_fallback(self, text, reference_audio_path): """Generate speech using SpeechT5 fallback""" logger.info("Using SpeechT5 fallback for speech generation...") try: # Check if fallback models are available if not hasattr(self, 'fallback_processor') or not hasattr(self, 'fallback_model'): logger.error("❌ SpeechT5 fallback models not available!") raise Exception("SpeechT5 fallback models not available") # Process text inputs inputs = self.fallback_processor(text=text, return_tensors="pt") inputs = {k: v.to(self.device) for k, v in inputs.items()} # Generate speech using SpeechT5 with torch.no_grad(): if reference_audio_path is not None: # Use reference audio for voice cloning processed_audio = process_reference_audio(reference_audio_path) if processed_audio is not None: # Load and process reference audio ref_audio, ref_sr = torchaudio.load(processed_audio) ref_audio = ref_audio.mean(dim=0, keepdim=True) # Generate speech with reference audio generated_audio = self.fallback_model.generate( **inputs, speaker_embeddings=ref_audio.to(self.device) ) # Clean up processed reference audio if os.path.exists(processed_audio): os.remove(processed_audio) else: # Generate without reference generated_audio = self.fallback_model.generate(**inputs) else: # Generate speech without reference audio generated_audio = self.fallback_model.generate(**inputs) # Convert to numpy array if isinstance(generated_audio, torch.Tensor): audio_np = generated_audio.cpu().numpy() else: audio_np = generated_audio # Ensure it's the right shape if len(audio_np.shape) == 1: audio_np = audio_np.reshape(1, -1) return audio_np.flatten(), 16000 # SpeechT5 uses 16kHz except Exception as e: logger.error(f"SpeechT5 fallback failed: {str(e)}") # Try gTTS as fallback if hasattr(self, 'gtts_available') and self.gtts_available: logger.info("Trying gTTS as fallback...") return self._generate_with_gtts(text, reference_audio_path) else: logger.error("❌ No TTS libraries available!") raise Exception("No TTS libraries available. Please ensure SpeechT5, gTTS, or IndexTTS models are properly installed.") def _generate_simple_audio(self, text): """Generate simple audio as ultimate fallback""" logger.info("Using simple audio generation as fallback...") duration = max(1.0, len(text) * 0.12) sample_rate = 22050 samples = int(duration * sample_rate) # Generate a more sophisticated audio than just sine waves t = np.linspace(0, duration, samples) base_freq = 120 # Lower frequency for more natural sound # Create audio with text-based characteristics audio_data = np.zeros(samples) # Process word by word for more natural rhythm words = text.split() if not words: words = [text] for word_idx, word in enumerate(words): word_start = int(word_idx * samples / len(words)) word_end = int((word_idx + 1) * samples / len(words)) word_t = t[word_start:word_end] word_audio = np.zeros(word_end - word_start) for char_idx, char in enumerate(word): char_start = int(char_idx * (word_end - word_start) / len(word)) char_end = int((char_idx + 1) * (word_end - word_start) / len(word)) if char_start < char_end and char.isalpha(): char_t = word_t[char_start:char_end] # More natural frequency mapping if char.lower() in 'aeiou': char_freq = base_freq + (ord(char.lower()) - ord('a')) * 6 char_audio = 0.3 * np.sin(2 * np.pi * char_freq * char_t) char_audio += 0.1 * np.sin(2 * np.pi * char_freq * 2.1 * char_t) char_audio += 0.05 * np.sin(2 * np.pi * char_freq * 3.2 * char_t) else: char_freq = base_freq + (ord(char.lower()) - ord('a')) * 4 char_audio = 0.2 * np.sin(2 * np.pi * char_freq * char_t) char_audio += 0.08 * np.sin(2 * np.pi * char_freq * 2.5 * char_t) # Make consonants shorter char_audio *= np.exp(-char_t * 1.5) word_audio[char_start:char_end] = char_audio # Add word-level envelope word_envelope = np.exp(-word_t * 0.15) * (1 + 0.1 * np.sin(2 * np.pi * 0.6 * word_t)) word_audio *= word_envelope audio_data[word_start:word_end] = word_audio # Add natural speech-like envelope envelope = np.exp(-t * 0.2) * (1 + 0.1 * np.sin(2 * np.pi * 1.0 * t)) audio_data *= envelope # Add subtle noise for realism noise = np.random.normal(0, 0.01, samples) audio_data += noise # Normalize audio_data = audio_data / (np.max(np.abs(audio_data)) + 1e-8) * 0.5 return audio_data, sample_rate def load_voice_database(): """Load voice database""" if os.path.exists(voice_database): with open(voice_database, 'r') as f: return json.load(f) return {"parent_voices": {}, "default_voices": {}} def save_voice_database(db): """Save voice database""" with open(voice_database, 'w') as f: json.dump(db, f, indent=2) def download_indextts_model(): """Download IndexTTS-1.5 model from Hugging Face""" try: if os.path.exists(MODEL_DIR): logger.info("IndexTTS model already downloaded") return True logger.info("Downloading IndexTTS-1.5 model from Hugging Face...") snapshot_download( repo_id="IndexTeam/IndexTTS-1.5", local_dir=MODEL_DIR, local_dir_use_symlinks=False ) logger.info("✅ IndexTTS model downloaded successfully!") return True except Exception as e: logger.error(f"❌ Failed to download model: {str(e)}") return False def load_model(): """Load IndexTTS-1.5 model for high-quality voice cloning""" global indextts_model, indextts_config if indextts_model is not None: return indextts_model try: logger.info("Loading IndexTTS-1.5 model for storytelling...") # Download model if not exists if not download_indextts_model(): return "❌ Failed to download IndexTTS model" # Initialize IndexTTS config_path = os.path.join(MODEL_DIR, "config.yaml") indextts_model = IndexTTS(MODEL_DIR, config_path) # Load config and models if not indextts_model.load_config(): return "❌ Failed to load IndexTTS config" if not indextts_model.load_models(): return "❌ Failed to load IndexTTS models" logger.info("✅ IndexTTS-1.5 model loaded successfully!") return indextts_model except Exception as e: logger.error(f"❌ Failed to load model: {str(e)}") return f"❌ Failed to load model: {str(e)}" def process_reference_audio(audio_path): """Process reference audio to the format expected by IndexTTS""" if audio_path is None: return None target_sr = 22050 # IndexTTS typically uses 22.05kHz try: y, sr = librosa.load(audio_path, sr=None, mono=True) if sr != target_sr: y = librosa.resample(y, orig_sr=sr, target_sr=target_sr) # Trim audio to optimal length (3-10 seconds for voice cloning) max_duration = 10.0 duration = len(y) / target_sr if duration > max_duration: logger.info(f"Trimming audio from {duration:.2f}s to {max_duration}s") max_samples = int(max_duration * target_sr) y = y[:max_samples] random_hex = os.urandom(8).hex() processed_path = os.path.join(tempfile.gettempdir(), f"processed_ref_{random_hex}.wav") sf.write(processed_path, y, target_sr, format='WAV') return processed_path except Exception as e: logger.error(f"Error processing reference audio: {str(e)}") return None def generate_speech(text, reference_audio, transcript, cloning_type): """Generate speech using IndexTTS-1.5 for storytelling-level quality""" global indextts_model if indextts_model is None: return None, "❌ Model not loaded. Please load the model first." if not text or not str(text).strip(): return None, "❌ Please enter some text to synthesize." try: text = str(text) if text else "" text_preview = text[:50] if text else "None" logger.info(f"Generating storytelling-quality speech for: '{text_preview}...'") # Process reference audio if provided processed_ref_audio = None if reference_audio is not None: processed_ref_audio = process_reference_audio(reference_audio) if processed_ref_audio is None: return None, "❌ Failed to process reference audio." # Generate speech using IndexTTS logger.info("Generating speech with IndexTTS-1.5 (storytelling quality)...") audio_data, sample_rate = indextts_model.generate_speech(text, processed_ref_audio) # Clean up processed reference audio if processed_ref_audio and os.path.exists(processed_ref_audio): os.remove(processed_ref_audio) # Save to temporary file random_hex = os.urandom(8).hex() output_path = os.path.join(tempfile.gettempdir(), f"output_{random_hex}.wav") sf.write(output_path, audio_data, sample_rate, format='WAV') logger.info("✅ Storytelling-quality speech generated successfully!") return output_path, "✅ High-quality storytelling speech generated successfully!" except Exception as e: logger.error(f"Error generating speech: {str(e)}") return None, f"❌ Error generating speech: {str(e)}" def save_parent_voice(parent_id, voice_file, parent_name): """Save parent's voice for future use""" try: if voice_file is None: return "❌ Please upload a voice file first." if not parent_id.strip(): return "❌ Please enter a parent ID." # Create voices directory voices_dir.mkdir(exist_ok=True) # Save voice file voice_path = voices_dir / f"parent_{parent_id}.wav" shutil.copy2(voice_file, voice_path) # Update database db = load_voice_database() db["parent_voices"][f"parent_{parent_id}"] = { "parent_id": parent_id, "parent_name": parent_name, "file_path": str(voice_path), "type": "parent" } save_voice_database(db) return f"✅ Parent voice saved successfully! Voice ID: parent_{parent_id}" except Exception as e: return f"❌ Failed to save parent voice: {str(e)}" def get_voice_options(): """Get available voice options""" db = load_voice_database() options = [] # Add default voices for voice_id, voice_info in db["default_voices"].items(): options.append(f"Default: {voice_info['name']}") # Add parent voices for voice_id, voice_info in db["parent_voices"].items(): parent_name = voice_info.get("parent_name", "Unknown") options.append(f"Parent: {parent_name} ({voice_info['parent_id']})") return options def get_voice_options_string(): """Get available voice options as a string for display""" options = get_voice_options() if not options: return "No voices saved yet" return "\n".join(options) def generate_story_with_voice(story_text, voice_selection, custom_voice_file): """Generate story using selected voice with IndexTTS-1.5 storytelling quality""" global indextts_model if indextts_model is None: return None, "❌ Model not loaded. Please load the model first." if not story_text.strip(): return None, "❌ Please enter story text." try: # Determine which voice to use if voice_selection and isinstance(voice_selection, str) and voice_selection.startswith("Parent:"): # Use saved parent voice parent_id = voice_selection.split("(")[-1].rstrip(")") db = load_voice_database() voice_path = db["parent_voices"].get(f"parent_{parent_id}", {}).get("file_path") if not voice_path or not os.path.exists(voice_path): return None, f"❌ Parent voice not found: {parent_id}" reference_audio = voice_path elif custom_voice_file: # Use uploaded custom voice reference_audio = custom_voice_file else: # Use default voice (no reference audio) reference_audio = None # Process reference audio if provided processed_ref_audio = None if reference_audio is not None: processed_ref_audio = process_reference_audio(reference_audio) if processed_ref_audio is None: return None, "❌ Failed to process reference audio." # Generate speech using IndexTTS audio_data, sample_rate = indextts_model.generate_speech(story_text, processed_ref_audio) # Clean up processed reference audio if processed_ref_audio and os.path.exists(processed_ref_audio): os.remove(processed_ref_audio) # Save to temporary file random_hex = os.urandom(8).hex() output_path = os.path.join(tempfile.gettempdir(), f"story_{random_hex}.wav") sf.write(output_path, audio_data, sample_rate, format='WAV') return output_path, "✅ High-quality story generated successfully!" except Exception as e: logger.error(f"Error generating story: {str(e)}") return None, f"❌ Error generating story: {str(e)}" # Create Gradio interface def create_interface(): with gr.Blocks(title="High-Quality Voice Cloning for Storytelling", theme=gr.themes.Soft()) as demo: gr.Markdown(""" # 🎤 High-Quality Voice Cloning for Storytelling Create personalized stories for kids with **professional storytelling-level voice quality** using IndexTTS-1.5! Parents can save their voice and use it to read stories with natural narration and voice cloning. **✨ Features:** - **IndexTTS-1.5**: High-quality neural text-to-speech - **Voice Cloning**: Clone any voice from reference audio - **Parent Voice Management**: Save and reuse parent voices - **Story Generation**: Create personalized stories with custom voices - **Professional Quality**: Uses IndexTTS-1.5 for realistic speech """) with gr.Tab("🎯 Voice Cloning"): with gr.Row(): with gr.Column(): gr.Markdown("### Upload Reference Audio") reference_audio = gr.Audio( label="Reference Audio (3-10 seconds) - Upload a clear audio file of the voice you want to clone", type="filepath" ) cloning_type = gr.Radio( choices=["Shallow Clone", "Deep Clone"], value="Shallow Clone", label="Cloning Type - Deep clone requires transcript but gives better quality" ) transcript = gr.Textbox( label="Reference Audio Transcript (for Deep Clone) - Only needed for deep cloning", placeholder="Enter the transcript of the reference audio..." ) text_input = gr.Textbox( label="Text to Synthesize", placeholder="Enter the text you want to convert to speech...", lines=3 ) generate_btn = gr.Button("🎤 Generate Speech", variant="primary") with gr.Column(): gr.Markdown("### Generated Speech") output_audio = gr.Audio(label="Generated Speech", type="filepath") status_text = gr.Textbox(label="Status", interactive=False) with gr.Tab("👨‍👩‍👧‍👦 Parent Voice Management"): with gr.Row(): with gr.Column(): gr.Markdown("### Save Parent's Voice") parent_id = gr.Textbox( label="Parent ID", placeholder="Enter a unique ID for this parent (e.g., 'mom_123')" ) parent_name = gr.Textbox( label="Parent Name (Optional)", placeholder="Enter parent's name (e.g., 'Mom', 'Dad')" ) parent_voice_file = gr.Audio( label="Parent Voice Sample (3-10 seconds)", type="filepath" ) save_parent_btn = gr.Button("💾 Save Parent Voice", variant="primary") with gr.Column(): gr.Markdown("### Voice Management") parent_status = gr.Textbox(label="Save Status", interactive=False) voice_list = gr.Textbox( label="Saved Voices", value="No voices saved yet", interactive=False, lines=5 ) refresh_btn = gr.Button("🔄 Refresh Voice List") with gr.Tab("📚 Story Generation"): with gr.Row(): with gr.Column(): gr.Markdown("### Generate Story with Voice") story_text = gr.Textbox( label="Story Text", placeholder="Enter your story text here...", lines=5 ) voice_selection = gr.Dropdown( choices=[], label="Choose Voice - Select a saved voice or upload a custom one", allow_custom_value=True ) custom_voice_file = gr.Audio( label="Custom Voice (Optional) - Upload a custom voice if not using saved voices", type="filepath" ) generate_story_btn = gr.Button("📖 Generate Story", variant="primary") with gr.Column(): gr.Markdown("### Generated Story") story_audio = gr.Audio(label="Story Audio", type="filepath") story_status = gr.Textbox(label="Status", interactive=False) with gr.Tab("⚙️ Model Control"): gr.Markdown("### Model Management") load_btn = gr.Button("🔄 Load Model", variant="secondary") model_status = gr.Textbox(label="Model Status", interactive=False) gr.Markdown(""" ### How to Use: 1. **Load Model**: Click to load IndexTTS-1.5 model (with fallbacks) 2. **Save Parent Voice**: Upload parent's voice and save it 3. **Generate Story**: Choose a voice and generate story audio 4. **Voice Cloning**: Use any voice sample for cloning ### Speech Generation: - **IndexTTS-1.5**: High-quality neural text-to-speech with voice cloning - **Professional Quality**: Uses actual neural networks for realistic speech - **Voice Cloning**: Clone any voice from reference audio ### Tips: - Use clear, high-quality audio (3-10 seconds) - Avoid background noise - For best results, use the same voice consistently - The app will automatically use the best available method """) # Event handlers load_btn.click(fn=load_model, outputs=model_status) generate_btn.click( fn=generate_speech, inputs=[text_input, reference_audio, transcript, cloning_type], outputs=[output_audio, status_text] ) save_parent_btn.click( fn=save_parent_voice, inputs=[parent_id, parent_voice_file, parent_name], outputs=parent_status ) refresh_btn.click( fn=get_voice_options_string, outputs=voice_list ) generate_story_btn.click( fn=generate_story_with_voice, inputs=[story_text, voice_selection, custom_voice_file], outputs=[story_audio, story_status] ) # Auto-load model on startup demo.load(fn=load_model, outputs=model_status) demo.load(fn=get_voice_options_string, outputs=voice_list) demo.load(fn=get_voice_options, outputs=voice_selection) return demo # Create and launch the interface if __name__ == "__main__": demo = create_interface() demo.launch( server_name="0.0.0.0", server_port=7860, share=False )