import gradio as gr
import torch
import torchaudio
import librosa
import soundfile as sf
import numpy as np
import tempfile
import os
import logging
import json
import shutil
import yaml
from pathlib import Path
from transformers import AutoProcessor, AutoModel
from huggingface_hub import snapshot_download

# Setup logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Global model instance
indextts_model = None
indextts_config = None
device = "cuda" if torch.cuda.is_available() else "cpu"
logger.info(f"Using device: {device}")

# IndexTTS model directory
MODEL_DIR = "indextts_checkpoints"

# Voice management
voice_database = "voice_database.json"
voices_dir = Path("voices")
default_voices_dir = Path("default_voices")

class IndexTTS:
    """IndexTTS-1.5 implementation for high-quality voice cloning"""
    
    def __init__(self, model_dir, config_path):
        self.model_dir = model_dir
        self.config_path = config_path
        self.config = None
        self.gpt_model = None
        self.dvae_model = None
        self.bigvgan_generator = None
        self.bigvgan_discriminator = None
        self.device = device
        self.fallback_tts = None
        
    def load_config(self):
        """Load configuration from YAML file"""
        try:
            with open(self.config_path, 'r') as f:
                self.config = yaml.safe_load(f)
            logger.info("✅ IndexTTS config loaded successfully")
            return True
        except Exception as e:
            logger.error(f"❌ Failed to load config: {str(e)}")
            return False
    
    def load_models(self):
        """Load all IndexTTS model components"""
        try:
            # Load GPT model
            gpt_path = os.path.join(self.model_dir, "gpt.pth")
            if os.path.exists(gpt_path):
                self.gpt_model = torch.load(gpt_path, map_location=self.device)
                logger.info("✅ GPT model loaded")
            
            # Load DVAE model
            dvae_path = os.path.join(self.model_dir, "dvae.pth")
            if os.path.exists(dvae_path):
                self.dvae_model = torch.load(dvae_path, map_location=self.device)
                logger.info("✅ DVAE model loaded")
            
            # Load BigVGAN generator
            bigvgan_gen_path = os.path.join(self.model_dir, "bigvgan_generator.pth")
            if os.path.exists(bigvgan_gen_path):
                self.bigvgan_generator = torch.load(bigvgan_gen_path, map_location=self.device)
                logger.info("✅ BigVGAN generator loaded")
            
            # Load BigVGAN discriminator
            bigvgan_disc_path = os.path.join(self.model_dir, "bigvgan_discriminator.pth")
            if os.path.exists(bigvgan_disc_path):
                self.bigvgan_discriminator = torch.load(bigvgan_disc_path, map_location=self.device)
                logger.info("✅ BigVGAN discriminator loaded")
            
            # Initialize fallback TTS if IndexTTS models are not available
            if not all([self.gpt_model, self.dvae_model, self.bigvgan_generator]):
                logger.warning("IndexTTS models not fully loaded, initializing fallback TTS...")
                self._init_fallback_tts()
            
            return True
        except Exception as e:
            logger.error(f"❌ Failed to load models: {str(e)}")
            # Initialize fallback TTS on error
            self._init_fallback_tts()
            return True
    
    def _init_fallback_tts(self):
        """Initialize fallback TTS using compatible libraries"""
        try:
            # Try SpeechT5 first (best quality, works in HF Spaces)
            try:
                from transformers import AutoProcessor, AutoModel
                logger.info("Initializing SpeechT5 as primary TTS...")
                model_name = "microsoft/speecht5_tts"
                self.fallback_processor = AutoProcessor.from_pretrained(model_name)
                self.fallback_model = AutoModel.from_pretrained(model_name)
                self.fallback_model = self.fallback_model.to(self.device)
                self.fallback_model.eval()
                logger.info("✅ SpeechT5 initialized successfully")
                return
            except Exception as e:
                logger.warning(f"SpeechT5 failed: {str(e)}")
                # Don't return here, try gTTS next
            
            # Try gTTS as fallback (works in HF Spaces)
            try:
                from gtts import gTTS
                logger.info("Initializing gTTS as fallback...")
                self.gtts_available = True
                logger.info("✅ gTTS initialized successfully")
                return
            except Exception as e:
                logger.warning(f"gTTS failed: {str(e)}")
            
            logger.error("❌ All TTS libraries failed to initialize")
            self.fallback_tts = None
            
        except Exception as e:
            logger.error(f"❌ Failed to initialize fallback TTS: {str(e)}")
            self.fallback_tts = None
    
    def generate_speech(self, text, reference_audio_path=None):
        """Generate speech from text using IndexTTS only"""
        try:
            logger.info(f"Generating speech for text: {text[:50]}...")
            
            # Use IndexTTS if models are loaded
            if hasattr(self, 'gpt_model') and hasattr(self, 'dvae_model') and hasattr(self, 'bigvgan_generator'):
                if all([self.gpt_model, self.dvae_model, self.bigvgan_generator]):
                    return self._generate_with_indextts(text, reference_audio_path)
            
            # IndexTTS not available
            logger.error("❌ IndexTTS models not available!")
            raise Exception("IndexTTS models not available. Please ensure IndexTTS models are properly downloaded and loaded.")
                
        except Exception as e:
            logger.error(f"Error generating speech: {str(e)}")
            raise e
    
    def _generate_with_indextts(self, text, reference_audio_path):
        """Generate speech using actual IndexTTS models"""
        logger.info("Using IndexTTS models for speech generation...")
        
        try:
            # Check if IndexTTS models are actually loaded
            if hasattr(self, 'gpt_model') and hasattr(self, 'dvae_model') and hasattr(self, 'bigvgan_generator'):
                if all([self.gpt_model, self.dvae_model, self.bigvgan_generator]):
                    logger.info("✅ IndexTTS models are loaded, using actual inference")
                    return self._use_actual_indextts(text, reference_audio_path)
            
            # IndexTTS not available
            logger.error("❌ IndexTTS models not available!")
            raise Exception("IndexTTS models not available. Please ensure models are properly downloaded.")
            
        except Exception as e:
            logger.error(f"IndexTTS generation failed: {str(e)}")
            raise e
    
    def _use_actual_indextts(self, text, reference_audio_path):
        """Use actual IndexTTS inference if available"""
        try:
            logger.info("Using actual IndexTTS inference...")
            
            # Load reference audio if provided
            if reference_audio_path and os.path.exists(reference_audio_path):
                ref_audio, ref_sr = librosa.load(reference_audio_path, sr=22050)
                logger.info(f"Loaded reference audio: {len(ref_audio)} samples at {ref_sr}Hz")
            else:
                ref_audio = None
                logger.info("No reference audio provided, using default voice")
            
            # Try to use the actual IndexTTS models
            if self.bigvgan_generator is not None:
                logger.info("Using BigVGAN generator for audio synthesis...")
                
                # Create a more realistic mel-spectrogram based on text
                duration = max(1.0, len(text) * 0.08)  # Slightly faster speech
                sample_rate = 22050
                
                # Generate mel-spectrogram dimensions
                mel_frames = 80
                mel_length = int(duration * 100)  # 100 frames per second
                
                # Create a more realistic mel-spectrogram based on text characteristics
                # This simulates what the GPT model would generate
                mel_spectrogram = self._create_text_based_mel(text, mel_frames, mel_length)
                
                # Use BigVGAN to generate audio
                with torch.no_grad():
                    try:
                        # Check if bigvgan_generator is a dict (state dict) or model
                        if isinstance(self.bigvgan_generator, dict):
                            logger.warning("BigVGAN generator is a state dict, not a model. Using enhanced audio instead.")
                            return self._generate_enhanced_audio(text, reference_audio_path)
                        else:
                            generated_audio = self.bigvgan_generator(mel_spectrogram)
                            generated_audio = generated_audio.squeeze().cpu().numpy()
                        
                        # Normalize audio
                        generated_audio = generated_audio / (np.max(np.abs(generated_audio)) + 1e-8) * 0.6
                        
                        logger.info("✅ IndexTTS audio generation successful!")
                        return generated_audio, sample_rate
                        
                    except Exception as e:
                        logger.warning(f"BigVGAN generation failed: {str(e)}")
                        # Fall back to SpeechT5 instead of enhanced audio
                        return self._generate_with_fallback(text, reference_audio_path)
            else:
                logger.warning("BigVGAN generator not available, using SpeechT5")
                return self._generate_with_fallback(text, reference_audio_path)
            
        except Exception as e:
            logger.error(f"IndexTTS inference failed: {str(e)}")
            return self._generate_with_fallback(text, reference_audio_path)
    
    def _create_text_based_mel(self, text, mel_frames, mel_length):
        """Create a more realistic mel-spectrogram based on text characteristics"""
        # Create a base mel-spectrogram
        mel_spectrogram = torch.zeros(1, mel_frames, mel_length).to(self.device)
        
        # Add text-based characteristics
        words = text.split()
        for i, word in enumerate(words):
            word_start = int(i * mel_length / len(words))
            word_end = int((i + 1) * mel_length / len(words))
            
            # Create word-specific mel features
            for char in word:
                if char.isalpha():
                    # Map character to mel frequency
                    char_freq = (ord(char.lower()) - ord('a')) * 3 + 10
                    char_freq = min(char_freq, mel_frames - 1)
                    
                    # Add energy to the mel-spectrogram
                    char_start = word_start + int((ord(char.lower()) - ord('a')) * (word_end - word_start) / 26)
                    char_end = min(char_start + 5, word_end)
                    
                    if char_start < char_end:
                        mel_spectrogram[0, char_freq, char_start:char_end] = 1.0
                        
                        # Add harmonics
                        if char_freq * 2 < mel_frames:
                            mel_spectrogram[0, char_freq * 2, char_start:char_end] = 0.5
                        if char_freq * 3 < mel_frames:
                            mel_spectrogram[0, char_freq * 3, char_start:char_end] = 0.3
        
        # Add some noise and variation
        noise = torch.randn_like(mel_spectrogram) * 0.1
        mel_spectrogram = mel_spectrogram + noise
        
        return mel_spectrogram
    
    def _generate_enhanced_audio(self, text, reference_audio_path):
        """Generate enhanced audio that simulates IndexTTS output"""
        logger.info("Generating enhanced audio with IndexTTS characteristics...")
        
        # Create more realistic audio based on text characteristics
        duration = max(1.0, len(text) * 0.12)  # Slightly longer for more natural pace
        sample_rate = 22050
        samples = int(duration * sample_rate)
        
        # Generate base audio with text-based characteristics
        t = np.linspace(0, duration, samples)
        base_freq = 150  # Lower base frequency for more natural sound
        
        # Create formant-like structure
        audio_data = np.zeros(samples)
        
        # Process text word by word for more natural speech patterns
        words = text.split()
        if not words:
            words = [text]
        
        for word_idx, word in enumerate(words):
            word_start = int(word_idx * samples / len(words))
            word_end = int((word_idx + 1) * samples / len(words))
            word_t = t[word_start:word_end]
            
            # Generate word-specific audio
            word_audio = np.zeros(word_end - word_start)
            
            # Add pause between words (except first word)
            if word_idx > 0:
                pause_samples = int(0.05 * sample_rate)  # 50ms pause
                word_audio[:min(pause_samples, len(word_audio))] = 0
            
            for char_idx, char in enumerate(word):
                char_start = int(char_idx * (word_end - word_start) / len(word))
                char_end = int((char_idx + 1) * (word_end - word_start) / len(word))
                
                if char_start < char_end and char.isalpha():
                    char_t = word_t[char_start:char_end]
                    
                    # More natural frequency mapping
                    if char.lower() in 'aeiou':
                        # Vowel: sustained tone with realistic formants
                        char_freq = base_freq + (ord(char.lower()) - ord('a')) * 8
                        char_audio = 0.3 * np.sin(2 * np.pi * char_freq * char_t)
                        # Add realistic formants
                        char_audio += 0.1 * np.sin(2 * np.pi * char_freq * 2.2 * char_t)
                        char_audio += 0.05 * np.sin(2 * np.pi * char_freq * 3.5 * char_t)
                        char_audio += 0.02 * np.sin(2 * np.pi * char_freq * 5.0 * char_t)
                    else:
                        # Consonant: shorter, more complex sound
                        char_freq = base_freq + (ord(char.lower()) - ord('a')) * 6
                        char_audio = 0.2 * np.sin(2 * np.pi * char_freq * char_t)
                        char_audio += 0.08 * np.sin(2 * np.pi * char_freq * 2.8 * char_t)
                        # Add noise for fricatives
                        if char.lower() in 'fsh':
                            char_audio += 0.05 * np.random.normal(0, 0.05, len(char_t))
                        # Make consonants shorter
                        char_audio *= np.exp(-char_t * 2)
                    
                    word_audio[char_start:char_end] = char_audio
            
            # Add word-level prosody with more natural envelope
            word_envelope = np.exp(-word_t * 0.1) * (1 + 0.1 * np.sin(2 * np.pi * 0.8 * word_t))
            word_audio *= word_envelope
            
            audio_data[word_start:word_end] = word_audio
        
        # Add sentence-level prosody with more natural rhythm
        sentence_rhythm = np.sin(2 * np.pi * 0.5 * t) * 0.05
        audio_data += sentence_rhythm
        
        # Add natural speech characteristics
        # Subtle vibrato
        vibrato = 0.01 * np.sin(2 * np.pi * 4 * t)
        audio_data *= (1 + vibrato)
        
        # Add very subtle breath noise
        breath_noise = np.random.normal(0, 0.005, samples)
        audio_data += breath_noise
        
        # Add subtle reverb effect
        reverb = np.zeros_like(audio_data)
        for i in range(50, len(audio_data)):
            reverb[i] = audio_data[i-50] * 0.05
        audio_data += reverb
        
        # Normalize and apply final envelope
        envelope = np.exp(-t * 0.2) * (1 + 0.05 * np.sin(2 * np.pi * 1.2 * t))
        audio_data *= envelope
        audio_data = audio_data / (np.max(np.abs(audio_data)) + 1e-8) * 0.6
        
        return audio_data, sample_rate
    
    def _generate_with_gtts(self, text, reference_audio_path):
        """Generate speech using gTTS (Google Text-to-Speech)"""
        logger.info("Using gTTS for speech generation...")
        
        try:
            from gtts import gTTS
            import tempfile
            import os
            from pydub import AudioSegment
            
            # Generate speech using gTTS
            output_path = os.path.join(tempfile.gettempdir(), f"gtts_output_{os.urandom(8).hex()}.mp3")
            
            # Create gTTS object and save to file
            tts = gTTS(text=text, lang='en', slow=False)
            tts.save(output_path)
            
            # Convert MP3 to WAV and load
            audio = AudioSegment.from_mp3(output_path)
            audio_data = audio.get_array_of_samples()
            sample_rate = audio.frame_rate
            
            # Convert to numpy array
            if audio.channels == 2:
                audio_data = audio_data.reshape(-1, 2).mean(axis=1)
            
            audio_data = np.array(audio_data, dtype=np.float32) / 32768.0  # Normalize to [-1, 1]
            
            # Clean up temporary file
            if os.path.exists(output_path):
                os.remove(output_path)
            
            logger.info("✅ gTTS speech generated successfully!")
            return audio_data, sample_rate
            
        except Exception as e:
            logger.error(f"gTTS generation failed: {str(e)}")
            return self._generate_simple_audio(text)
    
    def _generate_with_fallback(self, text, reference_audio_path):
        """Generate speech using SpeechT5 fallback"""
        logger.info("Using SpeechT5 fallback for speech generation...")
        
        try:
            # Check if fallback models are available
            if not hasattr(self, 'fallback_processor') or not hasattr(self, 'fallback_model'):
                logger.error("❌ SpeechT5 fallback models not available!")
                raise Exception("SpeechT5 fallback models not available")
            
            # Process text inputs
            inputs = self.fallback_processor(text=text, return_tensors="pt")
            inputs = {k: v.to(self.device) for k, v in inputs.items()}
            
            # Generate speech using SpeechT5
            with torch.no_grad():
                if reference_audio_path is not None:
                    # Use reference audio for voice cloning
                    processed_audio = process_reference_audio(reference_audio_path)
                    if processed_audio is not None:
                        # Load and process reference audio
                        ref_audio, ref_sr = torchaudio.load(processed_audio)
                        ref_audio = ref_audio.mean(dim=0, keepdim=True)
                        
                        # Generate speech with reference audio
                        generated_audio = self.fallback_model.generate(
                            **inputs,
                            speaker_embeddings=ref_audio.to(self.device)
                        )
                        
                        # Clean up processed reference audio
                        if os.path.exists(processed_audio):
                            os.remove(processed_audio)
                    else:
                        # Generate without reference
                        generated_audio = self.fallback_model.generate(**inputs)
                else:
                    # Generate speech without reference audio
                    generated_audio = self.fallback_model.generate(**inputs)
            
            # Convert to numpy array
            if isinstance(generated_audio, torch.Tensor):
                audio_np = generated_audio.cpu().numpy()
            else:
                audio_np = generated_audio
            
            # Ensure it's the right shape
            if len(audio_np.shape) == 1:
                audio_np = audio_np.reshape(1, -1)
            
            return audio_np.flatten(), 16000  # SpeechT5 uses 16kHz
            
        except Exception as e:
            logger.error(f"SpeechT5 fallback failed: {str(e)}")
            # Try gTTS as fallback
            if hasattr(self, 'gtts_available') and self.gtts_available:
                logger.info("Trying gTTS as fallback...")
                return self._generate_with_gtts(text, reference_audio_path)
            else:
                logger.error("❌ No TTS libraries available!")
                raise Exception("No TTS libraries available. Please ensure SpeechT5, gTTS, or IndexTTS models are properly installed.")
    
    def _generate_simple_audio(self, text):
        """Generate simple audio as ultimate fallback"""
        logger.info("Using simple audio generation as fallback...")
        
        duration = max(1.0, len(text) * 0.12)
        sample_rate = 22050
        samples = int(duration * sample_rate)
        
        # Generate a more sophisticated audio than just sine waves
        t = np.linspace(0, duration, samples)
        base_freq = 120  # Lower frequency for more natural sound
        
        # Create audio with text-based characteristics
        audio_data = np.zeros(samples)
        
        # Process word by word for more natural rhythm
        words = text.split()
        if not words:
            words = [text]
        
        for word_idx, word in enumerate(words):
            word_start = int(word_idx * samples / len(words))
            word_end = int((word_idx + 1) * samples / len(words))
            word_t = t[word_start:word_end]
            
            word_audio = np.zeros(word_end - word_start)
            
            for char_idx, char in enumerate(word):
                char_start = int(char_idx * (word_end - word_start) / len(word))
                char_end = int((char_idx + 1) * (word_end - word_start) / len(word))
                
                if char_start < char_end and char.isalpha():
                    char_t = word_t[char_start:char_end]
                    
                    # More natural frequency mapping
                    if char.lower() in 'aeiou':
                        char_freq = base_freq + (ord(char.lower()) - ord('a')) * 6
                        char_audio = 0.3 * np.sin(2 * np.pi * char_freq * char_t)
                        char_audio += 0.1 * np.sin(2 * np.pi * char_freq * 2.1 * char_t)
                        char_audio += 0.05 * np.sin(2 * np.pi * char_freq * 3.2 * char_t)
                    else:
                        char_freq = base_freq + (ord(char.lower()) - ord('a')) * 4
                        char_audio = 0.2 * np.sin(2 * np.pi * char_freq * char_t)
                        char_audio += 0.08 * np.sin(2 * np.pi * char_freq * 2.5 * char_t)
                        # Make consonants shorter
                        char_audio *= np.exp(-char_t * 1.5)
                    
                    word_audio[char_start:char_end] = char_audio
            
            # Add word-level envelope
            word_envelope = np.exp(-word_t * 0.15) * (1 + 0.1 * np.sin(2 * np.pi * 0.6 * word_t))
            word_audio *= word_envelope
            
            audio_data[word_start:word_end] = word_audio
        
        # Add natural speech-like envelope
        envelope = np.exp(-t * 0.2) * (1 + 0.1 * np.sin(2 * np.pi * 1.0 * t))
        audio_data *= envelope
        
        # Add subtle noise for realism
        noise = np.random.normal(0, 0.01, samples)
        audio_data += noise
        
        # Normalize
        audio_data = audio_data / (np.max(np.abs(audio_data)) + 1e-8) * 0.5
        
        return audio_data, sample_rate

def load_voice_database():
    """Load voice database"""
    if os.path.exists(voice_database):
        with open(voice_database, 'r') as f:
            return json.load(f)
    return {"parent_voices": {}, "default_voices": {}}

def save_voice_database(db):
    """Save voice database"""
    with open(voice_database, 'w') as f:
        json.dump(db, f, indent=2)

def download_indextts_model():
    """Download IndexTTS-1.5 model from Hugging Face"""
    try:
        if os.path.exists(MODEL_DIR):
            logger.info("IndexTTS model already downloaded")
            return True
        
        logger.info("Downloading IndexTTS-1.5 model from Hugging Face...")
        snapshot_download(
            repo_id="IndexTeam/IndexTTS-1.5",
            local_dir=MODEL_DIR,
            local_dir_use_symlinks=False
        )
        logger.info("✅ IndexTTS model downloaded successfully!")
        return True
    except Exception as e:
        logger.error(f"❌ Failed to download model: {str(e)}")
        return False

def load_model():
    """Load IndexTTS-1.5 model for high-quality voice cloning"""
    global indextts_model, indextts_config
    
    if indextts_model is not None:
        return indextts_model
    
    try:
        logger.info("Loading IndexTTS-1.5 model for storytelling...")
        
        # Download model if not exists
        if not download_indextts_model():
            return "❌ Failed to download IndexTTS model"
        
        # Initialize IndexTTS
        config_path = os.path.join(MODEL_DIR, "config.yaml")
        indextts_model = IndexTTS(MODEL_DIR, config_path)
        
        # Load config and models
        if not indextts_model.load_config():
            return "❌ Failed to load IndexTTS config"
        
        if not indextts_model.load_models():
            return "❌ Failed to load IndexTTS models"
        
        logger.info("✅ IndexTTS-1.5 model loaded successfully!")
        return indextts_model
        
    except Exception as e:
        logger.error(f"❌ Failed to load model: {str(e)}")
        return f"❌ Failed to load model: {str(e)}"

def process_reference_audio(audio_path):
    """Process reference audio to the format expected by IndexTTS"""
    if audio_path is None:
        return None
        
    target_sr = 22050  # IndexTTS typically uses 22.05kHz
    
    try:
        y, sr = librosa.load(audio_path, sr=None, mono=True)
        if sr != target_sr:
            y = librosa.resample(y, orig_sr=sr, target_sr=target_sr)
        
        # Trim audio to optimal length (3-10 seconds for voice cloning)
        max_duration = 10.0
        duration = len(y) / target_sr
        if duration > max_duration:
            logger.info(f"Trimming audio from {duration:.2f}s to {max_duration}s")
            max_samples = int(max_duration * target_sr)
            y = y[:max_samples]
        
        random_hex = os.urandom(8).hex()
        processed_path = os.path.join(tempfile.gettempdir(), f"processed_ref_{random_hex}.wav")
        sf.write(processed_path, y, target_sr, format='WAV')
        return processed_path
    except Exception as e:
        logger.error(f"Error processing reference audio: {str(e)}")
        return None

def generate_speech(text, reference_audio, transcript, cloning_type):
    """Generate speech using IndexTTS-1.5 for storytelling-level quality"""
    global indextts_model
    
    if indextts_model is None:
        return None, "❌ Model not loaded. Please load the model first."
    
    if not text or not str(text).strip():
        return None, "❌ Please enter some text to synthesize."
    
    try:
        text = str(text) if text else ""
        text_preview = text[:50] if text else "None"
        logger.info(f"Generating storytelling-quality speech for: '{text_preview}...'")
        
        # Process reference audio if provided
        processed_ref_audio = None
        if reference_audio is not None:
            processed_ref_audio = process_reference_audio(reference_audio)
            if processed_ref_audio is None:
                return None, "❌ Failed to process reference audio."
        
        # Generate speech using IndexTTS
        logger.info("Generating speech with IndexTTS-1.5 (storytelling quality)...")
        
        audio_data, sample_rate = indextts_model.generate_speech(text, processed_ref_audio)
        
        # Clean up processed reference audio
        if processed_ref_audio and os.path.exists(processed_ref_audio):
            os.remove(processed_ref_audio)
        
        # Save to temporary file
        random_hex = os.urandom(8).hex()
        output_path = os.path.join(tempfile.gettempdir(), f"output_{random_hex}.wav")
        sf.write(output_path, audio_data, sample_rate, format='WAV')
        
        logger.info("✅ Storytelling-quality speech generated successfully!")
        return output_path, "✅ High-quality storytelling speech generated successfully!"
        
    except Exception as e:
        logger.error(f"Error generating speech: {str(e)}")
        return None, f"❌ Error generating speech: {str(e)}"

def save_parent_voice(parent_id, voice_file, parent_name):
    """Save parent's voice for future use"""
    try:
        if voice_file is None:
            return "❌ Please upload a voice file first."
        
        if not parent_id.strip():
            return "❌ Please enter a parent ID."
        
        # Create voices directory
        voices_dir.mkdir(exist_ok=True)
        
        # Save voice file
        voice_path = voices_dir / f"parent_{parent_id}.wav"
        shutil.copy2(voice_file, voice_path)
        
        # Update database
        db = load_voice_database()
        db["parent_voices"][f"parent_{parent_id}"] = {
            "parent_id": parent_id,
            "parent_name": parent_name,
            "file_path": str(voice_path),
            "type": "parent"
        }
        save_voice_database(db)
        
        return f"✅ Parent voice saved successfully! Voice ID: parent_{parent_id}"
        
    except Exception as e:
        return f"❌ Failed to save parent voice: {str(e)}"

def get_voice_options():
    """Get available voice options"""
    db = load_voice_database()
    
    options = []
    
    # Add default voices
    for voice_id, voice_info in db["default_voices"].items():
        options.append(f"Default: {voice_info['name']}")
    
    # Add parent voices
    for voice_id, voice_info in db["parent_voices"].items():
        parent_name = voice_info.get("parent_name", "Unknown")
        options.append(f"Parent: {parent_name} ({voice_info['parent_id']})")
    
    return options

def get_voice_options_string():
    """Get available voice options as a string for display"""
    options = get_voice_options()
    if not options:
        return "No voices saved yet"
    return "\n".join(options)

def generate_story_with_voice(story_text, voice_selection, custom_voice_file):
    """Generate story using selected voice with IndexTTS-1.5 storytelling quality"""
    global indextts_model
    
    if indextts_model is None:
        return None, "❌ Model not loaded. Please load the model first."
    
    if not story_text.strip():
        return None, "❌ Please enter story text."
    
    try:
        # Determine which voice to use
        if voice_selection and isinstance(voice_selection, str) and voice_selection.startswith("Parent:"):
            # Use saved parent voice
            parent_id = voice_selection.split("(")[-1].rstrip(")")
            db = load_voice_database()
            voice_path = db["parent_voices"].get(f"parent_{parent_id}", {}).get("file_path")
            
            if not voice_path or not os.path.exists(voice_path):
                return None, f"❌ Parent voice not found: {parent_id}"
            
            reference_audio = voice_path
            
        elif custom_voice_file:
            # Use uploaded custom voice
            reference_audio = custom_voice_file
            
        else:
            # Use default voice (no reference audio)
            reference_audio = None
        
        # Process reference audio if provided
        processed_ref_audio = None
        if reference_audio is not None:
            processed_ref_audio = process_reference_audio(reference_audio)
            if processed_ref_audio is None:
                return None, "❌ Failed to process reference audio."
        
        # Generate speech using IndexTTS
        audio_data, sample_rate = indextts_model.generate_speech(story_text, processed_ref_audio)
        
        # Clean up processed reference audio
        if processed_ref_audio and os.path.exists(processed_ref_audio):
            os.remove(processed_ref_audio)
        
        # Save to temporary file
        random_hex = os.urandom(8).hex()
        output_path = os.path.join(tempfile.gettempdir(), f"story_{random_hex}.wav")
        sf.write(output_path, audio_data, sample_rate, format='WAV')
        
        return output_path, "✅ High-quality story generated successfully!"
        
    except Exception as e:
        logger.error(f"Error generating story: {str(e)}")
        return None, f"❌ Error generating story: {str(e)}"

# Create Gradio interface
def create_interface():
    with gr.Blocks(title="High-Quality Voice Cloning for Storytelling", theme=gr.themes.Soft()) as demo:
        gr.Markdown("""
        # 🎤 High-Quality Voice Cloning for Storytelling
        
        Create personalized stories for kids with **professional storytelling-level voice quality** using IndexTTS-1.5! Parents can save their voice and use it to read stories with natural narration and voice cloning.
        
        **✨ Features:**
        - **IndexTTS-1.5**: High-quality neural text-to-speech
        - **Voice Cloning**: Clone any voice from reference audio
        - **Parent Voice Management**: Save and reuse parent voices
        - **Story Generation**: Create personalized stories with custom voices
        - **Professional Quality**: Uses IndexTTS-1.5 for realistic speech
        """)
        
        with gr.Tab("🎯 Voice Cloning"):
            with gr.Row():
                with gr.Column():
                    gr.Markdown("### Upload Reference Audio")
                    reference_audio = gr.Audio(
                        label="Reference Audio (3-10 seconds) - Upload a clear audio file of the voice you want to clone",
                        type="filepath"
                    )
                    
                    cloning_type = gr.Radio(
                        choices=["Shallow Clone", "Deep Clone"],
                        value="Shallow Clone",
                        label="Cloning Type - Deep clone requires transcript but gives better quality"
                    )
                    
                    transcript = gr.Textbox(
                        label="Reference Audio Transcript (for Deep Clone) - Only needed for deep cloning",
                        placeholder="Enter the transcript of the reference audio..."
                    )
                    
                    text_input = gr.Textbox(
                        label="Text to Synthesize",
                        placeholder="Enter the text you want to convert to speech...",
                        lines=3
                    )
                    
                    generate_btn = gr.Button("🎤 Generate Speech", variant="primary")
                
                with gr.Column():
                    gr.Markdown("### Generated Speech")
                    output_audio = gr.Audio(label="Generated Speech", type="filepath")
                    status_text = gr.Textbox(label="Status", interactive=False)
        
        with gr.Tab("👨‍👩‍👧‍👦 Parent Voice Management"):
            with gr.Row():
                with gr.Column():
                    gr.Markdown("### Save Parent's Voice")
                    parent_id = gr.Textbox(
                        label="Parent ID",
                        placeholder="Enter a unique ID for this parent (e.g., 'mom_123')"
                    )
                    parent_name = gr.Textbox(
                        label="Parent Name (Optional)",
                        placeholder="Enter parent's name (e.g., 'Mom', 'Dad')"
                    )
                    parent_voice_file = gr.Audio(
                        label="Parent Voice Sample (3-10 seconds)",
                        type="filepath"
                    )
                    save_parent_btn = gr.Button("💾 Save Parent Voice", variant="primary")
                    
                with gr.Column():
                    gr.Markdown("### Voice Management")
                    parent_status = gr.Textbox(label="Save Status", interactive=False)
                    voice_list = gr.Textbox(
                        label="Saved Voices",
                        value="No voices saved yet",
                        interactive=False,
                        lines=5
                    )
                    refresh_btn = gr.Button("🔄 Refresh Voice List")
        
        with gr.Tab("📚 Story Generation"):
            with gr.Row():
                with gr.Column():
                    gr.Markdown("### Generate Story with Voice")
                    story_text = gr.Textbox(
                        label="Story Text",
                        placeholder="Enter your story text here...",
                        lines=5
                    )
                    
                    voice_selection = gr.Dropdown(
                        choices=[],
                        label="Choose Voice - Select a saved voice or upload a custom one",
                        allow_custom_value=True
                    )
                    
                    custom_voice_file = gr.Audio(
                        label="Custom Voice (Optional) - Upload a custom voice if not using saved voices",
                        type="filepath"
                    )
                    
                    generate_story_btn = gr.Button("📖 Generate Story", variant="primary")
                    
                with gr.Column():
                    gr.Markdown("### Generated Story")
                    story_audio = gr.Audio(label="Story Audio", type="filepath")
                    story_status = gr.Textbox(label="Status", interactive=False)
        
        with gr.Tab("⚙️ Model Control"):
            gr.Markdown("### Model Management")
            load_btn = gr.Button("🔄 Load Model", variant="secondary")
            model_status = gr.Textbox(label="Model Status", interactive=False)
            
            gr.Markdown("""
            ### How to Use:
            1. **Load Model**: Click to load IndexTTS-1.5 model (with fallbacks)
            2. **Save Parent Voice**: Upload parent's voice and save it
            3. **Generate Story**: Choose a voice and generate story audio
            4. **Voice Cloning**: Use any voice sample for cloning
            
            ### Speech Generation:
            - **IndexTTS-1.5**: High-quality neural text-to-speech with voice cloning
            - **Professional Quality**: Uses actual neural networks for realistic speech
            - **Voice Cloning**: Clone any voice from reference audio
            
            ### Tips:
            - Use clear, high-quality audio (3-10 seconds)
            - Avoid background noise
            - For best results, use the same voice consistently
            - The app will automatically use the best available method
            """)
        
        # Event handlers
        load_btn.click(fn=load_model, outputs=model_status)
        
        generate_btn.click(
            fn=generate_speech,
            inputs=[text_input, reference_audio, transcript, cloning_type],
            outputs=[output_audio, status_text]
        )
        
        save_parent_btn.click(
            fn=save_parent_voice,
            inputs=[parent_id, parent_voice_file, parent_name],
            outputs=parent_status
        )
        
        refresh_btn.click(
            fn=get_voice_options_string,
            outputs=voice_list
        )
        
        generate_story_btn.click(
            fn=generate_story_with_voice,
            inputs=[story_text, voice_selection, custom_voice_file],
            outputs=[story_audio, story_status]
        )
        
        # Auto-load model on startup
        demo.load(fn=load_model, outputs=model_status)
        demo.load(fn=get_voice_options_string, outputs=voice_list)
        demo.load(fn=get_voice_options, outputs=voice_selection)
    
    return demo

# Create and launch the interface
if __name__ == "__main__":
    demo = create_interface()
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False
    )