update

Files changed:

- TTS/tts/datasets/formatters.py +1 -0
- TTS/tts/layers/xtts/gpt.py +57 -0
- TTS/tts/layers/xtts/tokenizer.py +3 -2
- TTS/tts/layers/xtts/trainer/dataset.py +36 -24
- TTS/tts/layers/xtts/trainer/dvae_dataset.py +132 -0
- TTS/tts/layers/xtts/trainer/gpt_trainer.py +36 -1
- TTS/tts/models/xtts.py +3 -2
- app.py +50 -23
- local_model/__pycache__/inference.cpython-310.pyc +0 -0
- local_model/inference.py +198 -0
- requirements.txt +66 -17
TTS/tts/datasets/formatters.py
CHANGED

@@ -80,6 +80,7 @@ def coqui(root_path, meta_file, ignored_speakers=None):
             {
                 "text": row.text,
                 "audio_file": audio_path,
+                "ref_file": "null" if "ref_file" not in metadata.columns else os.path.join(root_path, row.ref_file),
                 "speaker_name": speaker_name if speaker_name is not None else row.speaker_name,
                 "emotion_name": emotion_name if emotion_name is not None else row.emotion_name,
                 "root_path": root_path,
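The new optional ref_file column lets each row of a coqui-format metadata file point at a separate reference clip for conditioning. A minimal sketch of the layout this assumes (the "|" separator and the audio_file/text columns follow the upstream coqui formatter; the paths and speaker name are invented):

```python
import io
import pandas as pd

# Hypothetical coqui-format metadata with the new optional ref_file column.
csv_text = """audio_file|text|speaker_name|ref_file
wavs/utt_001.wav|Màngi tuddu Aadama.|anta|refs/anta_sample.wav
wavs/utt_002.wav|Salaam aleekum.|anta|refs/anta_sample.wav
"""
metadata = pd.read_csv(io.StringIO(csv_text), sep="|")
print("ref_file" in metadata.columns)  # True -> the formatter emits a per-row ref_file path
```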
TTS/tts/layers/xtts/gpt.py
CHANGED

@@ -184,6 +184,63 @@ class GPT(nn.Module):
         # XTTS v1
         self.prompt_embedding = nn.Embedding(self.num_audio_tokens, model_dim)
         self.prompt_pos_embedding = LearnedPositionEmbeddings(24 * 9, model_dim)
+
+    def resize_text_embeddings(self, new_num_tokens: int):
+
+        old_embeddings_requires_grad = self.text_embedding.weight.requires_grad
+
+        old_num_tokens, old_embedding_dim = self.text_embedding.weight.size()
+        if old_num_tokens == new_num_tokens:
+            return
+
+        new_embeddings = nn.Embedding(
+            new_num_tokens,
+            old_embedding_dim,
+            device=self.text_embedding.weight.device,
+            dtype=self.text_embedding.weight.dtype,
+        )
+
+        # numbers of tokens to copy
+        n = min(old_num_tokens, new_num_tokens)
+
+        new_embeddings.weight.data[:n, :] = self.text_embedding.weight.data[:n, :]
+
+        self.text_embedding.weight.data = new_embeddings.weight.data
+        self.text_embedding.num_embeddings = new_embeddings.weight.data.shape[0]
+        if self.text_embedding.padding_idx is not None and (new_num_tokens - 1) < self.text_embedding.padding_idx:
+            self.text_embedding.padding_idx = None
+
+        self.text_embedding.requires_grad_(old_embeddings_requires_grad)
+
+    def resize_text_head(self, new_num_tokens: int):
+        old_lm_head_requires_grad = self.text_head.weight.requires_grad
+
+        old_num_tokens, old_lm_head_dim = self.text_head.weight.size()
+
+        new_lm_head_shape = (old_lm_head_dim, new_num_tokens)
+        has_new_lm_head_bias = self.text_head.bias is not None
+
+        new_lm_head = nn.Linear(
+            *new_lm_head_shape,
+            bias=has_new_lm_head_bias,
+            device=self.text_head.weight.device,
+            dtype=self.text_head.weight.dtype,
+        )
+
+        num_tokens_to_copy = min(old_num_tokens, new_num_tokens)
+
+        new_lm_head.weight.data[:num_tokens_to_copy, :] = self.text_head.weight.data[:num_tokens_to_copy, :]
+
+        # Copy bias weights to new lm head
+        if has_new_lm_head_bias:
+            new_lm_head.bias.data[:num_tokens_to_copy] = self.text_head.bias.data[:num_tokens_to_copy]
+
+        self.text_head = new_lm_head
+
+        self.text_head.requires_grad_(old_lm_head_requires_grad)
+        pass
+
 
     def get_grad_norm_parameter_groups(self):
         return {
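Both helpers copy the rows shared between the old and new vocabulary size and leave any extra rows at their fresh initialisation. A self-contained sketch of the same copy-and-pad idea on standalone layers (the sizes are illustrative, not values taken from this Space):

```python
from torch import nn

# Illustrative sizes only: grow a 6681-token embedding/head to 6700 tokens,
# keeping the learned rows and leaving the new ones at their fresh init,
# mirroring what resize_text_embeddings()/resize_text_head() do inside GPT.
old_emb, new_emb = nn.Embedding(6681, 1024), nn.Embedding(6700, 1024)
n = min(old_emb.num_embeddings, new_emb.num_embeddings)
new_emb.weight.data[:n, :] = old_emb.weight.data[:n, :]

old_head, new_head = nn.Linear(1024, 6681), nn.Linear(1024, 6700)
new_head.weight.data[:n, :] = old_head.weight.data[:n, :]
new_head.bias.data[:n] = old_head.bias.data[:n]
print(new_emb.weight.shape, new_head.weight.shape)  # [6700, 1024] and [6700, 1024]
```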
TTS/tts/layers/xtts/tokenizer.py
CHANGED

@@ -621,7 +621,7 @@ class VoiceBpeTokenizer:
 
     def check_input_length(self, txt, lang):
         lang = lang.split("-")[0]  # remove the region
-        limit = self.char_limits.get(lang, 250)
+        limit = self.char_limits.get(lang, 300)
         if len(txt) > limit:
             print(
                 f"[!] Warning: The text length exceeds the character limit of {limit} for language '{lang}', this might cause truncated audio."

@@ -640,7 +640,8 @@ class VoiceBpeTokenizer:
             # @manmay will implement this
             txt = basic_cleaners(txt)
         else:
-            raise NotImplementedError(f"Language '{lang}' is not supported.")
+            txt = basic_cleaners(txt)
+            # print(f"[!] Warning: Preprocess [Language '{lang}'] text is not implemented, use `basic_cleaners` instead.")
         return txt
 
     def encode(self, txt, lang):
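With this fallback, language codes missing from char_limits (such as Wolof, "wo") no longer raise NotImplementedError: the text goes through basic_cleaners and is checked against a 300-character limit. A quick sketch, assuming a local checkout of this fork and an XTTS v2 vocab file (the path is illustrative):

```python
from TTS.tts.layers.xtts.tokenizer import VoiceBpeTokenizer

# vocab.json path is illustrative; any XTTS v2 vocab file works.
tok = VoiceBpeTokenizer(vocab_file="XTTS_v2.0_original_model_files/vocab.json")
ids = tok.encode("Màngi tuddu Aadama.", lang="wo")  # previously raised for unsupported languages
print(len(ids))
```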
TTS/tts/layers/xtts/trainer/dataset.py
CHANGED

@@ -23,29 +23,41 @@ def key_samples_by_col(samples, col):
     return samples_by_col
 
 
-def get_prompt_slice(gt_path, max_sample_length, min_sample_length, sample_rate, is_eval=False):
-    rel_clip = load_audio(gt_path, sample_rate)
-    # if eval uses a middle size sample when it is possible to be more reproducible
-    if is_eval:
-        sample_length = int((min_sample_length + max_sample_length) / 2)
-    else:
-        sample_length = random.randint(min_sample_length, max_sample_length)
-    gap = rel_clip.shape[-1] - sample_length
-    if gap < 0:
-        sample_length = rel_clip.shape[-1] // 2
-        gap = rel_clip.shape[-1] - sample_length
-
-    # if eval start always from the position 0 to be more reproducible
-    if is_eval:
-        rand_start = 0
-    else:
-        rand_start = random.randint(0, gap)
-
-    rand_end = rand_start + sample_length
-    rel_clip = rel_clip[:, rand_start:rand_end]
-    rel_clip = F.pad(rel_clip, pad=(0, max_sample_length - rel_clip.shape[-1]))
-    cond_idxs = [rand_start, rand_end]
-    return rel_clip, rel_clip.shape[-1], cond_idxs
+def get_prompt_slice(gt_path, max_sample_length, min_sample_length, sample_rate, is_eval=False, ref_path="null"):
+    if ref_path == "null":
+        rel_clip = load_audio(gt_path, sample_rate)
+        # if eval uses a middle size sample when it is possible to be more reproducible
+        if is_eval:
+            sample_length = int((min_sample_length + max_sample_length) / 2)
+        else:
+            sample_length = random.randint(min_sample_length, max_sample_length)
+        gap = rel_clip.shape[-1] - sample_length
+        if gap < 0:
+            sample_length = rel_clip.shape[-1] // 2
+            gap = rel_clip.shape[-1] - sample_length
+
+        # if eval start always from the position 0 to be more reproducible
+        if is_eval:
+            rand_start = 0
+        else:
+            rand_start = random.randint(0, gap)
+
+        rand_end = rand_start + sample_length
+        rel_clip = rel_clip[:, rand_start:rand_end]
+        rel_clip = F.pad(rel_clip, pad=(0, max_sample_length - rel_clip.shape[-1]))
+        cond_idxs = [rand_start, rand_end]
+        return rel_clip, rel_clip.shape[-1], cond_idxs
+    else:
+        rel_clip = load_audio(ref_path, sample_rate)
+
+        sample_length = min(max_sample_length, rel_clip.shape[-1])
+
+        rel_clip = rel_clip[:, :sample_length]
+        rel_clip = F.pad(rel_clip, pad=(0, max_sample_length - rel_clip.shape[-1]))
+        cond_idxs = [0, sample_length]
+        return rel_clip, rel_clip.shape[-1], cond_idxs
+
 
 
 class XTTSDataset(torch.utils.data.Dataset):

@@ -110,14 +122,14 @@ class XTTSDataset(torch.utils.data.Dataset):
         wav = load_audio(audiopath, self.sample_rate)
         if text is None or len(text.strip()) == 0:
             raise ValueError
-        if wav is None or wav.shape[-1] < (0.5 * self.sample_rate):
+        if wav is None or wav.shape[-1] < (0.2 * self.sample_rate):
             # Ultra short clips are also useless (and can cause problems within some models).
             raise ValueError
 
         if self.use_masking_gt_prompt_approach:
             # get a slice from GT to condition the model
             cond, _, cond_idxs = get_prompt_slice(
-                audiopath, self.max_conditioning_length, self.min_conditioning_length, self.sample_rate, self.is_eval
+                audiopath, self.max_conditioning_length, self.min_conditioning_length, self.sample_rate, self.is_eval, sample["ref_file"]
             )
             # if use masking do not use cond_len
             cond_len = torch.nan

@@ -128,7 +140,7 @@ class XTTSDataset(torch.utils.data.Dataset):
                 else audiopath
             )
             cond, cond_len, _ = get_prompt_slice(
-                ref_sample, self.max_conditioning_length, self.min_conditioning_length, self.sample_rate, self.is_eval
+                ref_sample, self.max_conditioning_length, self.min_conditioning_length, self.sample_rate, self.is_eval, sample["ref_file"]
             )
             # if do not use masking use cond_len
             cond_idxs = torch.nan
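get_prompt_slice now takes a ref_path argument: any value other than "null" makes the conditioning prompt come from that reference file rather than from a random slice of the ground-truth clip. A usage sketch with invented paths (lengths are in samples at an assumed 22050 Hz):

```python
from TTS.tts.layers.xtts.trainer.dataset import get_prompt_slice

# Paths and lengths are illustrative.
cond, cond_len, cond_idxs = get_prompt_slice(
    gt_path="wavs/utt_001.wav",
    max_sample_length=6 * 22050,
    min_sample_length=3 * 22050,
    sample_rate=22050,
    is_eval=True,
    ref_path="refs/anta_sample.wav",  # "null" keeps the original slice-from-GT behaviour
)
print(cond.shape, cond_idxs)
```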
TTS/tts/layers/xtts/trainer/dvae_dataset.py
ADDED

@@ -0,0 +1,132 @@
+import torch
+import random
+from TTS.tts.models.xtts import load_audio
+
+torch.set_num_threads(1)
+
+def key_samples_by_col(samples, col):
+    """Returns a dictionary of samples keyed by language."""
+    samples_by_col = {}
+    for sample in samples:
+        col_val = sample[col]
+        assert isinstance(col_val, str)
+        if col_val not in samples_by_col:
+            samples_by_col[col_val] = []
+        samples_by_col[col_val].append(sample)
+    return samples_by_col
+
+class DVAEDataset(torch.utils.data.Dataset):
+    def __init__(self, samples, sample_rate, is_eval, max_wav_len=255995):
+        self.sample_rate = sample_rate
+        self.is_eval = is_eval
+        self.max_wav_len = max_wav_len
+        self.samples = samples
+        self.training_seed = 1
+        self.failed_samples = set()
+        if not is_eval:
+            random.seed(self.training_seed)
+            # random.shuffle(self.samples)
+            random.shuffle(self.samples)
+            # order by language
+            self.samples = key_samples_by_col(self.samples, "language")
+            print(" > Sampling by language:", self.samples.keys())
+        else:
+            # for evaluation load and check samples that are corrupted to ensures the reproducibility
+            self.check_eval_samples()
+
+    def check_eval_samples(self):
+        print(" > Filtering invalid eval samples!!")
+        new_samples = []
+        for sample in self.samples:
+            try:
+                _, wav = self.load_item(sample)
+            except:
+                continue
+            # Basically, this audio file is nonexistent or too long to be supported by the dataset.
+            if (
+                wav is None
+                or (self.max_wav_len is not None and wav.shape[-1] > self.max_wav_len)
+            ):
+                continue
+            new_samples.append(sample)
+        self.samples = new_samples
+        print(" > Total eval samples after filtering:", len(self.samples))
+
+    def load_item(self, sample):
+        audiopath = sample["audio_file"]
+        wav = load_audio(audiopath, self.sample_rate)
+        if wav is None or wav.shape[-1] < (0.5 * self.sample_rate):
+            # Ultra short clips are also useless (and can cause problems within some models).
+            raise ValueError
+
+        return audiopath, wav
+
+    def __getitem__(self, index):
+        if self.is_eval:
+            sample = self.samples[index]
+            sample_id = str(index)
+        else:
+            # select a random language
+            lang = random.choice(list(self.samples.keys()))
+            # select random sample
+            index = random.randint(0, len(self.samples[lang]) - 1)
+            sample = self.samples[lang][index]
+            # a unique id for each sampel to deal with fails
+            sample_id = lang + "_" + str(index)
+
+        # ignore samples that we already know that is not valid ones
+        if sample_id in self.failed_samples:
+            # call get item again to get other sample
+            return self[1]
+
+        # try to load the sample, if fails added it to the failed samples list
+        try:
+            audiopath, wav = self.load_item(sample)
+        except:
+            self.failed_samples.add(sample_id)
+            return self[1]
+
+        # check if the audio and text size limits and if it out of the limits, added it failed_samples
+        if (
+            wav is None
+            or (self.max_wav_len is not None and wav.shape[-1] > self.max_wav_len)
+        ):
+            # Basically, this audio file is nonexistent or too long to be supported by the dataset.
+            # It's hard to handle this situation properly. Best bet is to return the a random valid token and skew the dataset somewhat as a result.
+            self.failed_samples.add(sample_id)
+            return self[1]
+
+        res = {
+            "wav": wav,
+            "wav_lengths": torch.tensor(wav.shape[-1], dtype=torch.long),
+            "filenames": audiopath,
+        }
+        return res
+
+    def __len__(self):
+        if self.is_eval:
+            return len(self.samples)
+        return sum([len(v) for v in self.samples.values()])
+
+    def collate_fn(self, batch):
+        # convert list of dicts to dict of lists
+        B = len(batch)
+
+        batch = {k: [dic[k] for dic in batch] for k in batch[0]}
+
+        # stack for features that already have the same shape
+        batch["wav_lengths"] = torch.stack(batch["wav_lengths"])
+
+        max_wav_len = batch["wav_lengths"].max()
+
+        # create padding tensors
+        wav_padded = torch.FloatTensor(B, 1, max_wav_len)
+
+        # initialize tensors for zero padding
+        wav_padded = wav_padded.zero_()
+        for i in range(B):
+            wav = batch["wav"][i]
+            wav_padded[i, :, : batch["wav_lengths"][i]] = torch.FloatTensor(wav)
+
+        batch["wav"] = wav_padded
+        return batch
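A minimal sketch of wiring DVAEDataset into a DataLoader with its own collate_fn; the sample dicts, paths and the 22050 Hz rate are assumptions for illustration (in a training script the samples would typically come from load_tts_samples):

```python
from torch.utils.data import DataLoader
from TTS.tts.layers.xtts.trainer.dvae_dataset import DVAEDataset

# Illustrative sample dicts; real entries carry the same keys.
train_samples = [
    {"audio_file": "wavs/utt_001.wav", "language": "wo"},
    {"audio_file": "wavs/utt_002.wav", "language": "wo"},
]
train_ds = DVAEDataset(train_samples, sample_rate=22050, is_eval=False)
loader = DataLoader(train_ds, batch_size=2, collate_fn=train_ds.collate_fn)
for batch in loader:
    print(batch["wav"].shape, batch["wav_lengths"])  # [B, 1, max_wav_len], [B]
    break
```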
TTS/tts/layers/xtts/trainer/gpt_trainer.py
CHANGED

@@ -97,7 +97,8 @@ class GPTTrainer(BaseTTS):
                 states_keys = list(gpt_checkpoint.keys())
                 for key in states_keys:
                     if "gpt." in key:
-                        new_key = key.replace("gpt.", "")
+                        # new_key = key.replace("gpt.", "")
+                        new_key = key[4:]
                         gpt_checkpoint[new_key] = gpt_checkpoint[key]
                         del gpt_checkpoint[key]
                     else:

@@ -484,6 +485,40 @@ class GPTTrainer(BaseTTS):
 
         state = self.xtts.get_compatible_checkpoint_state_dict(checkpoint_path)
 
+        # edit checkpoint if the number of tokens is changed to ensures the better transfer learning possible
+        if (
+            "gpt.text_embedding.weight" in state
+            and state["gpt.text_embedding.weight"].shape != self.xtts.gpt.text_embedding.weight.shape
+        ):
+            num_new_tokens = (
+                self.xtts.gpt.text_embedding.weight.shape[0] - state["gpt.text_embedding.weight"].shape[0]
+            )
+            print(f" > Loading checkpoint with {num_new_tokens} additional tokens.")
+
+            # add new tokens to a linear layer (text_head)
+            emb_g = state["gpt.text_embedding.weight"]
+            new_row = torch.randn(num_new_tokens, emb_g.shape[1])
+            start_token_row = emb_g[-1, :]
+            emb_g = torch.cat([emb_g, new_row], axis=0)
+            emb_g[-1, :] = start_token_row
+            state["gpt.text_embedding.weight"] = emb_g
+
+            # add new weights to the linear layer (text_head)
+            text_head_weight = state["gpt.text_head.weight"]
+            start_token_row = text_head_weight[-1, :]
+            new_entry = torch.randn(num_new_tokens, self.xtts.gpt.text_head.weight.shape[1])
+            text_head_weight = torch.cat([text_head_weight, new_entry], axis=0)
+            text_head_weight[-1, :] = start_token_row
+            state["gpt.text_head.weight"] = text_head_weight
+
+            # add new biases to the linear layer (text_head)
+            text_head_bias = state["gpt.text_head.bias"]
+            start_token_row = text_head_bias[-1]
+            new_bias_entry = torch.zeros(num_new_tokens)
+            text_head_bias = torch.cat([text_head_bias, new_bias_entry], axis=0)
+            text_head_bias[-1] = start_token_row
+            state["gpt.text_head.bias"] = text_head_bias
+
         # load the model weights
         self.xtts.load_state_dict(state, strict=strict)
 
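The switch from key.replace("gpt.", "") to key[4:] presumably guards against keys containing "gpt." more than once: replace() rewrites every occurrence, while slicing strips only the leading prefix. A quick illustration:

```python
# str.replace strips every occurrence, which corrupts nested parameter names;
# slicing off the first four characters removes only the leading "gpt." prefix.
key = "gpt.gpt.h.0.attn.c_attn.weight"   # example key with a nested "gpt." module
print(key.replace("gpt.", ""))  # h.0.attn.c_attn.weight
print(key[4:])                  # gpt.h.0.attn.c_attn.weight
```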
TTS/tts/models/xtts.py
CHANGED

@@ -523,7 +523,7 @@ class Xtts(BaseTTS):
         gpt_cond_latent = gpt_cond_latent.to(self.device)
         speaker_embedding = speaker_embedding.to(self.device)
         if enable_text_splitting:
-            text = split_sentence(text, language, self.tokenizer.char_limits[language])
+            text = split_sentence(text, language, self.tokenizer.char_limits.get(language, 250))
         else:
             text = [text]
 

@@ -553,6 +553,7 @@ class Xtts(BaseTTS):
                 output_attentions=False,
                 **hf_generate_kwargs,
             )
+
             expected_output_len = torch.tensor(
                 [gpt_codes.shape[-1] * self.gpt.code_stride_len], device=text_tokens.device
             )

@@ -633,7 +634,7 @@ class Xtts(BaseTTS):
         gpt_cond_latent = gpt_cond_latent.to(self.device)
         speaker_embedding = speaker_embedding.to(self.device)
         if enable_text_splitting:
-            text = split_sentence(text, language, self.tokenizer.char_limits[language])
+            text = split_sentence(text, language, self.tokenizer.char_limits.get(language, 250))
         else:
             text = [text]
 
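Both call sites now fall back to a 250-character limit when the language is missing from char_limits, which is what makes enable_text_splitting usable for Wolof. A tiny sketch of the difference (the dict excerpt is illustrative, not the full XTTS table):

```python
char_limits = {"en": 250, "fr": 273}    # illustrative excerpt of the tokenizer's table
language = "wo"
# old behaviour: char_limits[language] -> KeyError for codes not in the table
limit = char_limits.get(language, 250)  # new behaviour: default to 250
print(limit)
```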
app.py
CHANGED

@@ -6,45 +6,72 @@ import sys
 import soundfile as sf
 import numpy as np
 import logging
+import tempfile
 
 # Logger configuration
 logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')
 
 # Local download path for the files (make sure the directory exists)
-LOCAL_DOWNLOAD_PATH = "
-
+LOCAL_DOWNLOAD_PATH = os.path.dirname("/content")  # Use the script's path
 # Download the inference script
 repo_id = "dofbi/galsenai-xtts-v2-wolof-inference"
-inference_file = hf_hub_download(repo_id=repo_id, filename="inference.py",local_dir=LOCAL_DOWNLOAD_PATH)
+inference_file = hf_hub_download(repo_id=repo_id, filename="inference.py", local_dir=LOCAL_DOWNLOAD_PATH)
 
 # Add the directory to the search path
 sys.path.insert(0, LOCAL_DOWNLOAD_PATH)
 
-# Importer la
-from inference import
-    ...
+# Import the class from the downloaded inference script
+from inference import WolofXTTSInference
+
+# Initialise the model only once
+tts_model = WolofXTTSInference()
+
+def tts(text: str, audio_reference: tuple[int, np.ndarray]) -> tuple[int, np.ndarray] | str:
+    """
+    Synthesise speech from a text, using a reference audio clip.
+
+    Args:
+        text (str): The text to synthesise.
+        audio_reference (tuple[int, np.ndarray]): A tuple with the sample rate and the reference audio data.
+
+    Returns:
+        tuple[int, np.ndarray] | str: a tuple with the sample rate and the synthesised audio data, or an error message.
+    """
     logging.debug(f"tts function called with text: {text} and audio_reference: {audio_reference}")
-    ...
-        temp_audio_path = "temp_audio_ref.wav"
-        sf.write(temp_audio_path, audio_reference, 44100)
-        logging.debug(f"Audio reference saved to {temp_audio_path}")
-        audio_output, sample_rate = generate_audio(text, temp_audio_path, LOCAL_DOWNLOAD_PATH)
-        logging.debug(f"Audio generated with sample rate: {sample_rate}")
-        return (sample_rate, audio_output)
-    else:
+
+    if not text or audio_reference is None:
         logging.debug("Text or audio reference is missing")
         return "Veuillez entrer un texte et fournir un audio de référence."
 
-    ...
+    try:
+        sample_rate, audio_data = audio_reference
+
+        # Create a temporary file for the reference audio
+        with tempfile.NamedTemporaryFile(suffix=".wav", delete=True) as temp_audio_file:
+            sf.write(temp_audio_file.name, audio_data, sample_rate)
+            logging.debug(f"Audio reference saved to {temp_audio_file.name}")
+
+            # Use the generate_audio method of the new class
+            audio_output, output_sample_rate = tts_model.generate_audio(
+                text,
+                reference_audio=temp_audio_file.name
+            )
+
+            logging.debug(f"Audio generated with sample rate: {output_sample_rate}")
+            return (output_sample_rate, audio_output)
+
+    except Exception as e:
+        logging.error(f"Error during audio generation: {e}")
+        return f"Une erreur s'est produite lors de la génération audio: {e}"
 
 if __name__ == "__main__":
+    demo = gr.Interface(
+        fn=tts,
+        inputs=[
+            gr.Textbox(label="Text to synthesize"),
+            gr.Audio(type="numpy", label="Reference audio")
+        ],
+        outputs=gr.Audio(label="Synthesized audio"),
+    )
+
     demo.launch()
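A hypothetical smoke test of the new tts() handler without launching the Gradio UI; the one-second silent reference and the 22050 Hz rate are made-up inputs, not values from this Space, and running it still triggers the model download:

```python
import numpy as np

sr = 22050
silence = np.zeros(sr, dtype=np.float32)   # 1 s of silence as a stand-in reference clip
out = tts("Màngi tuddu Aadama.", (sr, silence))
if isinstance(out, tuple):
    out_sr, wav = out
    print(out_sr, np.asarray(wav).shape)
else:
    print("error message:", out)
```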
local_model/__pycache__/inference.cpython-310.pyc
ADDED

Binary file (5.07 kB).
local_model/inference.py
ADDED

@@ -0,0 +1,198 @@
+import torch
+import os
+import logging
+import soundfile as sf
+import numpy as np
+from huggingface_hub import hf_hub_download
+from TTS.tts.configs.xtts_config import XttsConfig
+from TTS.tts.models.xtts import Xtts
+
+# --- CONSTANTS ---
+REPO_ID = "dofbi/galsenai-xtts-v2-wolof-inference"
+LOCAL_DIR = "./models"
+
+class WolofXTTSInference:
+    def __init__(self, repo_id=REPO_ID, local_dir=LOCAL_DIR):
+        # Logging configuration
+        logging.basicConfig(
+            level=logging.INFO,
+            format='%(asctime)s - %(levelname)s - %(message)s'
+        )
+        self.logger = logging.getLogger(__name__)
+
+        # Create the local directory if it does not exist
+        os.makedirs(local_dir, exist_ok=True)
+
+        # Download the required files
+        try:
+            # Create the required sub-directories
+            os.makedirs(os.path.join(local_dir, "Anta_GPT_XTTS_Wo"), exist_ok=True)
+            os.makedirs(os.path.join(local_dir, "XTTS_v2.0_original_model_files"), exist_ok=True)
+
+            # Download the checkpoint
+            self.model_path = hf_hub_download(
+                repo_id=repo_id,
+                filename="Anta_GPT_XTTS_Wo/best_model_89250.pth",
+                local_dir=local_dir
+            )
+
+            # Download the configuration file
+            self.config_path = hf_hub_download(
+                repo_id=repo_id,
+                filename="Anta_GPT_XTTS_Wo/config.json",
+                local_dir=local_dir
+            )
+
+            # Download the vocabulary
+            self.vocab_path = hf_hub_download(
+                repo_id=repo_id,
+                filename="XTTS_v2.0_original_model_files/vocab.json",
+                local_dir=local_dir
+            )
+
+            # Download the reference audio
+            self.reference_audio = hf_hub_download(
+                repo_id=repo_id,
+                filename="anta_sample.wav",
+                local_dir=local_dir
+            )
+
+        except Exception as e:
+            self.logger.error(f"Erreur lors du téléchargement des fichiers : {e}")
+            raise
+
+        # Device selection
+        self.device = "cuda:0" if torch.cuda.is_available() else "cpu"
+
+        # Model initialisation
+        self.model = self._load_model()
+
+    def _load_model(self):
+        """Load the XTTS model."""
+        try:
+            self.logger.info("Chargement du modèle XTTS...")
+
+            # Initialise the model
+            config = XttsConfig()
+            config.load_json(self.config_path)
+            model = Xtts.init_from_config(config)
+
+            # Load the checkpoint with load_checkpoint
+            model.load_checkpoint(config,
+                                  checkpoint_path=self.model_path,
+                                  vocab_path=self.vocab_path,
+                                  use_deepspeed=False
+                                  )
+
+            model.to(self.device)
+            model.eval()  # Put the model in evaluation mode
+
+            self.logger.info("Modèle chargé avec succès!")
+            return model
+
+        except Exception as e:
+            self.logger.error(f"Erreur lors du chargement du modèle : {e}")
+            raise
+
+    def generate_audio(
+        self,
+        text: str,
+        reference_audio: str = None,
+        speed: float = 1.06,
+        language: str = "wo",
+        output_path: str = None
+    ) -> tuple[np.ndarray, int]:
+        """
+        Generate audio from the given text.
+
+        Args:
+            text (str): Text to convert to audio.
+            reference_audio (str, optional): Path to the reference audio. Defaults to None.
+            speed (float, optional): Playback speed. Defaults to 1.06.
+            language (str, optional): Language of the text. Defaults to "wo".
+            output_path (str, optional): Where to save the generated audio. Defaults to None.
+
+        Returns:
+            tuple[np.ndarray, int]: audio_array, sample_rate
+        """
+        if not text:
+            raise ValueError("Le texte ne peut pas être vide.")
+
+        try:
+            # Use the provided reference audio, or the default one
+            ref_audio = reference_audio or self.reference_audio
+
+            # Get the conditioning embeddings
+            gpt_cond_latent, speaker_embedding = self.model.get_conditioning_latents(
+                audio_path=[ref_audio],
+                gpt_cond_len=self.model.config.gpt_cond_len,
+                max_ref_length=self.model.config.max_ref_len,
+                sound_norm_refs=self.model.config.sound_norm_refs
+            )
+
+            # Generate the audio
+            result = self.model.inference(
+                text=text.lower(),
+                gpt_cond_latent=gpt_cond_latent,
+                speaker_embedding=speaker_embedding,
+                do_sample=False,
+                speed=speed,
+                language=language,
+                enable_text_splitting=True
+            )
+
+            # Retrieve the sample rate
+            sample_rate = self.model.config.audio.sample_rate
+
+            # Optional save to disk
+            if output_path:
+                sf.write(output_path, result["wav"], sample_rate)
+                self.logger.info(f"Audio sauvegardé dans {output_path}")
+
+            return result["wav"], sample_rate
+
+        except Exception as e:
+            self.logger.error(f"Erreur lors de la génération de l'audio : {e}")
+            raise
+
+    def generate_audio_from_config(self, text: str, config: dict, output_path: str = None) -> tuple[np.ndarray, int]:
+        """
+        Generate audio from a text and a configuration dictionary.
+
+        Args:
+            text (str): Text to convert to audio.
+            config (dict): Configuration dictionary (speed, language, reference_audio).
+            output_path (str, optional): Where to save the generated audio. Defaults to None.
+
+        Returns:
+            tuple[np.ndarray, int]: audio_array, sample_rate
+        """
+        speed = config.get('speed', 1.06)
+        language = config.get('language', "wo")
+        reference_audio = config.get('reference_audio', None)
+        return self.generate_audio(text=text, reference_audio=reference_audio, speed=speed, language=language, output_path=output_path)
+
+
+# Example usage
+if __name__ == "__main__":
+    tts = WolofXTTSInference()
+
+    # Audio generation example
+    text = "Màngi tuddu Aadama, di baat bii waa Galsen A.I defar ngir wax ak yéen ci wolof!"
+
+    # Simple
+    audio, sr = tts.generate_audio(
+        text,
+        output_path="generated_audio.wav"
+    )
+
+    # With a config
+    config_gen_audio = {
+        "speed": 1.2,
+        "language": "wo",
+    }
+    audio, sr = tts.generate_audio_from_config(
+        text=text,
+        config=config_gen_audio,
+        output_path="generated_audio_config.wav"
+    )
requirements.txt
CHANGED

@@ -1,24 +1,73 @@
-...
+gradio
+# core deps
+numpy==1.23.0;python_version<="3.10"
+numpy>=1.24.3;python_version>"3.10"
+matplotlib
+cython>=0.29.30
+scipy>=1.11.2
+torch>=2.1
 torchaudio
-soundfile
 transformers
-...
-huggingface_hub
-tqdm
-coqpit
+gdown
 trainer
+soundfile>=0.12.0
+librosa>=0.10.0
+scikit-learn>=1.3.0
+numba==0.55.1;python_version<"3.9"
+numba>=0.57.0;python_version>="3.9"
+inflect>=5.6.0
+tqdm>=4.64.1
+anyascii>=0.3.0
+pyyaml>=6.0
+fsspec>=2023.6.0 # <= 2023.9.1 makes aux tests fail
+aiohttp>=3.8.1
+packaging>=23.1
+mutagen==1.47.0
 librosa
-...
+# deps for examples
+flask>=2.0.1
+# deps for inference
+pysbd>=0.3.4
+# deps for notebooks
+umap-learn>=0.5.1
+pandas>=1.4,<2.0
+# deps for training
+matplotlib>=3.7.0
+# coqui stack
+trainer>=0.0.36
+# config management
+coqpit>=0.0.16
+# chinese g2p deps
+jieba
 pypinyin
+# korean
 hangul_romanize
+# gruut+supported langs
+gruut[de,es,fr]==2.2.3
+# deps for korean
+jamo
+nltk
+g2pkk>=0.1.1
+# deps for bangla
+bangla
+bnnumerizer
+bnunicodenormalizer
+# deps for tortoise
+einops>=0.6.0
+transformers>=4.45.2
+# deps for bark
+encodec>=0.1.1
+# deps for XTTS
+unidecode>=1.3.2
 num2words
-spacy
-...
-hmmlearn
-...
+# spacy[ja]>=3
+tokenizers==0.20.1
+vinorm==2.0.7
+underthesea==6.8.4
+# remove silence
+hmmlearn==0.3.3
+eyed3==0.9.7
+pesq==0.0.4
+pydub==0.25.1
+pyAudioAnalysis==0.3.14
+ffmpeg-python==0.2.0