Warholt committed
Commit a1e382b · 1 Parent(s): f3d1bee

Add app, track models via lfs

Files changed (4)
  1. .gitattributes +2 -0
  2. app.py +148 -0
  3. char_tokenizers.py +140 -0
  4. requirements.txt +61 -0
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+
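(For context: this added rule is what running git lfs track "*.onnx" appends to .gitattributes, so the ONNX checkpoints referenced by app.py below are stored as Git LFS pointers rather than as plain blobs.)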
app.py ADDED
@@ -0,0 +1,148 @@
+ import gradio as gr
+ import onnxruntime as ort
+ import numpy as np
+ from char_tokenizers import GermanCharsTokenizer
+
+ # Initialize tokenizer
+ TOKENIZER = GermanCharsTokenizer()
+
+ # Model paths
+ MODELS = {
+     "Caro": {
+         "fastpitch": "onnx/caro_fastpitch.onnx",
+         "hifigan": "onnx/caro_hifigan.onnx",
+     },
+     "Karlsson": {
+         "fastpitch": "onnx/karlsson_fastpitch.onnx",
+         "hifigan": "onnx/karlsson_hifigan.onnx",
+     },
+ }
+
+ # Load models
+ print("Loading ONNX models...")
+ sessions = {}
+ for voice_name, paths in MODELS.items():
+     print(f"Loading {voice_name}...")
+     sessions[voice_name] = {
+         "fastpitch": ort.InferenceSession(paths["fastpitch"]),
+         "hifigan": ort.InferenceSession(paths["hifigan"]),
+     }
+ print("Models loaded successfully!")
+
+
+ def synthesize_speech(text: str, voice: str, pace: float = 1.0):
+     """
+     Synthesize speech from text using the selected voice.
+
+     Args:
+         text: Input text to synthesize
+         voice: Voice to use (Caro or Karlsson)
+         pace: Speaking rate (1.0 is normal, <1.0 is slower, >1.0 is faster)
+
+     Returns:
+         Tuple of (sample_rate, audio_array)
+     """
+     if not text.strip():
+         return None
+
+     # Tokenize text
+     tokens = TOKENIZER.encode(text)
+
+     # Prepare inputs for FastPitch
+     paces = np.zeros(len(tokens), dtype=np.float32) + pace
+     pitches = np.zeros(len(tokens), dtype=np.float32)  # Keep pitch at 0.0
+
+     inputs = {
+         "text": np.array([tokens], dtype=np.int64),
+         "pace": np.array([paces], dtype=np.float32),
+         "pitch": np.array([pitches], dtype=np.float32),
+     }
+
+     # Generate spectrogram with FastPitch
+     fastpitch_session = sessions[voice]["fastpitch"]
+     spec = fastpitch_session.run(None, inputs)[0]
+
+     # Generate audio with HiFiGAN
+     hifigan_session = sessions[voice]["hifigan"]
+     gan_inputs = {"spec": spec}
+     audio = hifigan_session.run(None, gan_inputs)[0]
+
+     # Return sample rate and audio
+     sample_rate = 44100
+     audio_array = audio.squeeze()
+
+     return (sample_rate, audio_array)
+
+
+ # Create Gradio interface
+ with gr.Blocks(title="German TTS - Caro & Karlsson") as demo:
+     gr.Markdown(
+         """
+         # 🎙️ German Text-to-Speech
+
+         Generate German speech using two different voices: **Caro** and **Karlsson**.
+
+         Enter your German text below and select a voice to synthesize speech.
+         """
+     )
+
+     with gr.Row():
+         with gr.Column():
+             text_input = gr.Textbox(
+                 label="Text to synthesize",
+                 placeholder="Geben Sie hier Ihren deutschen Text ein...",
+                 lines=5,
+                 value="Hallo! Willkommen zur deutschen Sprachsynthese.",
+             )
+
+             voice_dropdown = gr.Dropdown(
+                 choices=list(MODELS.keys()), label="Voice", value="Karlsson"
+             )
+
+             pace_slider = gr.Slider(
+                 minimum=0.5,
+                 maximum=2.0,
+                 value=1.0,
+                 step=0.1,
+                 label="Speaking Rate",
+                 info="1.0 is normal speed",
+             )
+
+             generate_btn = gr.Button("Generate Speech 🔊", variant="primary")
+
+         with gr.Column():
+             audio_output = gr.Audio(label="Generated Audio", type="numpy")
+
+     gr.Examples(
+         examples=[
+             ["Guten Tag! Wie geht es Ihnen heute?", "Caro", 1.0],
+             [
+                 "Die Wissenschaft hat in den letzten Jahren große Fortschritte gemacht.",
+                 "Karlsson",
+                 1.0,
+             ],
+             [
+                 "Es war einmal ein kleines Mädchen, das durch den Wald spazierte.",
+                 "Caro",
+                 0.9,
+             ],
+             [
+                 "Berlin ist die Hauptstadt und zugleich ein Land der Bundesrepublik Deutschland.",
+                 "Karlsson",
+                 1.0,
+             ],
+         ],
+         inputs=[text_input, voice_dropdown, pace_slider],
+         outputs=audio_output,
+         fn=synthesize_speech,
+         cache_examples=False,
+     )
+
+     generate_btn.click(
+         fn=synthesize_speech,
+         inputs=[text_input, voice_dropdown, pace_slider],
+         outputs=audio_output,
+     )
+
+ if __name__ == "__main__":
+     demo.launch()
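app.py hard-codes the graph I/O names ("text", "pace", "pitch" for FastPitch; "spec" for HiFi-GAN) and a 44.1 kHz output rate. A minimal sketch for double-checking those assumptions against the exported graphs before swapping in other checkpoints (paths taken from MODELS above):

import onnxruntime as ort

# Inspect input/output names and shapes of both exported graphs.
for path in ("onnx/karlsson_fastpitch.onnx", "onnx/karlsson_hifigan.onnx"):
    session = ort.InferenceSession(path)
    print(path)
    print("  inputs: ", [(i.name, i.shape) for i in session.get_inputs()])
    print("  outputs:", [(o.name, o.shape) for o in session.get_outputs()])

If the printed names differ from what synthesize_speech passes, session.run() raises an error, so this is a cheap pre-flight check.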
char_tokenizers.py ADDED
@@ -0,0 +1,140 @@
+ import logging
+ from typing import List
+ import unicodedata
+ from abc import ABC, abstractmethod
+
+ def normalize_unicode_text(text: str) -> str:
+     if not unicodedata.is_normalized("NFC", text):
+         text = unicodedata.normalize("NFC", text)
+
+     return text
+
+ def any_locale_text_preprocessing(text: str) -> str:
+     res = []
+     for c in normalize_unicode_text(text):
+         if c in ['’']:
+             res.append("'")
+         else:
+             res.append(c)
+
+     return ''.join(res)
+
+ class BaseTokenizer(ABC):
+     PAD, BLANK, OOV = '<pad>', '<blank>', '<oov>'
+
+     def __init__(self, tokens, *, pad=PAD, blank=BLANK, oov=OOV, sep='', add_blank_at=None):
+         """Abstract class for creating an arbitrary tokenizer to convert string to list of int tokens.
+         Args:
+             tokens: List of tokens.
+             pad: Pad token as string.
+             blank: Blank token as string.
+             oov: OOV token as string.
+             sep: Separation token as string.
+             add_blank_at: Add blank to labels in the specified order ("last") or after tokens (any non None),
+                 if None then no blank in labels.
+         """
+         super().__init__()
+
+         tokens = list(tokens)
+         # TODO @xueyang: in general, IDs of pad, sil, blank, and oov are preserved ahead instead of dynamically
+         #  assigned according to the number of tokens. The downside of using dynamical assignment leads to different
+         #  IDs for each.
+         self.pad, tokens = len(tokens), tokens + [pad]  # Padding
+
+         if add_blank_at is not None:
+             self.blank, tokens = len(tokens), tokens + [blank]  # Reserved for blank from asr-model
+         else:
+             # use add_blank_at=None only for ASR where blank is added automatically, disable blank here
+             self.blank = None
+
+         self.oov, tokens = len(tokens), tokens + [oov]  # Out Of Vocabulary
+
+         if add_blank_at == "last":
+             tokens[-1], tokens[-2] = tokens[-2], tokens[-1]
+             self.oov, self.blank = self.blank, self.oov
+
+         self.tokens = tokens
+         self.sep = sep
+
+         self._util_ids = {self.pad, self.blank, self.oov}
+         self._token2id = {l: i for i, l in enumerate(tokens)}
+         self._id2token = tokens
+
+     def __call__(self, text: str) -> List[int]:
+         return self.encode(text)
+
+     @abstractmethod
+     def encode(self, text: str) -> List[int]:
+         """Turns str text into int tokens."""
+         pass
+
+     def decode(self, tokens: List[int]) -> str:
+         """Turns ints tokens into str text."""
+         return self.sep.join(self._id2token[t] for t in tokens if t not in self._util_ids)
+
+
+ class GermanCharsTokenizer(BaseTokenizer):
+     _PUNCT_LIST = ['!', '"', '(', ')', ',', '-', '.', '/', ':', ';', '?', '[', ']', '{', '}', '«', '»', '‒', '–', '—', '‘', '‚', '“', '„', '‹', '›']
+     _CHARSET_STR = 'ABCDEFGHIJKLMNOPQRSTUVWXYZÄÖÜẞabcdefghijklmnopqrstuvwxyzäöüß'
+     PUNCT_LIST = (
+         ',', '.', '!', '?', '-',
+         ':', ';', '/', '"', '(',
+         ')', '[', ']', '{', '}',
+     )
+
+     def __init__(
+         self,
+         chars=_CHARSET_STR,
+         punct=True,
+         apostrophe=True,
+         add_blank_at=None,
+         pad_with_space=True,
+         non_default_punct_list=_PUNCT_LIST,
+         text_preprocessing_func=any_locale_text_preprocessing,
+     ):
+         tokens = []
+         self.space, tokens = len(tokens), tokens + [' ']  # Space
+         tokens.extend(chars)
+         if apostrophe:
+             tokens.append("'")  # Apostrophe for saving "don't" and "Joe's"
+
+         if punct:
+             if non_default_punct_list is not None:
+                 self.PUNCT_LIST = non_default_punct_list
+             tokens.extend(self.PUNCT_LIST)
+
+         super().__init__(tokens, add_blank_at=add_blank_at)
+
+         self.punct = punct
+         self.pad_with_space = pad_with_space
+
+         self.text_preprocessing_func = text_preprocessing_func
+
+     def encode(self, text):
+         """See base class."""
+         cs, space, tokens = [], self.tokens[self.space], set(self.tokens)
+
+         text = self.text_preprocessing_func(text)
+         for c in text:
+             # Add a whitespace if the current char is a whitespace while the previous char is not a whitespace.
+             if c == space and len(cs) > 0 and cs[-1] != space:
+                 cs.append(c)
+             # Add the current char that is an alphanumeric or an apostrophe.
+             elif (c.isalnum() or c == "'") and c in tokens:
+                 cs.append(c)
+             # Add a punctuation that has a single char.
+             elif (c in self.PUNCT_LIST) and self.punct:
+                 cs.append(c)
+             # Warn about unknown char
+             elif c != space:
+                 logging.warning(f"Text: [{text}] contains unknown char: [{c}]. Symbol will be skipped.")
+
+         # Remove trailing spaces
+         if cs:
+             while cs[-1] == space:
+                 cs.pop()
+
+         if self.pad_with_space:
+             cs = [space] + cs + [space]
+
+         return [self._token2id[p] for p in cs]
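A quick round-trip sketch for the tokenizer above (the exact IDs depend on the vocabulary order built in __init__, so the printed numbers will vary):

from char_tokenizers import GermanCharsTokenizer

tokenizer = GermanCharsTokenizer()
ids = tokenizer.encode("Hallo, Welt!")  # list of int IDs, space-padded at both ends
print(ids)
print(tokenizer.decode(ids))            # -> " Hallo, Welt! " (pad_with_space=True)

Characters outside the charset and punctuation list are dropped with a warning rather than mapped to the OOV token, which is what app.py relies on to keep FastPitch inputs clean.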
requirements.txt ADDED
@@ -0,0 +1,61 @@
+ aiofiles==24.1.0
+ annotated-doc==0.0.4
+ annotated-types==0.7.0
+ anyio==4.11.0
+ brotli==1.2.0
+ certifi==2025.11.12
+ click==8.3.1
+ coloredlogs==15.0.1
+ fastapi==0.121.3
+ ffmpy==1.0.0
+ filelock==3.20.0
+ flatbuffers==25.9.23
+ fsspec==2025.10.0
+ gradio==5.50.0
+ gradio-client==1.14.0
+ groovy==0.1.2
+ h11==0.16.0
+ hf-xet==1.2.0
+ httpcore==1.0.9
+ httpx==0.28.1
+ huggingface-hub==1.1.5
+ humanfriendly==10.0
+ idna==3.11
+ jinja2==3.1.6
+ markdown-it-py==4.0.0
+ markupsafe==3.0.3
+ mdurl==0.1.2
+ mpmath==1.3.0
+ numpy==2.3.5
+ onnxruntime==1.23.2
+ orjson==3.11.4
+ packaging==25.0
+ pandas==2.3.3
+ pillow==11.3.0
+ protobuf==6.33.1
+ pydantic==2.12.3
+ pydantic-core==2.41.4
+ pydub==0.25.1
+ pygments==2.19.2
+ python-dateutil==2.9.0.post0
+ python-multipart==0.0.20
+ pytz==2025.2
+ pyyaml==6.0.3
+ rich==14.2.0
+ ruff==0.14.6
+ safehttpx==0.1.7
+ semantic-version==2.10.0
+ shellingham==1.5.4
+ six==1.17.0
+ sniffio==1.3.1
+ starlette==0.50.0
+ sympy==1.14.0
+ tomlkit==0.13.3
+ tqdm==4.67.1
+ typer==0.20.0
+ typer-slim==0.20.0
+ typing-extensions==4.15.0
+ typing-inspection==0.4.2
+ tzdata==2025.2
+ uvicorn==0.38.0
+ websockets==15.0.1
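With these pins in place, a fresh environment should reproduce the Space locally via pip install -r requirements.txt followed by python app.py, which serves the Gradio UI through demo.launch().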