Warholt committed
Commit a1e382b · 1 Parent(s): f3d1bee

Add app, track models via lfs

Files changed (4)
  1. .gitattributes +2 -0
  2. app.py +148 -0
  3. char_tokenizers.py +140 -0
  4. requirements.txt +61 -0
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+
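(For context: this added rule is what running git lfs track "*.onnx" appends to .gitattributes, so the ONNX checkpoints referenced by app.py below are stored as Git LFS pointers rather than as plain blobs.)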
app.py ADDED
@@ -0,0 +1,148 @@
+ import gradio as gr
+ import onnxruntime as ort
+ import numpy as np
+ from char_tokenizers import GermanCharsTokenizer
+
+ # Initialize tokenizer
+ TOKENIZER = GermanCharsTokenizer()
+
+ # Model paths
+ MODELS = {
+     "Caro": {
+         "fastpitch": "onnx/caro_fastpitch.onnx",
+         "hifigan": "onnx/caro_hifigan.onnx",
+     },
+     "Karlsson": {
+         "fastpitch": "onnx/karlsson_fastpitch.onnx",
+         "hifigan": "onnx/karlsson_hifigan.onnx",
+     },
+ }
+
+ # Load models
+ print("Loading ONNX models...")
+ sessions = {}
+ for voice_name, paths in MODELS.items():
+     print(f"Loading {voice_name}...")
+     sessions[voice_name] = {
+         "fastpitch": ort.InferenceSession(paths["fastpitch"]),
+         "hifigan": ort.InferenceSession(paths["hifigan"]),
+     }
+ print("Models loaded successfully!")
+
+
+ def synthesize_speech(text: str, voice: str, pace: float = 1.0):
+     """
+     Synthesize speech from text using the selected voice.
+
+     Args:
+         text: Input text to synthesize
+         voice: Voice to use (Caro or Karlsson)
+         pace: Speaking rate (1.0 is normal, <1.0 is slower, >1.0 is faster)
+
+     Returns:
+         Tuple of (sample_rate, audio_array)
+     """
+     if not text.strip():
+         return None
+
+     # Tokenize text
+     tokens = TOKENIZER.encode(text)
+
+     # Prepare inputs for FastPitch
+     paces = np.zeros(len(tokens), dtype=np.float32) + pace
+     pitches = np.zeros(len(tokens), dtype=np.float32)  # Keep pitch at 0.0
+
+     inputs = {
+         "text": np.array([tokens], dtype=np.int64),
+         "pace": np.array([paces], dtype=np.float32),
+         "pitch": np.array([pitches], dtype=np.float32),
+     }
+
+     # Generate spectrogram with FastPitch
+     fastpitch_session = sessions[voice]["fastpitch"]
+     spec = fastpitch_session.run(None, inputs)[0]
+
+     # Generate audio with HiFiGAN
+     hifigan_session = sessions[voice]["hifigan"]
+     gan_inputs = {"spec": spec}
+     audio = hifigan_session.run(None, gan_inputs)[0]
+
+     # Return sample rate and audio
+     sample_rate = 44100
+     audio_array = audio.squeeze()
+
+     return (sample_rate, audio_array)
+
+
+ # Create Gradio interface
+ with gr.Blocks(title="German TTS - Caro & Karlsson") as demo:
+     gr.Markdown(
+         """
+         # 🎙️ German Text-to-Speech
+
+         Generate German speech using two different voices: **Caro** and **Karlsson**.
+
+         Enter your German text below and select a voice to synthesize speech.
+         """
+     )
+
+     with gr.Row():
+         with gr.Column():
+             text_input = gr.Textbox(
+                 label="Text to synthesize",
+                 placeholder="Geben Sie hier Ihren deutschen Text ein...",
+                 lines=5,
+                 value="Hallo! Willkommen zur deutschen Sprachsynthese.",
+             )
+
+             voice_dropdown = gr.Dropdown(
+                 choices=list(MODELS.keys()), label="Voice", value="Karlsson"
+             )
+
+             pace_slider = gr.Slider(
+                 minimum=0.5,
+                 maximum=2.0,
+                 value=1.0,
+                 step=0.1,
+                 label="Speaking Rate",
+                 info="1.0 is normal speed",
+             )
+
+             generate_btn = gr.Button("Generate Speech 🔊", variant="primary")
+
+         with gr.Column():
+             audio_output = gr.Audio(label="Generated Audio", type="numpy")
+
+     gr.Examples(
+         examples=[
+             ["Guten Tag! Wie geht es Ihnen heute?", "Caro", 1.0],
+             [
+                 "Die Wissenschaft hat in den letzten Jahren große Fortschritte gemacht.",
+                 "Karlsson",
+                 1.0,
+             ],
+             [
+                 "Es war einmal ein kleines Mädchen, das durch den Wald spazierte.",
+                 "Caro",
+                 0.9,
+             ],
+             [
+                 "Berlin ist die Hauptstadt und zugleich ein Land der Bundesrepublik Deutschland.",
+                 "Karlsson",
+                 1.0,
+             ],
+         ],
+         inputs=[text_input, voice_dropdown, pace_slider],
+         outputs=audio_output,
+         fn=synthesize_speech,
+         cache_examples=False,
+     )
+
+     generate_btn.click(
+         fn=synthesize_speech,
+         inputs=[text_input, voice_dropdown, pace_slider],
+         outputs=audio_output,
+     )
+
+ if __name__ == "__main__":
+     demo.launch()
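app.py hard-codes the graph I/O names ("text", "pace", "pitch" for FastPitch; "spec" for HiFi-GAN) and a 44.1 kHz output rate. A minimal sketch for double-checking those assumptions against the exported graphs before swapping in other checkpoints (paths taken from MODELS above):

import onnxruntime as ort

# Inspect input/output names and shapes of both exported graphs.
for path in ("onnx/karlsson_fastpitch.onnx", "onnx/karlsson_hifigan.onnx"):
    session = ort.InferenceSession(path)
    print(path)
    print("  inputs: ", [(i.name, i.shape) for i in session.get_inputs()])
    print("  outputs:", [(o.name, o.shape) for o in session.get_outputs()])

If the printed names differ from what synthesize_speech passes, session.run() raises an error, so this is a cheap pre-flight check.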
char_tokenizers.py ADDED
@@ -0,0 +1,140 @@
+ import logging
+ from typing import List
+ import unicodedata
+ from abc import ABC, abstractmethod
+
+ def normalize_unicode_text(text: str) -> str:
+     if not unicodedata.is_normalized("NFC", text):
+         text = unicodedata.normalize("NFC", text)
+
+     return text
+
+ def any_locale_text_preprocessing(text: str) -> str:
+     res = []
+     for c in normalize_unicode_text(text):
+         if c in ['’']:
+             res.append("'")
+         else:
+             res.append(c)
+
+     return ''.join(res)
+
+ class BaseTokenizer(ABC):
+     PAD, BLANK, OOV = '<pad>', '<blank>', '<oov>'
+
+     def __init__(self, tokens, *, pad=PAD, blank=BLANK, oov=OOV, sep='', add_blank_at=None):
+         """Abstract class for creating an arbitrary tokenizer to convert string to list of int tokens.
+         Args:
+             tokens: List of tokens.
+             pad: Pad token as string.
+             blank: Blank token as string.
+             oov: OOV token as string.
+             sep: Separation token as string.
+             add_blank_at: Add blank to labels in the specified order ("last") or after tokens (any non None),
+                 if None then no blank in labels.
+         """
+         super().__init__()
+
+         tokens = list(tokens)
+         # TODO @xueyang: in general, IDs of pad, sil, blank, and oov are preserved ahead instead of dynamically
+         #  assigned according to the number of tokens. The downside of using dynamical assignment leads to different
+         #  IDs for each.
+         self.pad, tokens = len(tokens), tokens + [pad]  # Padding
+
+         if add_blank_at is not None:
+             self.blank, tokens = len(tokens), tokens + [blank]  # Reserved for blank from asr-model
+         else:
+             # use add_blank_at=None only for ASR where blank is added automatically, disable blank here
+             self.blank = None
+
+         self.oov, tokens = len(tokens), tokens + [oov]  # Out Of Vocabulary
+
+         if add_blank_at == "last":
+             tokens[-1], tokens[-2] = tokens[-2], tokens[-1]
+             self.oov, self.blank = self.blank, self.oov
+
+         self.tokens = tokens
+         self.sep = sep
+
+         self._util_ids = {self.pad, self.blank, self.oov}
+         self._token2id = {l: i for i, l in enumerate(tokens)}
+         self._id2token = tokens
+
+     def __call__(self, text: str) -> List[int]:
+         return self.encode(text)
+
+     @abstractmethod
+     def encode(self, text: str) -> List[int]:
+         """Turns str text into int tokens."""
+         pass
+
+     def decode(self, tokens: List[int]) -> str:
+         """Turns ints tokens into str text."""
+         return self.sep.join(self._id2token[t] for t in tokens if t not in self._util_ids)
+
+
+ class GermanCharsTokenizer(BaseTokenizer):
+     _PUNCT_LIST = ['!', '"', '(', ')', ',', '-', '.', '/', ':', ';', '?', '[', ']', '{', '}', '«', '»', '‒', '–', '—', '‘', '‚', '“', '„', '‹', '›']
+     _CHARSET_STR = 'ABCDEFGHIJKLMNOPQRSTUVWXYZÄÖÜẞabcdefghijklmnopqrstuvwxyzäöüß'
+     PUNCT_LIST = (
+         ',', '.', '!', '?', '-',
+         ':', ';', '/', '"', '(',
+         ')', '[', ']', '{', '}',
+     )
+
+     def __init__(
+         self,
+         chars=_CHARSET_STR,
+         punct=True,
+         apostrophe=True,
+         add_blank_at=None,
+         pad_with_space=True,
+         non_default_punct_list=_PUNCT_LIST,
+         text_preprocessing_func=any_locale_text_preprocessing,
+     ):
+         tokens = []
+         self.space, tokens = len(tokens), tokens + [' ']  # Space
+         tokens.extend(chars)
+         if apostrophe:
+             tokens.append("'")  # Apostrophe for saving "don't" and "Joe's"
+
+         if punct:
+             if non_default_punct_list is not None:
+                 self.PUNCT_LIST = non_default_punct_list
+             tokens.extend(self.PUNCT_LIST)
+
+         super().__init__(tokens, add_blank_at=add_blank_at)
+
+         self.punct = punct
+         self.pad_with_space = pad_with_space
+
+         self.text_preprocessing_func = text_preprocessing_func
+
+     def encode(self, text):
+         """See base class."""
+         cs, space, tokens = [], self.tokens[self.space], set(self.tokens)
+
+         text = self.text_preprocessing_func(text)
+         for c in text:
+             # Add a whitespace if the current char is a whitespace while the previous char is not a whitespace.
+             if c == space and len(cs) > 0 and cs[-1] != space:
+                 cs.append(c)
+             # Add the current char that is an alphanumeric or an apostrophe.
+             elif (c.isalnum() or c == "'") and c in tokens:
+                 cs.append(c)
+             # Add a punctuation that has a single char.
+             elif (c in self.PUNCT_LIST) and self.punct:
+                 cs.append(c)
+             # Warn about unknown char
+             elif c != space:
+                 logging.warning(f"Text: [{text}] contains unknown char: [{c}]. Symbol will be skipped.")
+
+         # Remove trailing spaces
+         if cs:
+             while cs[-1] == space:
+                 cs.pop()
+
+         if self.pad_with_space:
+             cs = [space] + cs + [space]
+
+         return [self._token2id[p] for p in cs]
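A quick round-trip sketch for the tokenizer above (the exact IDs depend on the vocabulary order built in __init__, so the printed numbers will vary):

from char_tokenizers import GermanCharsTokenizer

tokenizer = GermanCharsTokenizer()
ids = tokenizer.encode("Hallo, Welt!")  # list of int IDs, space-padded at both ends
print(ids)
print(tokenizer.decode(ids))            # -> " Hallo, Welt! " (pad_with_space=True)

Characters outside the charset and punctuation list are dropped with a warning rather than mapped to the OOV token, which is what app.py relies on to keep FastPitch inputs clean.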
requirements.txt ADDED
@@ -0,0 +1,61 @@
+ aiofiles==24.1.0
+ annotated-doc==0.0.4
+ annotated-types==0.7.0
+ anyio==4.11.0
+ brotli==1.2.0
+ certifi==2025.11.12
+ click==8.3.1
+ coloredlogs==15.0.1
+ fastapi==0.121.3
+ ffmpy==1.0.0
+ filelock==3.20.0
+ flatbuffers==25.9.23
+ fsspec==2025.10.0
+ gradio==5.50.0
+ gradio-client==1.14.0
+ groovy==0.1.2
+ h11==0.16.0
+ hf-xet==1.2.0
+ httpcore==1.0.9
+ httpx==0.28.1
+ huggingface-hub==1.1.5
+ humanfriendly==10.0
+ idna==3.11
+ jinja2==3.1.6
+ markdown-it-py==4.0.0
+ markupsafe==3.0.3
+ mdurl==0.1.2
+ mpmath==1.3.0
+ numpy==2.3.5
+ onnxruntime==1.23.2
+ orjson==3.11.4
+ packaging==25.0
+ pandas==2.3.3
+ pillow==11.3.0
+ protobuf==6.33.1
+ pydantic==2.12.3
+ pydantic-core==2.41.4
+ pydub==0.25.1
+ pygments==2.19.2
+ python-dateutil==2.9.0.post0
+ python-multipart==0.0.20
+ pytz==2025.2
+ pyyaml==6.0.3
+ rich==14.2.0
+ ruff==0.14.6
+ safehttpx==0.1.7
+ semantic-version==2.10.0
+ shellingham==1.5.4
+ six==1.17.0
+ sniffio==1.3.1
+ starlette==0.50.0
+ sympy==1.14.0
+ tomlkit==0.13.3
+ tqdm==4.67.1
+ typer==0.20.0
+ typer-slim==0.20.0
+ typing-extensions==4.15.0
+ typing-inspection==0.4.2
+ tzdata==2025.2
+ uvicorn==0.38.0
+ websockets==15.0.1
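With these pins in place, a fresh environment should reproduce the Space locally via pip install -r requirements.txt followed by python app.py, which serves the Gradio UI through demo.launch().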