txya900619 committed on
Commit
31852bc
·
1 Parent(s): 5e38562

feat: change layout for ithuan demo page

Browse files
Files changed (3) hide show
  1. DEMO.md +3 -13
  2. app.py +140 -110
  3. configs/models.yaml +0 -4
DEMO.md CHANGED
@@ -2,16 +2,6 @@
2
 
3
  ILRDF Formosan Text-To-Speech System
4
 
5
- ## 研發團隊
6
-
7
- - [李鴻欣 Hung-Shin Lee](mailto:[email protected])
8
- - [陳力瑋 Li-Wei Chen](mailto:[email protected])
9
- - [意傳科技](https://ithuan.tw/)
10
- - [原住民族語言研究發展基金會](https://www.ilrdf.org.tw/)
11
-
12
- ## 特別致謝
13
- - [聯和科創](https://www.104.com.tw/company/1a2x6bmu75)
14
- - [Pipalofasaran to Sowal no Pangcah/'Amis 台灣阿美族語言永續發展學會](https://www.facebook.com/groups/ypspt/about)
15
- - [台灣太魯閣族語言發展學會](https://qkktt.com/)
16
- - [台灣原住民族賽德克族語言文化學會](https://www.facebook.com/3S3TBL/)
17
- - 族語老師們
 
2
 
3
  ILRDF Formosan Text-To-Speech System
4
 
5
+ \
6
+ 本系統為初步開發成果的試用版本,仍處於**測試階段**。**合成結果可能在發音、語調或流暢度存在不盡理想之處,甚至可能出現錯誤**。
7
+ 我們誠摯邀請您試用本系統,並請務必謹慎**檢視合成結果**,切勿直接作為正式或關鍵資訊使用,感謝您的理解與支持。
 
 
 
 
 
 
 
 
 
 
app.py CHANGED
@@ -194,94 +194,107 @@ demo = gr.Blocks(
194
  "sans-serif",
195
  )
196
  ),
 
 
 
 
 
 
 
 
 
 
197
  )
198
 
199
  with demo:
200
  with open("DEMO.md") as tong:
201
  gr.Markdown(tong.read())
202
 
203
- with gr.Row():
204
- with gr.Column():
205
- model_drop_down = gr.Dropdown(
206
- models_config.keys(),
207
- value=DEFAULT_MODEL_ID,
208
- label="模型",
209
- )
210
-
211
- language = gr.Dropdown(
212
- choices=g2p_object.keys(),
213
- label="語言",
214
- value="阿美_秀姑巒",
215
- )
216
-
217
- ref_audio_input = gr.Audio(
218
- type="filepath",
219
- waveform_options=gr.WaveformOptions(
220
- sample_rate=24000,
221
- ),
222
- label="Reference Audio",
223
- )
224
- ref_text_input = gr.Textbox(
225
- value="",
226
- label="Reference Text",
227
- )
228
-
229
- gen_text_input = gr.Textbox(
230
- label="Text to Generate",
231
- value="",
232
- )
233
-
234
- generate_btn = gr.Button("Synthesize", variant="primary")
235
-
236
- with gr.Accordion("Advanced Settings", open=False):
237
- remove_silence = gr.Checkbox(
238
- label="Remove Silences",
239
- info="The model tends to produce silences, especially on longer audio. We can manually remove silences if needed. Note that this is an experimental feature and may produce strange results. This will also increase generation time.",
240
- value=False,
 
 
 
241
  )
242
- speed_slider = gr.Slider(
243
- label="Speed",
244
- minimum=0.3,
245
- maximum=2.0,
246
- value=1.0,
247
- step=0.1,
248
- info="語速(越小越慢)",
249
  )
250
- nfe_slider = gr.Slider(
251
- label="NFE Steps",
252
- minimum=4,
253
- maximum=64,
254
- value=32,
255
- step=2,
256
- info="Set the number of denoising steps.",
257
  )
258
- cross_fade_duration_slider = gr.Slider(
259
- label="Cross-Fade Duration (s)",
260
- minimum=0.0,
261
- maximum=1.0,
262
- value=0.15,
263
- step=0.01,
264
- info="Set the duration of the cross-fade between audio clips.",
265
  )
266
- with gr.Column():
267
- audio_output = gr.Audio(label="Synthesized Audio")
268
- spectrogram_output = gr.Image(label="Spectrogram")
 
 
 
 
 
 
 
 
 
 
 
269
 
270
  @gpu_decorator
271
- def basic_tts(
272
- model_drop_down: str,
273
  language: str,
274
- ref_audio_input: str,
275
- ref_text_input: str,
276
  gen_text_input: str,
277
- remove_silence: bool,
278
- cross_fade_duration_slider: float,
279
- nfe_slider: int,
280
- speed_slider: float,
281
  ):
282
- ref_text_input = ref_text_input.strip()
283
- if len(ref_text_input) == 0:
284
- raise gr.Error("請勿輸入空字串。")
285
 
286
  gen_text_input = gen_text_input.strip()
287
  if len(gen_text_input) == 0:
@@ -301,53 +314,70 @@ with demo:
301
  ref_audio_input,
302
  ref_text_input,
303
  gen_text_input,
304
- models_config[model_drop_down],
305
- remove_silence,
306
- cross_fade_duration=cross_fade_duration_slider,
307
- nfe_step=nfe_slider,
308
- speed=speed_slider,
309
  )
310
- return audio_out, spectrogram_path
311
 
312
- generate_btn.click(
313
- basic_tts,
314
  inputs=[
315
- model_drop_down,
316
- language,
317
- ref_audio_input,
318
- ref_text_input,
319
- gen_text_input,
320
- remove_silence,
321
- cross_fade_duration_slider,
322
- nfe_slider,
323
- speed_slider,
324
  ],
325
- outputs=[audio_output, spectrogram_output],
326
  )
327
 
328
- def get_examples_by_language(language):
329
- if language not in examples_config:
330
- return []
331
- return [
332
- [refs_config[ex["ref"]]["wav"], refs_config[ex["ref"]]["text"], ex["text"]]
333
- for ex in examples_config[language]
334
- ]
335
-
336
- examples = gr.Examples(
337
- get_examples_by_language(language.value),
338
- label="範例",
339
- inputs=[
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
340
  ref_audio_input,
341
  ref_text_input,
342
  gen_text_input,
343
- ],
344
- )
 
345
 
346
- language.change(
347
- lambda language: gr.Dataset(samples=get_examples_by_language(language)),
348
- language,
349
- examples.dataset,
 
 
 
 
 
350
  )
351
 
352
-
353
  demo.launch()
 
194
  "sans-serif",
195
  )
196
  ),
197
+ js="""
198
+ function addButtonsEvent() {
199
+ const buttons = document.querySelectorAll("#head-html-block button");
200
+ buttons.forEach(button => {
201
+ button.addEventListener("click", () => {
202
+ navigator.clipboard.writeText(button.innerText);
203
+ });
204
+ });
205
+ }
206
+ """,
207
  )
208
 
209
  with demo:
210
  with open("DEMO.md") as tong:
211
  gr.Markdown(tong.read())
212
 
213
+ gr.HTML(
214
+ "特殊符號請複製使用:<button>é</button> <button>ṟ</button> <button>ɨ</button> <button>ʉ</button>",
215
+ padding=False,
216
+ elem_id="head-html-block",
217
+ )
218
+
219
+ with gr.Tab("預設語者"):
220
+ with gr.Row():
221
+ with gr.Column():
222
+ default_speaker_language = gr.Dropdown(
223
+ choices=g2p_object.keys(),
224
+ label="選擇語言",
225
+ value="阿美_秀姑巒",
226
+ )
227
+
228
+ def get_refs_by_language(language: str):
229
+ return [r for r in refs_config.keys() if r.startswith(language)]
230
+
231
+ default_speaker_refs = gr.Dropdown(
232
+ choices=get_refs_by_language(default_speaker_language.value),
233
+ label="選擇配音員",
234
+ value=get_refs_by_language(default_speaker_language.value)[0],
235
+ )
236
+
237
+ default_speaker_gen_text_input = gr.Textbox(
238
+ label="輸入文字(上限 300 字元)",
239
+ value="",
240
+ )
241
+
242
+ default_speaker_generate_btn = gr.Button("開始合成", variant="primary")
243
+
244
+ with gr.Column():
245
+ default_speaker_audio_output = gr.Audio(label="合成結果")
246
+
247
+ with gr.Tab("自定義語者"):
248
+ with gr.Row():
249
+ with gr.Column():
250
+ custom_speaker_language = gr.Dropdown(
251
+ choices=g2p_object.keys(),
252
+ label="選擇語言",
253
+ value="阿美_秀姑巒",
254
  )
255
+
256
+ custom_speaker_ref_text_input = gr.Textbox(
257
+ value=refs_config.get(f"{custom_speaker_language.value}_1", {}).get(
258
+ "text", ""
259
+ ),
260
+ label="錄製下方句子或上傳與句子相符的音檔",
 
261
  )
262
+
263
+ custom_speaker_audio_input = gr.Audio(
264
+ type="filepath",
265
+ waveform_options=gr.WaveformOptions(
266
+ sample_rate=24000,
267
+ ),
268
+ label="錄製或上傳",
269
  )
270
+
271
+ custom_speaker_gen_text_input = gr.Textbox(
272
+ label="輸入合成文字(上限 300 字元)",
273
+ value="",
 
 
 
274
  )
275
+
276
+ custom_speaker_generate_btn = gr.Button("開始合成", variant="primary")
277
+
278
+ with gr.Column():
279
+ custom_speaker_audio_output = gr.Audio(label="合成結果")
280
+
281
+ default_speaker_language.change(
282
+ lambda lang: gr.Dropdown(
283
+ choices=get_refs_by_language(lang),
284
+ value=get_refs_by_language(lang)[0],
285
+ ),
286
+ inputs=[default_speaker_language],
287
+ outputs=[default_speaker_refs],
288
+ )
289
 
290
  @gpu_decorator
291
+ def default_speaker_tts(
 
292
  language: str,
293
+ ref: str,
 
294
  gen_text_input: str,
 
 
 
 
295
  ):
296
+ ref_text_input = refs_config[ref]["text"]
297
+ ref_audio_input = refs_config[ref]["wav"]
 
298
 
299
  gen_text_input = gen_text_input.strip()
300
  if len(gen_text_input) == 0:
 
314
  ref_audio_input,
315
  ref_text_input,
316
  gen_text_input,
317
+ models_config[DEFAULT_MODEL_ID],
 
 
 
 
318
  )
319
+ return audio_out
320
 
321
+ default_speaker_generate_btn.click(
322
+ default_speaker_tts,
323
  inputs=[
324
+ default_speaker_language,
325
+ default_speaker_refs,
326
+ default_speaker_gen_text_input,
 
 
 
 
 
 
327
  ],
328
+ outputs=[default_speaker_audio_output],
329
  )
330
 
331
+ custom_speaker_language.change(
332
+ lambda lang: gr.Textbox(
333
+ value=refs_config.get(f"{lang}_1", {}).get("text", ""),
334
+ ),
335
+ inputs=[custom_speaker_language],
336
+ outputs=[custom_speaker_ref_text_input],
337
+ )
338
+
339
+ @gpu_decorator
340
+ def custom_speaker_tts(
341
+ language: str,
342
+ ref_text_input: str,
343
+ ref_audio_input: str,
344
+ gen_text_input: str,
345
+ ):
346
+ ref_text_input = ref_text_input.strip()
347
+ if len(ref_text_input) == 0:
348
+ raise gr.Error("請勿輸入空字串。")
349
+
350
+ gen_text_input = gen_text_input.strip()
351
+ if len(gen_text_input) == 0:
352
+ raise gr.Error("請勿輸入空字串。")
353
+
354
+ ignore_punctuation = False
355
+ ipa_with_ng = False
356
+
357
+ ref_text_input = text_to_ipa(
358
+ ref_text_input, language, ignore_punctuation, ipa_with_ng
359
+ )
360
+ gen_text_input = text_to_ipa(
361
+ gen_text_input, language, ignore_punctuation, ipa_with_ng
362
+ )
363
+
364
+ audio_out, spectrogram_path = infer(
365
  ref_audio_input,
366
  ref_text_input,
367
  gen_text_input,
368
+ models_config[DEFAULT_MODEL_ID],
369
+ )
370
+ return audio_out
371
 
372
+ custom_speaker_generate_btn.click(
373
+ custom_speaker_tts,
374
+ inputs=[
375
+ custom_speaker_language,
376
+ custom_speaker_ref_text_input,
377
+ custom_speaker_audio_input,
378
+ custom_speaker_gen_text_input,
379
+ ],
380
+ outputs=[custom_speaker_audio_output],
381
  )
382
 
 
383
  demo.launch()
configs/models.yaml CHANGED
@@ -1,5 +1 @@
1
  all-formosan-v2-step-843031: ${load_f5tts:hf://ithuan/f5-tts-formosan-all-finetune-v2/model_843031.safetensors,hf://ithuan/f5-tts-formosan-all-finetune-v2/vocab.txt,false,false}
2
- all-formosan-step-1081600: ${load_f5tts:hf://ithuan/f5-tts-formosan-all-finetune/model_1081600.safetensors,hf://ithuan/f5-tts-formosan-all-finetune/vocab.txt,false,false}
3
- all-formosan-step-811200: ${load_f5tts:hf://ithuan/f5-tts-formosan-all-finetune/model_811200.safetensors,hf://ithuan/f5-tts-formosan-all-finetune/vocab.txt,false,false}
4
- all-formosan-step-432640: ${load_f5tts:hf://ithuan/f5-tts-formosan-all-finetune/model_432640.safetensors,hf://ithuan/f5-tts-formosan-all-finetune/vocab.txt,false,false}
5
- all-with-trv-step-254016: ${load_f5tts:hf://united-link/f5-tts-ami-finetune-with-ithuan-trv/model_254016.safetensors,hf://united-link/f5-tts-ami-finetune-with-ithuan-trv/vocab.txt,true,true}
 
1
  all-formosan-v2-step-843031: ${load_f5tts:hf://ithuan/f5-tts-formosan-all-finetune-v2/model_843031.safetensors,hf://ithuan/f5-tts-formosan-all-finetune-v2/vocab.txt,false,false}