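# NOTE: the lines below are web-page header text captured together with this
# file; they are kept as comments so the module remains valid Python.
# txya900619's picture
# feat: change ipa parser to support ŋ
# 443b650
# raw
# history blame
# 1.65 kB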
import re
import gradio as gr
from omegaconf import OmegaConf
# Load the grapheme-to-phoneme configuration once, at import time.
# The path is relative, so it is resolved against the current working
# directory of the process that imports this module.
g2p_config = OmegaConf.load("configs/g2p.yaml")
# Convert the OmegaConf container into plain Python objects so the tables can
# be indexed and merged like ordinary dicts below.
g2p_object = OmegaConf.to_object(g2p_config)
# Table stored under g2p -> "阿美_秀姑巒" in the config; text_to_ipa uses its
# keys as orthographic units and its values as their replacements.
XIUGULUAN_G2P = g2p_object["g2p"]["阿美_秀姑巒"]
# Punctuation marks that are accepted in the input and passed through as-is.
_PUNCTUATION_G2P = {",": ",", ".": ".", "?": "?", "!": "!"}

# Single-character normalisations applied to the converted string. None of the
# replacement strings contains one of the keys, so one translate() pass gives
# the same result as chaining the corresponding str.replace() calls.
_IPA_NORMALIZATION = str.maketrans(
    {
        "g": "ɡ",
        "ʦ": "t͡s",
        "ʨ": "t͡ɕ",
        "R": "ʀ",
        "ʤ": "dʒ",
    }
)


def _compile_grapheme_pattern(g2p):
    """Return a regex matching any key of *g2p*, trying longer keys first."""
    # Empty keys are skipped: an empty alternative would match everywhere.
    keys = sorted((key for key in g2p if key), key=len, reverse=True)
    return re.compile("|".join(map(re.escape, keys)))


def text_to_ipa(
    text: str, ignore_punctuation=False, ipa_with_ng=False, *, g2p=None
) -> str:
    """Convert orthographic text to an IPA string using a G2P table.

    The text is lower-cased, straight apostrophes are replaced by "’", and
    whitespace runs are collapsed. Each word is then split left to right into
    the longest matching table keys (plus ``,``, ``.``, ``?`` and ``!``), and
    every key is replaced by its table value exactly once. Already-converted
    output is never re-scanned, so a value that happens to equal another key
    (e.g. ``tj -> c`` alongside ``c -> ʦ``) is not converted a second time.

    Args:
        text: Input text.
        ignore_punctuation: If true, remove ``.``, ``?``, ``!`` and ``,`` from
            the result.
        ipa_with_ng: If true, write "ŋ" as "nɡ" in the result.
        g2p: Optional mapping from orthographic units to their replacements.
            Defaults to the module-level ``XIUGULUAN_G2P`` table.

    Returns:
        The converted words joined by single spaces.

    Raises:
        gr.Error: If any word contains characters that no table key covers.
            The message lists the offending characters in sorted order.
    """
    text = text.lower().replace("'", "’")
    text = re.sub(r"\s+", " ", text)  # remove extra spaces
    words = text.split()  # change in future
    print(f"text: {words}")

    if g2p is None:
        g2p = XIUGULUAN_G2P
    extended_g2p = {**g2p, **_PUNCTUATION_G2P}
    pattern = _compile_grapheme_pattern(extended_g2p)

    ipa_words = []
    unknown_chars = set()
    for word in words:
        # Whatever is left after removing every recognised unit is unknown.
        leftover = pattern.sub("", word)
        if leftover:
            unknown_chars.update(leftover)
            continue
        ipa_words.append(
            pattern.sub(lambda match: extended_g2p[match.group()], word)
        )

    if unknown_chars:
        raise gr.Error(
            f"Unknown characters: {', '.join(sorted(unknown_chars))}. Please remove them and try again."
        )

    ipa = " ".join(ipa_words).translate(_IPA_NORMALIZATION)
    if ignore_punctuation:
        ipa = re.sub(r"[.?!,]", "", ipa)
    if ipa_with_ng:
        ipa = ipa.replace("ŋ", "nɡ")
    print(f"ipa: {ipa}")
    return ipa