| text_norm: | |
| postprocess: | |
| - # EN2CN | |
| "…" : "。" | |
| "!" : "!" | |
| "\\?" : "?" | |
| ";" : ";" | |
| ":" : ":" | |
| "," : "," | |
| "\\(" : "(" | |
| "\\)" : ")" | |
| - # CN2CN:中文省略号“……” | |
| "……": "。" | |
| - # OTHER2CN | |
| "﹐" : "," | |
| "﹔" : ";" | |
| "。" : "。" | |
| # CN2CN | |
| ";" : "。" | |
| ":" : "," | |
| "、" : "," | |
| - # 处理连续句号"。" | |
| "。+": "。" | |
| - # 将正则处理后残留的 "/" 替换为“每” | |
| "/": "每" | |
| - # 将下划线 "_" 替换为空格 | |
| "_": " " | |
| - # 处理正则后残留的[~~]+:先分别合并连续的半角“~”和全角“~”,再统一替换为“。”(本段规则不包含替换为“至”的情况) | |
| "~+": "~" | |
| "~+": "~" | |
| "[~~]": "。" | |
| - # 将不在英文单词内部的“-”和 "'" 替换为“,”(英文字母之间的保持不变) | |
| "(?<=[^a-zA-Z])[-']+": "," | |
| "[-']+(?=[^a-zA-Z])": "," | |
| - # 删除除“。!?,”、中文、英文字母、空格外的其他字符(注意:下方字符集未包含数字、“-”和“'”,它们同样会被删除) | |
| "[^。!?,\u4e00-\u4E27\u4E29-\u4E3E\u4E42-\u9fa4a-zA-Z ]": "" | |
| - # 处理连续逗号"," | |
| ",+": "," | |
| - # 处理连续空格" " | |
| " +": " " | |
| split_token: ["。", ","] | |
| split_cn_length: null |