File size: 1,122 Bytes
81a8221
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
# Text-normalization settings.
#
# `postprocess` is a list of mappings; every key is a regular-expression
# pattern and every value is the replacement text for that pattern.
# NOTE(review): the rules look order-dependent (later rules clean up what
# earlier ones produce), so the consumer presumably applies list items, and
# the entries inside each item, in the order written — confirm.
# NOTE(review): as written, several rules map a character to itself
# (e.g. "!" -> "!"); presumably one side was a full-width form in the
# authoring copy — confirm the file's encoding before relying on them.
text_norm:
  postprocess:
    # English punctuation -> Chinese punctuation.
    # "\\?", "\\(" and "\\)" are escaped because the keys are regex patterns.
    - "…": "。"
      "!": "!"
      "\\?": "?"
      ";": ";"
      ":": ":"
      ",": ","
      "\\(": "("
      "\\)": ")"
    # English -> Chinese: a doubled ellipsis becomes one full stop.
    - "……": "。"
    # Other punctuation variants -> Chinese punctuation.
    - "﹐": ","
      "﹔": ";"
      "。": "。"
      # Chinese -> Chinese: fold ";" into a full stop, ":" and "、" into ",".
      ";": "。"
      ":": ","
      "、": ","
    # Collapse a run of consecutive full stops into one.
    - "。+": "。"
    # A "/" left over from the earlier regex stage is read as "每".
    - "/": "每"
    # Underscores become spaces.
    - "_": " "
    # Tildes left over from the earlier regex stage: collapse a run to a
    # single "~", then turn the remaining tilde into a full stop.
    # Only one "~+" rule is listed because a mapping cannot hold the same
    # key twice.
    # NOTE(review): the original comment says the tilde becomes "。" or "至"
    # depending on whether it ends a sentence; no rule here produces "至" —
    # confirm that case is handled elsewhere.
    - "~+": "~"
      "[~~]": "。"
    # "-" and "'" that are not inside an English word: a run preceded by a
    # non-letter, or followed by a non-letter, is replaced with ",".
    - "(?<=[^a-zA-Z])[-']+": ","
      "[-']+(?=[^a-zA-Z])": ","
    # Delete every character outside the keep-list: the punctuation
    # 。 ! ? , the CJK ranges U+4E00-U+4E27, U+4E29-U+4E3E and U+4E42-U+9FA4
    # (so U+4E28 and U+4E3F-U+4E41 are removed), ASCII letters, and space.
    # NOTE(review): the original comment also lists "-", "'" and digits as
    # kept, but the character class does not contain them — confirm whether
    # they are meant to be removed here.
    - "[^。!?,\u4e00-\u4E27\u4E29-\u4E3E\u4E42-\u9fa4a-zA-Z ]": ""
    # Collapse a run of consecutive commas into one.
    - ",+": ","
    # Collapse a run of consecutive spaces into one.
    - " +": " "
             
# Tokens listed for splitting: the full stop and the comma.
# NOTE(review): how the consumer splits on these is not visible here — confirm.
split_token:
  - "。"
  - ","
# Explicitly null, i.e. no value configured.
# NOTE(review): presumably this disables a length-based split of Chinese
# text — confirm against the consumer.
split_cn_length: null