Add en_normalization and fix LangSegmenter (#2062)

main
KamioRinn authored 6 months ago, committed by GitHub
parent c70daefea2
commit c17dd642c7

@@ -7,7 +7,7 @@ sys.path.append(now_dir)
 import re
 import torch
-import LangSegment
+from text.LangSegmenter import LangSegmenter
 from text import chinese
 from typing import Dict, List, Tuple
 from text.cleaner import clean_text
@@ -20,7 +20,7 @@ from tools.i18n.i18n import I18nAuto, scan_language_list
 language=os.environ.get("language","Auto")
 language=sys.argv[-1] if sys.argv[-1] in scan_language_list() else language
 i18n = I18nAuto(language=language)
-punctuation = set(['!', '?', '…', ',', '.', '-'," "])
+punctuation = set(['!', '?', '…', ',', '.', '-'])

 def get_first(text:str) -> str:
     pattern = "[" + "".join(re.escape(sep) for sep in splits) + "]"
@@ -119,12 +119,7 @@ class TextPreprocessor:
     def get_phones_and_bert(self, text:str, language:str, version:str, final:bool=False):
         if language in {"en", "all_zh", "all_ja", "all_ko", "all_yue"}:
             language = language.replace("all_","")
-            if language == "en":
-                LangSegment.setfilters(["en"])
-                formattext = " ".join(tmp["text"] for tmp in LangSegment.getTexts(text))
-            else:
-                # Chinese/Japanese/Korean Han characters can't be told apart, so defer to the user's input
-                formattext = text
+            formattext = text
             while "  " in formattext:
                 formattext = formattext.replace("  ", " ")
             if language == "zh":
@@ -148,19 +143,18 @@ class TextPreprocessor:
         elif language in {"zh", "ja", "ko", "yue", "auto", "auto_yue"}:
             textlist=[]
             langlist=[]
-            LangSegment.setfilters(["zh","ja","en","ko"])
             if language == "auto":
-                for tmp in LangSegment.getTexts(text):
+                for tmp in LangSegmenter.getTexts(text):
                     langlist.append(tmp["lang"])
                     textlist.append(tmp["text"])
             elif language == "auto_yue":
-                for tmp in LangSegment.getTexts(text):
+                for tmp in LangSegmenter.getTexts(text):
                     if tmp["lang"] == "zh":
                         tmp["lang"] = "yue"
                     langlist.append(tmp["lang"])
                     textlist.append(tmp["text"])
             else:
-                for tmp in LangSegment.getTexts(text):
+                for tmp in LangSegmenter.getTexts(text):
                     if tmp["lang"] == "en":
                         langlist.append(tmp["lang"])
                     else:
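For context: as the loops above show, LangSegmenter.getTexts yields one dict per detected language segment, with "lang" and "text" keys. A minimal usage sketch; the sample sentence and its printed segmentation are illustrative, not from the commit:

from text.LangSegmenter import LangSegmenter

# Each yielded segment is a dict like {"lang": "zh", "text": "..."},
# matching the tmp["lang"] / tmp["text"] accesses in the hunk above.
for seg in LangSegmenter.getTexts("今天的 weather 挺不错"):
    print(seg["lang"], seg["text"])  # e.g. zh 今天的 / en weather / zh 挺不错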

@@ -135,7 +135,7 @@ def cut3(inp):
 @register_method("cut4")
 def cut4(inp):
     inp = inp.strip("\n")
-    opts = ["%s" % item for item in inp.strip(".").split(".")]
+    opts = re.split(r'(?<!\d)\.(?!\d)', inp.strip("."))
     opts = [item for item in opts if not set(item).issubset(punctuation)]
     return "\n".join(opts)

@@ -380,11 +380,7 @@ from text import chinese
 def get_phones_and_bert(text,language,version,final=False):
     if language in {"en", "all_zh", "all_ja", "all_ko", "all_yue"}:
         language = language.replace("all_","")
-        if language == "en":
-            formattext = text
-        else:
-            # Chinese/Japanese/Korean Han characters can't be told apart, so defer to the user's input
-            formattext = text
+        formattext = text
         while "  " in formattext:
             formattext = formattext.replace("  ", " ")
         if language == "zh":
@@ -738,7 +734,7 @@ def cut3(inp):
 def cut4(inp):
     inp = inp.strip("\n")
-    opts = ["%s" % item for item in inp.strip(".").split(".")]
+    opts = re.split(r'(?<!\d)\.(?!\d)', inp.strip("."))
     opts = [item for item in opts if not set(item).issubset(punctuation)]
     return "\n".join(opts)

@@ -0,0 +1,275 @@
+# by https://github.com/Cosmo-klara
+
+from __future__ import print_function
+
+import re
+import inflect
+import unicodedata
+
+# Replacement table for suffixed measurement units
+measurement_map = {
+    "m": ["meter", "meters"],
+    'km': ["kilometer", "kilometers"],
+    "km/h": ["kilometer per hour", "kilometers per hour"],
+    "ft": ["feet", "feet"],
+    "L": ["liter", "liters"],
+    "tbsp": ["tablespoon", "tablespoons"],
+    'tsp': ["teaspoon", "teaspoons"],
+    "h": ["hour", "hours"],
+    "min": ["minute", "minutes"],
+    "s": ["second", "seconds"],
+    "°C": ["degree celsius", "degrees celsius"],
+    "°F": ["degree fahrenheit", "degrees fahrenheit"]
+}
+_inflect = inflect.engine()
+
+# Convert numeric ordinals
+_ordinal_number_re = re.compile(r'\b([0-9]+)\. ')
+
+# Recognize numbers like 12,000
+# (reportedly \d works a little better than [0-9] for matching digits)
+_comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])')
+
+# Time recognition
+_time_re = re.compile(r'\b([01]?[0-9]|2[0-3]):([0-5][0-9])\b')
+
+# Suffixed measurement-unit recognition
+_measurement_re = re.compile(r'\b([0-9]+(\.[0-9]+)?(m|km|km/h|ft|L|tbsp|tsp|h|min|s|°C|°F))\b')
+
+# £ before or after the amount (a single pattern for either side was attempted but failed for unclear reasons)
+_pounds_re_start = re.compile(r'£([0-9\.\,]*[0-9]+)')
+_pounds_re_end = re.compile(r'([0-9\.\,]*[0-9]+)£')
+
+# $ before or after the amount
+_dollars_re_start = re.compile(r'\$([0-9\.\,]*[0-9]+)')
+_dollars_re_end = re.compile(r'([0-9\.\,]*[0-9]+)\$')
+
+# Decimal recognition
+_decimal_number_re = re.compile(r'([0-9]+\.\s*[0-9]+)')
+
+# Fraction recognition (of the form "3/4")
+_fraction_re = re.compile(r'([0-9]+/[0-9]+)')
+
+# Ordinal recognition
+_ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)')
+
+# Number handling
+_number_re = re.compile(r'[0-9]+')
+def _convert_ordinal(m):
+    """
+    Normalize numeric ordinals, e.g.: 1. 2. 3. 4. 5. 6.
+    Examples:
+        input: "1. "
+        output: "1st, "
+    _expand_ordinal below then turns these into words like "first".
+    """
+    ordinal = _inflect.ordinal(m.group(1))
+    return ordinal + ", "
+
+
+def _remove_commas(m):
+    return m.group(1).replace(',', '')
+def _expand_time(m):
+    """
+    Convert 24-hour times to a 12-hour reading.
+    Examples:
+        input: "13:00 / 4:00 / 13:30"
+        output: "one o'clock p.m. / four o'clock a.m. / one thirty p.m."
+    """
+    hours, minutes = map(int, m.group(1, 2))
+    period = 'a.m.' if hours < 12 else 'p.m.'
+    if hours > 12:
+        hours -= 12
+
+    hour_word = _inflect.number_to_words(hours)
+    minute_word = _inflect.number_to_words(minutes) if minutes != 0 else ''
+
+    if minutes == 0:
+        return f"{hour_word} o'clock {period}"
+    else:
+        return f"{hour_word} {minute_word} {period}"
+def _expand_measurement(m):
+    """
+    Expand common measurement-unit suffixes; currently supported: m, km, km/h, ft, L, tbsp, tsp, h, min, s, °C, °F
+    To extend, update _measurement_re and measurement_map.
+    """
+    sign = m.group(3)
+    ptr = 1
+    # No tidy way to pull out just the number without reworking the regex; a value
+    # like 1.2 takes the plural reading anyway, so simply drop the "."
+    num = int(m.group(1).replace(sign, '').replace(".", ''))
+    decimal_part = m.group(2)
+    # Plug the hole in the check above: cases like 0.1 are excluded here
+    if decimal_part == None and num == 1:
+        ptr = 0
+    return m.group(1).replace(sign, " " + measurement_map[sign][ptr])
+def _expand_pounds(m):
+    """
+    No particularly authoritative convention was found; handled the same way as
+    dollars (the two functions could really be merged).
+    """
+    match = m.group(1)
+    parts = match.split('.')
+    if len(parts) > 2:
+        return match + ' pounds'  # Unexpected format
+    pounds = int(parts[0]) if parts[0] else 0
+    pence = int(parts[1].ljust(2, '0')) if len(parts) > 1 and parts[1] else 0
+    if pounds and pence:
+        pound_unit = 'pound' if pounds == 1 else 'pounds'
+        penny_unit = 'penny' if pence == 1 else 'pence'
+        return '%s %s and %s %s' % (pounds, pound_unit, pence, penny_unit)
+    elif pounds:
+        pound_unit = 'pound' if pounds == 1 else 'pounds'
+        return '%s %s' % (pounds, pound_unit)
+    elif pence:
+        penny_unit = 'penny' if pence == 1 else 'pence'
+        return '%s %s' % (pence, penny_unit)
+    else:
+        return 'zero pounds'
+def _expand_dollars(m):
+    """
+    change: cents top out at 100, so the fractional part is zero-padded
+    Example:
+        input: "32.3$ / $6.24"
+        output: "thirty-two dollars and thirty cents" / "six dollars and twenty-four cents"
+    """
+    match = m.group(1)
+    parts = match.split('.')
+    if len(parts) > 2:
+        return match + ' dollars'  # Unexpected format
+    dollars = int(parts[0]) if parts[0] else 0
+    cents = int(parts[1].ljust(2, '0')) if len(parts) > 1 and parts[1] else 0
+    if dollars and cents:
+        dollar_unit = 'dollar' if dollars == 1 else 'dollars'
+        cent_unit = 'cent' if cents == 1 else 'cents'
+        return '%s %s and %s %s' % (dollars, dollar_unit, cents, cent_unit)
+    elif dollars:
+        dollar_unit = 'dollar' if dollars == 1 else 'dollars'
+        return '%s %s' % (dollars, dollar_unit)
+    elif cents:
+        cent_unit = 'cent' if cents == 1 else 'cents'
+        return '%s %s' % (cents, cent_unit)
+    else:
+        return 'zero dollars'
+# Decimal handling
+def _expand_decimal_number(m):
+    """
+    Example:
+        input: "13.234"
+        output: "thirteen point two three four"
+    """
+    match = m.group(1)
+    parts = match.split('.')
+    words = []
+    # Iterate over each character of the fractional part
+    for char in parts[1]:
+        if char == '.':
+            words.append("point")
+        else:
+            words.append(char)
+    return parts[0] + " point " + " ".join(words)
+# Fraction handling
+def _expend_fraction(m):
+    """
+    Rule 1: read the numerator as a cardinal and the denominator as an ordinal.
+    Rule 2: if the numerator is greater than 1, pluralize the denominator's ordinal.
+    Rule 3: when the denominator is 2, read it as "half"; if the numerator is also
+            greater than 1, pluralize to "halves".
+    Examples:
+    | Written | Said |
+    |:---:|:---:|
+    | 1/3 | one third |
+    | 3/4 | three fourths |
+    | 5/6 | five sixths |
+    | 1/2 | one half |
+    | 3/2 | three halves |
+    """
+    match = m.group(0)
+    numerator, denominator = map(int, match.split('/'))
+
+    numerator_part = _inflect.number_to_words(numerator)
+    if denominator == 2:
+        if numerator == 1:
+            denominator_part = 'half'
+        else:
+            denominator_part = 'halves'
+    elif denominator == 1:
+        return f'{numerator_part}'
+    else:
+        denominator_part = _inflect.ordinal(_inflect.number_to_words(denominator))
+        if numerator > 1:
+            denominator_part += 's'
+
+    return f'{numerator_part} {denominator_part}'
+def _expand_ordinal(m):
+    return _inflect.number_to_words(m.group(0))
+
+
+def _expand_number(m):
+    num = int(m.group(0))
+    if num > 1000 and num < 3000:
+        if num == 2000:
+            return 'two thousand'
+        elif num > 2000 and num < 2010:
+            return 'two thousand ' + _inflect.number_to_words(num % 100)
+        elif num % 100 == 0:
+            return _inflect.number_to_words(num // 100) + ' hundred'
+        else:
+            return _inflect.number_to_words(num, andword='', zero='oh', group=2).replace(', ', ' ')
+    else:
+        return _inflect.number_to_words(num, andword='')
+def normalize(text):
+    """
+    !!! Every step assumes well-formed input !!!
+    New handling can be added by just adding a regex and a matching handler function.
+    """
+    text = re.sub(_ordinal_number_re, _convert_ordinal, text)
+    text = re.sub(r'(?<!\d)-|-(?!\d)', ' minus ', text)
+    text = re.sub(_comma_number_re, _remove_commas, text)
+    text = re.sub(_time_re, _expand_time, text)
+    text = re.sub(_measurement_re, _expand_measurement, text)
+    text = re.sub(_pounds_re_start, _expand_pounds, text)
+    text = re.sub(_pounds_re_end, _expand_pounds, text)
+    text = re.sub(_dollars_re_start, _expand_dollars, text)
+    text = re.sub(_dollars_re_end, _expand_dollars, text)
+    text = re.sub(_decimal_number_re, _expand_decimal_number, text)
+    text = re.sub(_fraction_re, _expend_fraction, text)
+    text = re.sub(_ordinal_re, _expand_ordinal, text)
+    text = re.sub(_number_re, _expand_number, text)
+
+    text = ''.join(char for char in unicodedata.normalize('NFD', text)
+                   if unicodedata.category(char) != 'Mn')  # Strip accents
+
+    text = re.sub("%", " percent", text)
+    text = re.sub("[^ A-Za-z'.,?!\-]", "", text)
+    text = re.sub(r"(?i)i\.e\.", "that is", text)
+    text = re.sub(r"(?i)e\.g\.", "for example", text)
+
+    # Added: split all-caps words apart
+    text = re.sub(r'(?<!^)(?<![\s])([A-Z])', r' \1', text)
+
+    return text
+if __name__ == '__main__':
+    # It might be worth displaying the normalized result (read-only, or editable
+    # without changing the actual text sent to TTS) and letting the user confirm
+    # before synthesis, so they can catch non-standard input themselves.
+    print(normalize("1. test ordinal number 1st"))
+    print(normalize("32.3$, $6.24, 1.1£, £7.14."))
+    print(normalize("3/23, 1/2, 3/2, 1/3, 6/1"))
+    print(normalize("1st, 22nd"))
+    print(normalize("a test 20h, 1.2s, 1L, 0.1km"))
+    print(normalize("a test of time 4:00, 13:00, 13:30"))
+    print(normalize("a test of temperature 4°F, 23°C, -19°C"))

@@ -10,7 +10,7 @@ from text.symbols2 import symbols
 import unicodedata
 from builtins import str as unicode
-from g2p_en.expand import normalize_numbers
+from text.en_normalization.expend import normalize
 from nltk.tokenize import TweetTokenizer
 word_tokenize = TweetTokenizer().tokenize
 from nltk import pos_tag
@@ -22,6 +22,17 @@ CMU_DICT_HOT_PATH = os.path.join(current_file_path, "engdict-hot.rep")
 CACHE_PATH = os.path.join(current_file_path, "engdict_cache.pickle")
 NAMECACHE_PATH = os.path.join(current_file_path, "namedict_cache.pickle")
 
+# Map Chinese and g2p_en punctuation
+rep_map = {
+    "[;:：，；]": ",",
+    '["’]': "'",
+    "。": ".",
+    "！": "!",
+    "？": "?",
+}
+
 arpa = {
     "AH0",
     "S",
@@ -220,32 +231,16 @@ def get_namedict():
 
 def text_normalize(text):
     # todo: eng text normalize
-    # Map Chinese and g2p_en punctuation
-    rep_map = {
-        "[;:：，；]": ",",
-        '["’]': "'",
-        "。": ".",
-        "！": "!",
-        "？": "?",
-    }
-    for p, r in rep_map.items():
-        text = re.sub(p, r, text)
+    # Same effect as before; kept consistent with chinese.py
+    pattern = re.compile("|".join(re.escape(p) for p in rep_map.keys()))
+    text = pattern.sub(lambda x: rep_map[x.group()], text)
 
-    # Text formatting pipeline taken from g2p_en
-    # Added uppercase compatibility
-    # Added splitting of all-caps words
     text = unicode(text)
-    text = normalize_numbers(text)
-    text = ''.join(char for char in unicodedata.normalize('NFD', text)
-                   if unicodedata.category(char) != 'Mn')  # Strip accents
-    text = re.sub("[^ A-Za-z'.,?!\-]", "", text)
-    text = re.sub(r"(?i)i\.e\.", "that is", text)
-    text = re.sub(r"(?i)e\.g\.", "for example", text)
-    text = re.sub(r'(?<!^)(?<![\s])([A-Z])', r' \1', text)
+    text = normalize(text)
 
     # Avoid reference leakage caused by repeated punctuation
     text = replace_consecutive_punctuation(text)
     return text
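As a standalone illustration of the table-driven replacement text_normalize now uses: the keys pass through re.escape, so each one is matched as a literal string rather than as a regex (the rep_map subset and sample input below are made up):

import re

rep_map = {"。": ".", "！": "!", "？": "?"}
# One compiled alternation replaces the old per-key re.sub loop.
pattern = re.compile("|".join(re.escape(p) for p in rep_map.keys()))
print(pattern.sub(lambda x: rep_map[x.group()], "你好。真的！好吗？"))
# -> 你好.真的!好吗?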
