From c17dd642c774d74b2e765c08063755f4c6c62109 Mon Sep 17 00:00:00 2001 From: KamioRinn <63162909+KamioRinn@users.noreply.github.com> Date: Mon, 17 Feb 2025 18:41:30 +0800 Subject: [PATCH] Add en_normalization and fix LangSegmenter (#2062) --- GPT_SoVITS/TTS_infer_pack/TextPreprocessor.py | 18 +- .../text_segmentation_method.py | 2 +- GPT_SoVITS/inference_webui.py | 8 +- GPT_SoVITS/text/en_normalization/expend.py | 275 ++++++++++++++++++ GPT_SoVITS/text/english.py | 41 ++- 5 files changed, 302 insertions(+), 42 deletions(-) create mode 100644 GPT_SoVITS/text/en_normalization/expend.py diff --git a/GPT_SoVITS/TTS_infer_pack/TextPreprocessor.py b/GPT_SoVITS/TTS_infer_pack/TextPreprocessor.py index 782e15e..9def3da 100644 --- a/GPT_SoVITS/TTS_infer_pack/TextPreprocessor.py +++ b/GPT_SoVITS/TTS_infer_pack/TextPreprocessor.py @@ -7,7 +7,7 @@ sys.path.append(now_dir) import re import torch -import LangSegment +from text.LangSegmenter import LangSegmenter from text import chinese from typing import Dict, List, Tuple from text.cleaner import clean_text @@ -20,7 +20,7 @@ from tools.i18n.i18n import I18nAuto, scan_language_list language=os.environ.get("language","Auto") language=sys.argv[-1] if sys.argv[-1] in scan_language_list() else language i18n = I18nAuto(language=language) -punctuation = set(['!', '?', '…', ',', '.', '-'," "]) +punctuation = set(['!', '?', '…', ',', '.', '-']) def get_first(text:str) -> str: pattern = "[" + "".join(re.escape(sep) for sep in splits) + "]" @@ -119,12 +119,7 @@ class TextPreprocessor: def get_phones_and_bert(self, text:str, language:str, version:str, final:bool=False): if language in {"en", "all_zh", "all_ja", "all_ko", "all_yue"}: language = language.replace("all_","") - if language == "en": - LangSegment.setfilters(["en"]) - formattext = " ".join(tmp["text"] for tmp in LangSegment.getTexts(text)) - else: - # 因无法区别中日韩文汉字,以用户输入为准 - formattext = text + formattext = text while " " in formattext: formattext = formattext.replace(" ", " ") if language == "zh": @@ -148,19 +143,18 @@ class TextPreprocessor: elif language in {"zh", "ja", "ko", "yue", "auto", "auto_yue"}: textlist=[] langlist=[] - LangSegment.setfilters(["zh","ja","en","ko"]) if language == "auto": - for tmp in LangSegment.getTexts(text): + for tmp in LangSegmenter.getTexts(text): langlist.append(tmp["lang"]) textlist.append(tmp["text"]) elif language == "auto_yue": - for tmp in LangSegment.getTexts(text): + for tmp in LangSegmenter.getTexts(text): if tmp["lang"] == "zh": tmp["lang"] = "yue" langlist.append(tmp["lang"]) textlist.append(tmp["text"]) else: - for tmp in LangSegment.getTexts(text): + for tmp in LangSegmenter.getTexts(text): if tmp["lang"] == "en": langlist.append(tmp["lang"]) else: diff --git a/GPT_SoVITS/TTS_infer_pack/text_segmentation_method.py b/GPT_SoVITS/TTS_infer_pack/text_segmentation_method.py index 396c61f..4ee0cfb 100644 --- a/GPT_SoVITS/TTS_infer_pack/text_segmentation_method.py +++ b/GPT_SoVITS/TTS_infer_pack/text_segmentation_method.py @@ -135,7 +135,7 @@ def cut3(inp): @register_method("cut4") def cut4(inp): inp = inp.strip("\n") - opts = ["%s" % item for item in inp.strip(".").split(".")] + opts = re.split(r'(? 12: + hours -= 12 + + hour_word = _inflect.number_to_words(hours) + minute_word = _inflect.number_to_words(minutes) if minutes != 0 else '' + + if minutes == 0: + return f"{hour_word} o'clock {period}" + else: + return f"{hour_word} {minute_word} {period}" + + +def _expand_measurement(m): + """ + 处理一些常见的测量单位后缀, 目前支持: m, km, km/h, ft, L, tbsp, tsp, h, min, s, °C, °F + 如果要拓展的话修改: _measurement_re 和 measurement_map + """ + sign = m.group(3) + ptr = 1 + # 想不到怎么方便的取数字,又懒得改正则,诶,1.2 反正也是复数读法,干脆直接去掉 "." + num = int(m.group(1).replace(sign, '').replace(".",'')) + decimal_part = m.group(2) + # 上面判断的漏洞,比如 0.1 的情况,在这里排除了 + if decimal_part == None and num == 1: + ptr = 0 + return m.group(1).replace(sign, " " + measurement_map[sign][ptr]) + + +def _expand_pounds(m): + """ + 没找到特别规范的说明,和美元的处理一样,其实可以把两个合并在一起 + """ + match = m.group(1) + parts = match.split('.') + if len(parts) > 2: + return match + ' pounds' # Unexpected format + pounds = int(parts[0]) if parts[0] else 0 + pence = int(parts[1].ljust(2, '0')) if len(parts) > 1 and parts[1] else 0 + if pounds and pence: + pound_unit = 'pound' if pounds == 1 else 'pounds' + penny_unit = 'penny' if pence == 1 else 'pence' + return '%s %s and %s %s' % (pounds, pound_unit, pence, penny_unit) + elif pounds: + pound_unit = 'pound' if pounds == 1 else 'pounds' + return '%s %s' % (pounds, pound_unit) + elif pence: + penny_unit = 'penny' if pence == 1 else 'pence' + return '%s %s' % (pence, penny_unit) + else: + return 'zero pounds' + +def _expand_dollars(m): + """ + change: 美分是 100 的限值, 应该要做补零的吧 + Example: + input: "32.3$ / $6.24" + output: "thirty-two dollars and thirty cents" / "six dollars and twenty-four cents" + """ + match = m.group(1) + parts = match.split('.') + if len(parts) > 2: + return match + ' dollars' # Unexpected format + dollars = int(parts[0]) if parts[0] else 0 + cents = int(parts[1].ljust(2, '0')) if len(parts) > 1 and parts[1] else 0 + if dollars and cents: + dollar_unit = 'dollar' if dollars == 1 else 'dollars' + cent_unit = 'cent' if cents == 1 else 'cents' + return '%s %s and %s %s' % (dollars, dollar_unit, cents, cent_unit) + elif dollars: + dollar_unit = 'dollar' if dollars == 1 else 'dollars' + return '%s %s' % (dollars, dollar_unit) + elif cents: + cent_unit = 'cent' if cents == 1 else 'cents' + return '%s %s' % (cents, cent_unit) + else: + return 'zero dollars' + +# 小数的处理 +def _expand_decimal_number(m): + """ + Example: + input: "13.234" + output: "thirteen point two three four" + """ + match = m.group(1) + parts = match.split('.') + words = [] + # 遍历字符串中的每个字符 + for char in parts[1]: + if char == '.': + words.append("point") + else: + words.append(char) + return parts[0] + " point " + " ".join(words) + + +# 分数的处理 +def _expend_fraction(m): + """ + 规则1: 分子使用基数词读法, 分母用序数词读法. + 规则2: 如果分子大于 1, 在读分母的时候使用序数词复数读法. + 规则3: 当分母为2的时候, 分母读做 half, 并且当分子大于 1 的时候, half 也要用复数读法, 读为 halves. + Examples: + + | Written | Said | + |:---:|:---:| + | 1/3 | one third | + | 3/4 | three fourths | + | 5/6 | five sixths | + | 1/2 | one half | + | 3/2 | three halves | + """ + match = m.group(0) + numerator, denominator = map(int, match.split('/')) + + numerator_part = _inflect.number_to_words(numerator) + if denominator == 2: + if numerator == 1: + denominator_part = 'half' + else: + denominator_part = 'halves' + elif denominator == 1: + return f'{numerator_part}' + else: + denominator_part = _inflect.ordinal(_inflect.number_to_words(denominator)) + if numerator > 1: + denominator_part += 's' + + return f'{numerator_part} {denominator_part}' + +def _expand_ordinal(m): + return _inflect.number_to_words(m.group(0)) + +def _expand_number(m): + num = int(m.group(0)) + if num > 1000 and num < 3000: + if num == 2000: + return 'two thousand' + elif num > 2000 and num < 2010: + return 'two thousand ' + _inflect.number_to_words(num % 100) + elif num % 100 == 0: + return _inflect.number_to_words(num // 100) + ' hundred' + else: + return _inflect.number_to_words(num, andword='', zero='oh', group=2).replace(', ', ' ') + else: + return _inflect.number_to_words(num, andword='') + + +def normalize(text): + """ + !!! 所有的处理都需要正确的输入 !!! + 可以添加新的处理,只需要添加正则表达式和对应的处理函数即可 + """ + + text = re.sub(_ordinal_number_re, _convert_ordinal, text) + text = re.sub(r'(?