|
|
@ -33,6 +33,7 @@ from .num import RE_NUMBER
|
|
|
|
from .num import RE_PERCENTAGE
|
|
|
|
from .num import RE_PERCENTAGE
|
|
|
|
from .num import RE_POSITIVE_QUANTIFIERS
|
|
|
|
from .num import RE_POSITIVE_QUANTIFIERS
|
|
|
|
from .num import RE_RANGE
|
|
|
|
from .num import RE_RANGE
|
|
|
|
|
|
|
|
from .num import RE_TO_RANGE
|
|
|
|
from .num import replace_default_num
|
|
|
|
from .num import replace_default_num
|
|
|
|
from .num import replace_frac
|
|
|
|
from .num import replace_frac
|
|
|
|
from .num import replace_negative_num
|
|
|
|
from .num import replace_negative_num
|
|
|
@ -40,6 +41,7 @@ from .num import replace_number
|
|
|
|
from .num import replace_percentage
|
|
|
|
from .num import replace_percentage
|
|
|
|
from .num import replace_positive_quantifier
|
|
|
|
from .num import replace_positive_quantifier
|
|
|
|
from .num import replace_range
|
|
|
|
from .num import replace_range
|
|
|
|
|
|
|
|
from .num import replace_to_range
|
|
|
|
from .phonecode import RE_MOBILE_PHONE
|
|
|
|
from .phonecode import RE_MOBILE_PHONE
|
|
|
|
from .phonecode import RE_NATIONAL_UNIFORM_NUMBER
|
|
|
|
from .phonecode import RE_NATIONAL_UNIFORM_NUMBER
|
|
|
|
from .phonecode import RE_TELEPHONE
|
|
|
|
from .phonecode import RE_TELEPHONE
|
|
|
@ -65,7 +67,7 @@ class TextNormalizer():
|
|
|
|
if lang == "zh":
|
|
|
|
if lang == "zh":
|
|
|
|
text = text.replace(" ", "")
|
|
|
|
text = text.replace(" ", "")
|
|
|
|
# 过滤掉特殊字符
|
|
|
|
# 过滤掉特殊字符
|
|
|
|
text = re.sub(r'[——《》【】<=>{}()()#&@“”^_|…\\]', '', text)
|
|
|
|
text = re.sub(r'[——《》【】<=>{}()()#&@“”^_|\\]', '', text)
|
|
|
|
text = self.SENTENCE_SPLITOR.sub(r'\1\n', text)
|
|
|
|
text = self.SENTENCE_SPLITOR.sub(r'\1\n', text)
|
|
|
|
text = text.strip()
|
|
|
|
text = text.strip()
|
|
|
|
sentences = [sentence.strip() for sentence in re.split(r'\n+', text)]
|
|
|
|
sentences = [sentence.strip() for sentence in re.split(r'\n+', text)]
|
|
|
@ -73,8 +75,8 @@ class TextNormalizer():
|
|
|
|
|
|
|
|
|
|
|
|
def _post_replace(self, sentence: str) -> str:
|
|
|
|
def _post_replace(self, sentence: str) -> str:
|
|
|
|
sentence = sentence.replace('/', '每')
|
|
|
|
sentence = sentence.replace('/', '每')
|
|
|
|
sentence = sentence.replace('~', '至')
|
|
|
|
# sentence = sentence.replace('~', '至')
|
|
|
|
sentence = sentence.replace('~', '至')
|
|
|
|
# sentence = sentence.replace('~', '至')
|
|
|
|
sentence = sentence.replace('①', '一')
|
|
|
|
sentence = sentence.replace('①', '一')
|
|
|
|
sentence = sentence.replace('②', '二')
|
|
|
|
sentence = sentence.replace('②', '二')
|
|
|
|
sentence = sentence.replace('③', '三')
|
|
|
|
sentence = sentence.replace('③', '三')
|
|
|
@ -128,6 +130,8 @@ class TextNormalizer():
|
|
|
|
sentence = RE_TIME_RANGE.sub(replace_time, sentence)
|
|
|
|
sentence = RE_TIME_RANGE.sub(replace_time, sentence)
|
|
|
|
sentence = RE_TIME.sub(replace_time, sentence)
|
|
|
|
sentence = RE_TIME.sub(replace_time, sentence)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# 处理~波浪号作为至的替换
|
|
|
|
|
|
|
|
sentence = RE_TO_RANGE.sub(replace_to_range, sentence)
|
|
|
|
sentence = RE_TEMPERATURE.sub(replace_temperature, sentence)
|
|
|
|
sentence = RE_TEMPERATURE.sub(replace_temperature, sentence)
|
|
|
|
sentence = replace_measure(sentence)
|
|
|
|
sentence = replace_measure(sentence)
|
|
|
|
sentence = RE_FRAC.sub(replace_frac, sentence)
|
|
|
|
sentence = RE_FRAC.sub(replace_frac, sentence)
|
|
|
|