|
|
# modified from https://github.com/CjangCjengh/vits/blob/main/text/japanese.py
|
|
|
import re
|
|
|
import sys
|
|
|
|
|
|
import pyopenjtalk
|
|
|
|
|
|
from text import symbols
|
|
|
|
|
|
# Matches a single character that is treated as Japanese text rather than
# punctuation: ASCII letters, digits (\d), the iteration mark U+3005, the
# hiragana/katakana range U+3040-U+30FF, CJK ideographs U+4E00-U+9FFF,
# full-width digits/letters and half-width katakana U+FF66-U+FF9D.
_japanese_characters = re.compile(
    r'[A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]'
)
|
|
|
|
|
|
# Complement of the Japanese-text character class: matches a single
# character that is NOT an ASCII/full-width letter or digit, kana, kanji or
# the iteration mark -- i.e. punctuation, whitespace and any other symbol.
_japanese_marks = re.compile(
    r'[^A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]'
)
|
|
|
|
|
|
# (compiled pattern, Japanese reading) pairs: every match of the pattern is
# replaced by its spoken Japanese form before phonemization.
_symbols_to_japanese = [
    (re.compile(pattern), reading)
    for pattern, reading in [
        ('%', 'パーセント'),
    ]
]
|
|
|
|
|
|
|
|
|
# (compiled pattern, replacement) pairs for the sokuon placeholder "Q".
# Each pattern captures "Q" followed by optional accent arrows and a
# consonant; the replacement substitutes a consonant-specific prefix for
# the "Q" and keeps the captured tail.
_real_sokuon = [
    (re.compile(pattern), replacement)
    for pattern, replacement in (
        (r'Q([↑↓]*[kg])', r'k#\1'),
        (r'Q([↑↓]*[tdjʧ])', r't#\1'),
        (r'Q([↑↓]*[sʃ])', r's\1'),
        (r'Q([↑↓]*[pb])', r'p#\1'),
    )
]
|
|
|
|
|
|
# (compiled pattern, replacement) pairs for the hatsuon placeholder "N".
# Each pattern captures "N" followed by optional accent arrows and a
# consonant; the replacement swaps the "N" for a nasal chosen by that
# consonant and keeps the captured tail.
_real_hatsuon = [
    (re.compile(pattern), replacement)
    for pattern, replacement in (
        (r'N([↑↓]*[pbm])', r'm\1'),
        (r'N([↑↓]*[ʧʥj])', r'n^\1'),
        (r'N([↑↓]*[tdn])', r'n\1'),
        (r'N([↑↓]*[kg])', r'ŋ\1'),
    )
]
|
|
|
|
|
|
|
|
|
|
|
|
def post_replace_ph(ph):
    """Map one phoneme/punctuation token onto the model's symbol inventory.

    The token is first passed through a small punctuation table (for
    example ``'。'`` becomes ``'.'`` and ``'、'`` becomes ``','``); tokens
    without an entry are left as they are.  The result is returned if it is
    contained in ``symbols``; otherwise the placeholder ``'UNK'`` is
    returned.
    """
    punctuation_map = {
        ':': ',',
        ';': ',',
        ',': ',',
        '。': '.',
        '!': '!',
        '?': '?',
        '\n': '.',
        "·": ",",
        '、': ",",
        '...': '…',
    }
    token = punctuation_map.get(ph, ph)
    # Anything outside the known symbol set collapses to the unknown marker.
    return token if token in symbols else 'UNK'
|
|
|
|
|
|
def symbols_to_japanese(text):
    """Return *text* with every pattern in ``_symbols_to_japanese`` replaced
    by its Japanese reading, applying the pairs in list order."""
    converted = text
    for pattern, reading in _symbols_to_japanese:
        converted = pattern.sub(reading, converted)
    return converted
|
|
|
|
|
|
|
|
|
def preprocess_jap(text):
    """Convert Japanese text into a flat list of phoneme and mark tokens.

    Reference https://r9y9.github.io/ttslearn/latest/notebooks/ch10_Recipe-Tacotron.html

    The text is split at every character matched by ``_japanese_marks``.
    Each resulting run that starts with a ``_japanese_characters`` match is
    phonemized with ``pyopenjtalk.g2p`` and its space-separated phonemes are
    appended to the result, followed by the mark that terminated the run.

    Empty tokens are never emitted: a mark that is a plain space (which
    becomes ``''`` once spaces are stripped) and an empty phonemizer result
    are both skipped instead of being appended as ``''``.

    Returns:
        list[str]: phoneme and mark tokens in their original order.
    """
    text = symbols_to_japanese(text)
    # split() and findall() use the same pattern, so runs[i] is followed by
    # marks[i]; there is always exactly one more run than there are marks.
    runs = _japanese_marks.split(text)
    marks = _japanese_marks.findall(text)
    tokens = []
    for i, run in enumerate(runs):
        if _japanese_characters.match(run):
            phonemes = pyopenjtalk.g2p(run)
            # ''.split(' ') yields [''], so filter out empty pieces.
            tokens += [p for p in phonemes.split(" ") if p]
        if i < len(marks):
            mark = marks[i].replace(' ', '')
            if mark:
                tokens.append(mark)
    return tokens
|
|
|
|
|
|
def text_normalize(text):
    """Return *text* unchanged; Japanese normalization is not implemented yet."""
    # todo: jap text normalize
    normalized = text
    return normalized
|
|
|
|
|
|
def g2p(norm_text):
    """Turn normalized Japanese text into a list of model symbols.

    The text is tokenized by ``preprocess_jap`` and every token is then
    passed through ``post_replace_ph``.
    """
    raw_tokens = preprocess_jap(norm_text)
    # todo: implement tones and word2ph
    return list(map(post_replace_ph, raw_tokens))
|
|
|
|
|
|
|
|
|
if __name__ == '__main__':
    # Ad-hoc smoke test: phonemize the second colon-separated field of every
    # line of a local transcript file and print the resulting symbols.
    # The file is opened explicitly as UTF-8 (the platform default encoding
    # may differ) and inside a context manager so the handle is closed.
    with open("../../../Downloads/transcript_utf8.txt", encoding="utf-8") as transcript:
        for line in transcript:
            fields = line.split(":")
            if len(fields) < 2:
                # No ':' separator on this line (e.g. a blank line): there is
                # no text field to convert, so skip it instead of crashing.
                continue
            phones = g2p(fields[1])
            print(phones)
|