|
|
@ -75,7 +75,7 @@ def xtts(text, speaker, language, server_url, stream_chunk_size) -> Iterator[byt
|
|
|
|
return
|
|
|
|
return
|
|
|
|
|
|
|
|
|
|
|
|
first = True
|
|
|
|
first = True
|
|
|
|
for chunk in res.iter_content(chunk_size=960):
|
|
|
|
for chunk in res.iter_content(chunk_size=960): #24K*20ms*2
|
|
|
|
if first:
|
|
|
|
if first:
|
|
|
|
end = time.perf_counter()
|
|
|
|
end = time.perf_counter()
|
|
|
|
print(f"xtts Time to first chunk: {end-start}s")
|
|
|
|
print(f"xtts Time to first chunk: {end-start}s")
|
|
|
@ -85,12 +85,20 @@ def xtts(text, speaker, language, server_url, stream_chunk_size) -> Iterator[byt
|
|
|
|
|
|
|
|
|
|
|
|
print("xtts response.elapsed:", res.elapsed)
|
|
|
|
print("xtts response.elapsed:", res.elapsed)
|
|
|
|
|
|
|
|
|
|
|
|
def gpt_sovits(text, speaker, language, server_url, stream_chunk_size) -> Iterator[bytes]:
|
|
|
|
def gpt_sovits(text, character, language, server_url, stream_chunk_size) -> Iterator[bytes]:
|
|
|
|
start = time.perf_counter()
|
|
|
|
start = time.perf_counter()
|
|
|
|
speaker["text"] = text
|
|
|
|
req={}
|
|
|
|
speaker["language"] = language
|
|
|
|
req["text"] = text
|
|
|
|
speaker["stream_chunk_size"] = stream_chunk_size # you can reduce it to get faster response, but degrade quality
|
|
|
|
req["text_language"] = language
|
|
|
|
res = requests.get(f"{server_url}&text="+text,stream=True)
|
|
|
|
req["character"] = character
|
|
|
|
|
|
|
|
#req["emotion"] = emotion
|
|
|
|
|
|
|
|
#req["stream_chunk_size"] = stream_chunk_size # you can reduce it to get faster response, but degrade quality
|
|
|
|
|
|
|
|
req["stream"] = True
|
|
|
|
|
|
|
|
res = requests.post(
|
|
|
|
|
|
|
|
f"{server_url}/tts",
|
|
|
|
|
|
|
|
json=req,
|
|
|
|
|
|
|
|
stream=True,
|
|
|
|
|
|
|
|
)
|
|
|
|
end = time.perf_counter()
|
|
|
|
end = time.perf_counter()
|
|
|
|
print(f"gpt_sovits Time to make POST: {end-start}s")
|
|
|
|
print(f"gpt_sovits Time to make POST: {end-start}s")
|
|
|
|
|
|
|
|
|
|
|
@ -99,7 +107,7 @@ def gpt_sovits(text, speaker, language, server_url, stream_chunk_size) -> Iterat
|
|
|
|
return
|
|
|
|
return
|
|
|
|
|
|
|
|
|
|
|
|
first = True
|
|
|
|
first = True
|
|
|
|
for chunk in res.iter_content(chunk_size=960):
|
|
|
|
for chunk in res.iter_content(chunk_size=1280): #32K*20ms*2
|
|
|
|
if first:
|
|
|
|
if first:
|
|
|
|
end = time.perf_counter()
|
|
|
|
end = time.perf_counter()
|
|
|
|
print(f"gpt_sovits Time to first chunk: {end-start}s")
|
|
|
|
print(f"gpt_sovits Time to first chunk: {end-start}s")
|
|
|
@ -109,7 +117,7 @@ def gpt_sovits(text, speaker, language, server_url, stream_chunk_size) -> Iterat
|
|
|
|
|
|
|
|
|
|
|
|
print("gpt_sovits response.elapsed:", res.elapsed)
|
|
|
|
print("gpt_sovits response.elapsed:", res.elapsed)
|
|
|
|
|
|
|
|
|
|
|
|
def stream_xtts(audio_stream,render):
|
|
|
|
def stream_tts(audio_stream,render):
|
|
|
|
for chunk in audio_stream:
|
|
|
|
for chunk in audio_stream:
|
|
|
|
if chunk is not None:
|
|
|
|
if chunk is not None:
|
|
|
|
render.push_audio(chunk)
|
|
|
|
render.push_audio(chunk)
|
|
|
@ -121,19 +129,19 @@ def txt_to_audio(text_):
|
|
|
|
t = time.time()
|
|
|
|
t = time.time()
|
|
|
|
asyncio.get_event_loop().run_until_complete(main(voicename,text,nerfreal))
|
|
|
|
asyncio.get_event_loop().run_until_complete(main(voicename,text,nerfreal))
|
|
|
|
print(f'-------edge tts time:{time.time()-t:.4f}s')
|
|
|
|
print(f'-------edge tts time:{time.time()-t:.4f}s')
|
|
|
|
elif tts_type == "gpt": #gpt_sovits
|
|
|
|
elif tts_type == "gpt-sovits": #gpt_sovits
|
|
|
|
stream_xtts(
|
|
|
|
stream_tts(
|
|
|
|
gpt_sovits(
|
|
|
|
gpt_sovits(
|
|
|
|
text_,
|
|
|
|
text_,
|
|
|
|
gspeaker,
|
|
|
|
"test", #character
|
|
|
|
"zh-cn", #en args.language,
|
|
|
|
"zh", #en args.language,
|
|
|
|
"http://127.0.0.1:9880/tts_ava?ava=maimai&streaming_mode=true", #args.server_url,
|
|
|
|
"http://127.0.0.1:5000", #args.server_url,
|
|
|
|
"20" #args.stream_chunk_size
|
|
|
|
"20" #args.stream_chunk_size
|
|
|
|
),
|
|
|
|
),
|
|
|
|
nerfreal
|
|
|
|
nerfreal
|
|
|
|
)
|
|
|
|
)
|
|
|
|
else: #xtts
|
|
|
|
else: #xtts
|
|
|
|
stream_xtts(
|
|
|
|
stream_tts(
|
|
|
|
xtts(
|
|
|
|
xtts(
|
|
|
|
text_,
|
|
|
|
text_,
|
|
|
|
gspeaker,
|
|
|
|
gspeaker,
|
|
|
@ -354,18 +362,18 @@ if __name__ == '__main__':
|
|
|
|
parser.add_argument('--fullbody_offset_x', type=int, default=0)
|
|
|
|
parser.add_argument('--fullbody_offset_x', type=int, default=0)
|
|
|
|
parser.add_argument('--fullbody_offset_y', type=int, default=0)
|
|
|
|
parser.add_argument('--fullbody_offset_y', type=int, default=0)
|
|
|
|
|
|
|
|
|
|
|
|
parser.add_argument('--tts', type=str, default='edgetts') #xtts
|
|
|
|
parser.add_argument('--tts', type=str, default='edgetts') #xtts gpt-sovits
|
|
|
|
parser.add_argument('--ref_file', type=str, default=None)
|
|
|
|
parser.add_argument('--ref_file', type=str, default=None)
|
|
|
|
parser.add_argument('--xtts_server', type=str, default='http://localhost:9000')
|
|
|
|
parser.add_argument('--tts_server', type=str, default='http://localhost:9000')
|
|
|
|
|
|
|
|
|
|
|
|
opt = parser.parse_args()
|
|
|
|
opt = parser.parse_args()
|
|
|
|
app.config.from_object(opt)
|
|
|
|
app.config.from_object(opt)
|
|
|
|
#print(app.config['xtts_server'])
|
|
|
|
#print(app.config['tts_server'])
|
|
|
|
|
|
|
|
|
|
|
|
tts_type = opt.tts
|
|
|
|
tts_type = opt.tts
|
|
|
|
if tts_type == "xtts":
|
|
|
|
if tts_type == "xtts":
|
|
|
|
print("Computing the latents for a new reference...")
|
|
|
|
print("Computing the latents for a new reference...")
|
|
|
|
gspeaker = get_speaker(opt.ref_file, opt.xtts_server)
|
|
|
|
gspeaker = get_speaker(opt.ref_file, opt.tts_server)
|
|
|
|
|
|
|
|
|
|
|
|
# assert test mode
|
|
|
|
# assert test mode
|
|
|
|
opt.test = True
|
|
|
|
opt.test = True
|
|
|
|