|
|
@ -14,7 +14,7 @@
|
|
|
|
# See the License for the specific language governing permissions and
|
|
|
|
# See the License for the specific language governing permissions and
|
|
|
|
# limitations under the License.
|
|
|
|
# limitations under the License.
|
|
|
|
###############################################################################
|
|
|
|
###############################################################################
|
|
|
|
|
|
|
|
import os
|
|
|
|
import time
|
|
|
|
import time
|
|
|
|
import numpy as np
|
|
|
|
import numpy as np
|
|
|
|
import soundfile as sf
|
|
|
|
import soundfile as sf
|
|
|
@ -53,9 +53,9 @@ class BaseTTS:
|
|
|
|
self.msgqueue.queue.clear()
|
|
|
|
self.msgqueue.queue.clear()
|
|
|
|
self.state = State.PAUSE
|
|
|
|
self.state = State.PAUSE
|
|
|
|
|
|
|
|
|
|
|
|
def put_msg_txt(self,msg):
|
|
|
|
def put_msg_txt(self,msg,eventpoint=None):
|
|
|
|
if len(msg)>0:
|
|
|
|
if len(msg)>0:
|
|
|
|
self.msgqueue.put(msg)
|
|
|
|
self.msgqueue.put((msg,eventpoint))
|
|
|
|
|
|
|
|
|
|
|
|
def render(self,quit_event):
|
|
|
|
def render(self,quit_event):
|
|
|
|
process_thread = Thread(target=self.process_tts, args=(quit_event,))
|
|
|
|
process_thread = Thread(target=self.process_tts, args=(quit_event,))
|
|
|
@ -79,7 +79,7 @@ class BaseTTS:
|
|
|
|
class EdgeTTS(BaseTTS):
|
|
|
|
class EdgeTTS(BaseTTS):
|
|
|
|
def txt_to_audio(self,msg):
|
|
|
|
def txt_to_audio(self,msg):
|
|
|
|
voicename = "zh-CN-YunxiaNeural"
|
|
|
|
voicename = "zh-CN-YunxiaNeural"
|
|
|
|
text = msg
|
|
|
|
text,textevent = msg
|
|
|
|
t = time.time()
|
|
|
|
t = time.time()
|
|
|
|
asyncio.new_event_loop().run_until_complete(self.__main(voicename,text))
|
|
|
|
asyncio.new_event_loop().run_until_complete(self.__main(voicename,text))
|
|
|
|
print(f'-------edge tts time:{time.time()-t:.4f}s')
|
|
|
|
print(f'-------edge tts time:{time.time()-t:.4f}s')
|
|
|
@ -92,8 +92,13 @@ class EdgeTTS(BaseTTS):
|
|
|
|
streamlen = stream.shape[0]
|
|
|
|
streamlen = stream.shape[0]
|
|
|
|
idx=0
|
|
|
|
idx=0
|
|
|
|
while streamlen >= self.chunk and self.state==State.RUNNING:
|
|
|
|
while streamlen >= self.chunk and self.state==State.RUNNING:
|
|
|
|
self.parent.put_audio_frame(stream[idx:idx+self.chunk])
|
|
|
|
eventpoint=None
|
|
|
|
streamlen -= self.chunk
|
|
|
|
streamlen -= self.chunk
|
|
|
|
|
|
|
|
if idx==0:
|
|
|
|
|
|
|
|
eventpoint={'status':'start','text':text,'msgenvent':textevent}
|
|
|
|
|
|
|
|
elif streamlen<self.chunk:
|
|
|
|
|
|
|
|
eventpoint={'status':'end','text':text,'msgenvent':textevent}
|
|
|
|
|
|
|
|
self.parent.put_audio_frame(stream[idx:idx+self.chunk],eventpoint)
|
|
|
|
idx += self.chunk
|
|
|
|
idx += self.chunk
|
|
|
|
#if streamlen>0: #skip last frame(not 20ms)
|
|
|
|
#if streamlen>0: #skip last frame(not 20ms)
|
|
|
|
# self.queue.put(stream[idx:])
|
|
|
|
# self.queue.put(stream[idx:])
|
|
|
@ -137,14 +142,16 @@ class EdgeTTS(BaseTTS):
|
|
|
|
###########################################################################################
|
|
|
|
###########################################################################################
|
|
|
|
class VoitsTTS(BaseTTS):
|
|
|
|
class VoitsTTS(BaseTTS):
|
|
|
|
def txt_to_audio(self,msg):
|
|
|
|
def txt_to_audio(self,msg):
|
|
|
|
|
|
|
|
text,textevent = msg
|
|
|
|
self.stream_tts(
|
|
|
|
self.stream_tts(
|
|
|
|
self.gpt_sovits(
|
|
|
|
self.gpt_sovits(
|
|
|
|
msg,
|
|
|
|
text,
|
|
|
|
self.opt.REF_FILE,
|
|
|
|
self.opt.REF_FILE,
|
|
|
|
self.opt.REF_TEXT,
|
|
|
|
self.opt.REF_TEXT,
|
|
|
|
"zh", #en args.language,
|
|
|
|
"zh", #en args.language,
|
|
|
|
self.opt.TTS_SERVER, #"http://127.0.0.1:5000", #args.server_url,
|
|
|
|
self.opt.TTS_SERVER, #"http://127.0.0.1:5000", #args.server_url,
|
|
|
|
)
|
|
|
|
),
|
|
|
|
|
|
|
|
msg
|
|
|
|
)
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
def gpt_sovits(self, text, reffile, reftext,language, server_url) -> Iterator[bytes]:
|
|
|
|
def gpt_sovits(self, text, reffile, reftext,language, server_url) -> Iterator[bytes]:
|
|
|
@ -207,7 +214,9 @@ class VoitsTTS(BaseTTS):
|
|
|
|
|
|
|
|
|
|
|
|
return stream
|
|
|
|
return stream
|
|
|
|
|
|
|
|
|
|
|
|
def stream_tts(self,audio_stream):
|
|
|
|
def stream_tts(self,audio_stream,msg):
|
|
|
|
|
|
|
|
text,textevent = msg
|
|
|
|
|
|
|
|
first = True
|
|
|
|
for chunk in audio_stream:
|
|
|
|
for chunk in audio_stream:
|
|
|
|
if chunk is not None and len(chunk)>0:
|
|
|
|
if chunk is not None and len(chunk)>0:
|
|
|
|
#stream = np.frombuffer(chunk, dtype=np.int16).astype(np.float32) / 32767
|
|
|
|
#stream = np.frombuffer(chunk, dtype=np.int16).astype(np.float32) / 32767
|
|
|
@ -217,21 +226,29 @@ class VoitsTTS(BaseTTS):
|
|
|
|
streamlen = stream.shape[0]
|
|
|
|
streamlen = stream.shape[0]
|
|
|
|
idx=0
|
|
|
|
idx=0
|
|
|
|
while streamlen >= self.chunk:
|
|
|
|
while streamlen >= self.chunk:
|
|
|
|
self.parent.put_audio_frame(stream[idx:idx+self.chunk])
|
|
|
|
eventpoint=None
|
|
|
|
|
|
|
|
if first:
|
|
|
|
|
|
|
|
eventpoint={'status':'start','text':text,'msgenvent':textevent}
|
|
|
|
|
|
|
|
first = False
|
|
|
|
|
|
|
|
self.parent.put_audio_frame(stream[idx:idx+self.chunk],eventpoint)
|
|
|
|
streamlen -= self.chunk
|
|
|
|
streamlen -= self.chunk
|
|
|
|
idx += self.chunk
|
|
|
|
idx += self.chunk
|
|
|
|
|
|
|
|
eventpoint={'status':'end','text':text,'msgenvent':textevent}
|
|
|
|
|
|
|
|
self.parent.put_audio_frame(np.zeros(self.chunk,np.float32),eventpoint)
|
|
|
|
|
|
|
|
|
|
|
|
###########################################################################################
|
|
|
|
###########################################################################################
|
|
|
|
class CosyVoiceTTS(BaseTTS):
|
|
|
|
class CosyVoiceTTS(BaseTTS):
|
|
|
|
def txt_to_audio(self,msg):
|
|
|
|
def txt_to_audio(self,msg):
|
|
|
|
|
|
|
|
text,textevent = msg
|
|
|
|
self.stream_tts(
|
|
|
|
self.stream_tts(
|
|
|
|
self.cosy_voice(
|
|
|
|
self.cosy_voice(
|
|
|
|
msg,
|
|
|
|
text,
|
|
|
|
self.opt.REF_FILE,
|
|
|
|
self.opt.REF_FILE,
|
|
|
|
self.opt.REF_TEXT,
|
|
|
|
self.opt.REF_TEXT,
|
|
|
|
"zh", #en args.language,
|
|
|
|
"zh", #en args.language,
|
|
|
|
self.opt.TTS_SERVER, #"http://127.0.0.1:5000", #args.server_url,
|
|
|
|
self.opt.TTS_SERVER, #"http://127.0.0.1:5000", #args.server_url,
|
|
|
|
)
|
|
|
|
),
|
|
|
|
|
|
|
|
msg
|
|
|
|
)
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
def cosy_voice(self, text, reffile, reftext,language, server_url) -> Iterator[bytes]:
|
|
|
|
def cosy_voice(self, text, reffile, reftext,language, server_url) -> Iterator[bytes]:
|
|
|
@ -263,7 +280,9 @@ class CosyVoiceTTS(BaseTTS):
|
|
|
|
except Exception as e:
|
|
|
|
except Exception as e:
|
|
|
|
print(e)
|
|
|
|
print(e)
|
|
|
|
|
|
|
|
|
|
|
|
def stream_tts(self,audio_stream):
|
|
|
|
def stream_tts(self,audio_stream,msg):
|
|
|
|
|
|
|
|
text,textevent = msg
|
|
|
|
|
|
|
|
first = True
|
|
|
|
for chunk in audio_stream:
|
|
|
|
for chunk in audio_stream:
|
|
|
|
if chunk is not None and len(chunk)>0:
|
|
|
|
if chunk is not None and len(chunk)>0:
|
|
|
|
stream = np.frombuffer(chunk, dtype=np.int16).astype(np.float32) / 32767
|
|
|
|
stream = np.frombuffer(chunk, dtype=np.int16).astype(np.float32) / 32767
|
|
|
@ -273,9 +292,15 @@ class CosyVoiceTTS(BaseTTS):
|
|
|
|
streamlen = stream.shape[0]
|
|
|
|
streamlen = stream.shape[0]
|
|
|
|
idx=0
|
|
|
|
idx=0
|
|
|
|
while streamlen >= self.chunk:
|
|
|
|
while streamlen >= self.chunk:
|
|
|
|
self.parent.put_audio_frame(stream[idx:idx+self.chunk])
|
|
|
|
eventpoint=None
|
|
|
|
|
|
|
|
if first:
|
|
|
|
|
|
|
|
eventpoint={'status':'start','text':text,'msgenvent':textevent}
|
|
|
|
|
|
|
|
first = False
|
|
|
|
|
|
|
|
self.parent.put_audio_frame(stream[idx:idx+self.chunk],eventpoint)
|
|
|
|
streamlen -= self.chunk
|
|
|
|
streamlen -= self.chunk
|
|
|
|
idx += self.chunk
|
|
|
|
idx += self.chunk
|
|
|
|
|
|
|
|
eventpoint={'status':'end','text':text,'msgenvent':textevent}
|
|
|
|
|
|
|
|
self.parent.put_audio_frame(np.zeros(self.chunk,np.float32),eventpoint)
|
|
|
|
|
|
|
|
|
|
|
|
###########################################################################################
|
|
|
|
###########################################################################################
|
|
|
|
class XTTS(BaseTTS):
|
|
|
|
class XTTS(BaseTTS):
|
|
|
@ -284,14 +309,16 @@ class XTTS(BaseTTS):
|
|
|
|
self.speaker = self.get_speaker(opt.REF_FILE, opt.TTS_SERVER)
|
|
|
|
self.speaker = self.get_speaker(opt.REF_FILE, opt.TTS_SERVER)
|
|
|
|
|
|
|
|
|
|
|
|
def txt_to_audio(self,msg):
|
|
|
|
def txt_to_audio(self,msg):
|
|
|
|
|
|
|
|
text,textevent = msg
|
|
|
|
self.stream_tts(
|
|
|
|
self.stream_tts(
|
|
|
|
self.xtts(
|
|
|
|
self.xtts(
|
|
|
|
msg,
|
|
|
|
text,
|
|
|
|
self.speaker,
|
|
|
|
self.speaker,
|
|
|
|
"zh-cn", #en args.language,
|
|
|
|
"zh-cn", #en args.language,
|
|
|
|
self.opt.TTS_SERVER, #"http://localhost:9000", #args.server_url,
|
|
|
|
self.opt.TTS_SERVER, #"http://localhost:9000", #args.server_url,
|
|
|
|
"20" #args.stream_chunk_size
|
|
|
|
"20" #args.stream_chunk_size
|
|
|
|
)
|
|
|
|
),
|
|
|
|
|
|
|
|
msg
|
|
|
|
)
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
def get_speaker(self,ref_audio,server_url):
|
|
|
|
def get_speaker(self,ref_audio,server_url):
|
|
|
@ -329,7 +356,9 @@ class XTTS(BaseTTS):
|
|
|
|
except Exception as e:
|
|
|
|
except Exception as e:
|
|
|
|
print(e)
|
|
|
|
print(e)
|
|
|
|
|
|
|
|
|
|
|
|
def stream_tts(self,audio_stream):
|
|
|
|
def stream_tts(self,audio_stream,msg):
|
|
|
|
|
|
|
|
text,textevent = msg
|
|
|
|
|
|
|
|
first = True
|
|
|
|
for chunk in audio_stream:
|
|
|
|
for chunk in audio_stream:
|
|
|
|
if chunk is not None and len(chunk)>0:
|
|
|
|
if chunk is not None and len(chunk)>0:
|
|
|
|
stream = np.frombuffer(chunk, dtype=np.int16).astype(np.float32) / 32767
|
|
|
|
stream = np.frombuffer(chunk, dtype=np.int16).astype(np.float32) / 32767
|
|
|
@ -339,6 +368,12 @@ class XTTS(BaseTTS):
|
|
|
|
streamlen = stream.shape[0]
|
|
|
|
streamlen = stream.shape[0]
|
|
|
|
idx=0
|
|
|
|
idx=0
|
|
|
|
while streamlen >= self.chunk:
|
|
|
|
while streamlen >= self.chunk:
|
|
|
|
self.parent.put_audio_frame(stream[idx:idx+self.chunk])
|
|
|
|
eventpoint=None
|
|
|
|
|
|
|
|
if first:
|
|
|
|
|
|
|
|
eventpoint={'status':'start','text':text,'msgenvent':textevent}
|
|
|
|
|
|
|
|
first = False
|
|
|
|
|
|
|
|
self.parent.put_audio_frame(stream[idx:idx+self.chunk],eventpoint)
|
|
|
|
streamlen -= self.chunk
|
|
|
|
streamlen -= self.chunk
|
|
|
|
idx += self.chunk
|
|
|
|
idx += self.chunk
|
|
|
|
|
|
|
|
eventpoint={'status':'end','text':text,'msgenvent':textevent}
|
|
|
|
|
|
|
|
self.parent.put_audio_frame(np.zeros(self.chunk,np.float32),eventpoint)
|