From 9a56d7cb666a1ac69587abb95a522c7dbeef7f98 Mon Sep 17 00:00:00 2001
From: lipku
Date: Fri, 7 Feb 2025 08:01:19 +0800
Subject: [PATCH] add fish-speech tts

---
 README.md   |  3 ++-
 basereal.py |  4 ++-
 ttsreal.py  | 76 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 81 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index de3a14d..ba6c7a6 100644
--- a/README.md
+++ b/README.md
@@ -9,7 +9,8 @@ Real time interactive streaming digital human, realize audio video synchronous
 - 2024.12.8 完善多并发,显存不随并发数增加
 - 2024.12.21 添加wav2lip、musetalk模型预热,解决第一次推理卡顿问题。感谢@heimaojinzhangyz
 - 2024.12.28 添加数字人模型Ultralight-Digital-Human。 感谢@lijihua2017
-- 2025.1.26 添加wav2lip384模型 感谢@不蠢不蠢
+- 2025.1.26 添加wav2lip384开源模型 感谢@不蠢不蠢
+- 2025.2.7 添加fish-speech tts
 
 ## Features
 1. 支持多种数字人模型: ernerf、musetalk、wav2lip、Ultralight-Digital-Human
diff --git a/basereal.py b/basereal.py
index 5136aba..939217b 100644
--- a/basereal.py
+++ b/basereal.py
@@ -35,7 +35,7 @@ import soundfile as sf
 import av
 from fractions import Fraction
 
-from ttsreal import EdgeTTS,VoitsTTS,XTTS,CosyVoiceTTS
+from ttsreal import EdgeTTS,VoitsTTS,XTTS,CosyVoiceTTS,FishTTS
 from tqdm import tqdm
 
 def read_imgs(img_list):
@@ -61,6 +61,8 @@ class BaseReal:
             self.tts = XTTS(opt,self)
         elif opt.tts == "cosyvoice":
             self.tts = CosyVoiceTTS(opt,self)
+        elif opt.tts == "fishtts":
+            self.tts = FishTTS(opt,self)
 
         self.speaking = False
 
diff --git a/ttsreal.py b/ttsreal.py
index ed3a1da..7ddf12e 100644
--- a/ttsreal.py
+++ b/ttsreal.py
@@ -139,6 +139,82 @@ class EdgeTTS(BaseTTS):
         except Exception as e:
             print(e)
 
+###########################################################################################
+class FishTTS(BaseTTS):
+    def txt_to_audio(self,msg):
+        text,textevent = msg
+        self.stream_tts(
+            self.fish_speech(
+                text,
+                self.opt.REF_FILE,
+                self.opt.REF_TEXT,
+                "zh", #en args.language,
+                self.opt.TTS_SERVER, #"http://127.0.0.1:5000", #args.server_url,
+            ),
+            msg
+        )
+
+    def fish_speech(self, text, reffile, reftext,language, server_url) -> Iterator[bytes]:
+        start = time.perf_counter()
+        req={
+            'text':text,
+            'reference_id':reffile,
+            'format':'wav',
+            'streaming':True,
+            'use_memory_cache':'on'
+        }
+        try:
+            res = requests.post(
+                f"{server_url}/v1/tts",
+                json=req,
+                stream=True,
+                headers={
+                    "content-type": "application/json",
+                },
+            )
+            end = time.perf_counter()
+            print(f"fish_speech Time to make POST: {end-start}s")
+
+            if res.status_code != 200:
+                print("Error:", res.text)
+                return
+
+            first = True
+
+            for chunk in res.iter_content(chunk_size=17640): # 17640 = 44100Hz * 0.2s * 2bytes: 200ms of 16-bit mono audio
+                #print('chunk len:',len(chunk))
+                if first:
+                    end = time.perf_counter()
+                    print(f"fish_speech Time to first chunk: {end-start}s")
+                    first = False
+                if chunk and self.state==State.RUNNING:
+                    yield chunk
+            #print("fish_speech response.elapsed:", res.elapsed)
+        except Exception as e:
+            print(e)
+
+    def stream_tts(self,audio_stream,msg):
+        text,textevent = msg
+        first = True
+        for chunk in audio_stream:
+            if chunk is not None and len(chunk)>0:
+                stream = np.frombuffer(chunk, dtype=np.int16).astype(np.float32) / 32767
+                stream = resampy.resample(x=stream, sr_orig=44100, sr_new=self.sample_rate)
+                #byte_stream=BytesIO(buffer)
+                #stream = self.__create_bytes_stream(byte_stream)
+                streamlen = stream.shape[0]
+                idx=0
+                while streamlen >= self.chunk:
+                    eventpoint=None
+                    if first:
+                        eventpoint={'status':'start','text':text,'msgenvent':textevent} # NOTE(review): 'msgenvent' looks like a typo for 'msgevent' but matches the key used by the file's other TTS classes — keep consistent
+                        first = False
+                    self.parent.put_audio_frame(stream[idx:idx+self.chunk],eventpoint)
+                    streamlen -= self.chunk
+                    idx += self.chunk
+        eventpoint={'status':'end','text':text,'msgenvent':textevent}
+        self.parent.put_audio_frame(np.zeros(self.chunk,np.float32),eventpoint)
+
 ###########################################################################################
 class VoitsTTS(BaseTTS):
     def txt_to_audio(self,msg):