add fish-speech tts

main
lipku 6 months ago
parent 3d9d16a2fb
commit 9a56d7cb66

@ -9,7 +9,8 @@ Real time interactive streaming digital human realize audio video synchronous
- 2024.12.8 完善多并发,显存不随并发数增加 - 2024.12.8 完善多并发,显存不随并发数增加
- 2024.12.21 添加wav2lip、musetalk模型预热解决第一次推理卡顿问题。感谢@heimaojinzhangyz - 2024.12.21 添加wav2lip、musetalk模型预热解决第一次推理卡顿问题。感谢@heimaojinzhangyz
- 2024.12.28 添加数字人模型Ultralight-Digital-Human。 感谢@lijihua2017 - 2024.12.28 添加数字人模型Ultralight-Digital-Human。 感谢@lijihua2017
- 2025.1.26 添加wav2lip384模型 感谢@不蠢不蠢 - 2025.1.26 添加wav2lip384开源模型 感谢@不蠢不蠢
- 2025.2.7 添加fish-speech tts
## Features ## Features
1. 支持多种数字人模型: ernerf、musetalk、wav2lip、Ultralight-Digital-Human 1. 支持多种数字人模型: ernerf、musetalk、wav2lip、Ultralight-Digital-Human

@ -35,7 +35,7 @@ import soundfile as sf
import av import av
from fractions import Fraction from fractions import Fraction
from ttsreal import EdgeTTS,VoitsTTS,XTTS,CosyVoiceTTS from ttsreal import EdgeTTS,VoitsTTS,XTTS,CosyVoiceTTS,FishTTS
from tqdm import tqdm from tqdm import tqdm
def read_imgs(img_list): def read_imgs(img_list):
@ -61,6 +61,8 @@ class BaseReal:
self.tts = XTTS(opt,self) self.tts = XTTS(opt,self)
elif opt.tts == "cosyvoice": elif opt.tts == "cosyvoice":
self.tts = CosyVoiceTTS(opt,self) self.tts = CosyVoiceTTS(opt,self)
elif opt.tts == "fishtts":
self.tts = FishTTS(opt,self)
self.speaking = False self.speaking = False

@ -139,6 +139,82 @@ class EdgeTTS(BaseTTS):
except Exception as e: except Exception as e:
print(e) print(e)
###########################################################################################
class FishTTS(BaseTTS):
    """TTS backend that streams synthesized audio from a fish-speech HTTP server
    and forwards it to the parent as fixed-size float32 PCM frames."""

    def txt_to_audio(self, msg):
        """Synthesize the text in *msg* via the fish-speech endpoint and stream the result.

        msg is a (text, textevent) pair; textevent is forwarded in start/end event points.
        """
        text, textevent = msg
        audio_iter = self.fish_speech(
            text,
            self.opt.REF_FILE,
            self.opt.REF_TEXT,
            "zh",  # en args.language
            self.opt.TTS_SERVER,  # "http://127.0.0.1:5000", args.server_url
        )
        self.stream_tts(audio_iter, msg)

    def fish_speech(self, text, reffile, reftext, language, server_url) -> Iterator[bytes]:
        """Yield raw wav byte chunks from the fish-speech /v1/tts streaming endpoint.

        NOTE(review): *reffile* is sent as `reference_id` and *reftext*/*language*
        are currently unused by the request — presumably server-side caching covers
        them; confirm against the fish-speech API.
        """
        start = time.perf_counter()
        payload = {
            'text': text,
            'reference_id': reffile,
            'format': 'wav',
            'streaming': True,
            'use_memory_cache': 'on',
        }
        try:
            res = requests.post(
                f"{server_url}/v1/tts",
                json=payload,
                stream=True,
                headers={
                    "content-type": "application/json",
                },
            )
            end = time.perf_counter()
            print(f"fish_speech Time to make POST: {end-start}s")
            if res.status_code != 200:
                print("Error:", res.text)
                return
            awaiting_first = True
            # 17640 bytes = 44100 Hz * 200 ms * 2 bytes/sample (int16 mono)
            for chunk in res.iter_content(chunk_size=17640):
                if awaiting_first:
                    end = time.perf_counter()
                    print(f"fish_speech Time to first chunk: {end-start}s")
                    awaiting_first = False
                # Only forward non-empty chunks while the session is still running.
                if chunk and self.state == State.RUNNING:
                    yield chunk
        except Exception as e:
            print(e)

    def stream_tts(self, audio_stream, msg):
        """Resample incoming 44.1 kHz int16 byte chunks to self.sample_rate and
        emit self.chunk-sized frames to the parent, bracketed by start/end events.

        Samples left over after the last full frame of each chunk are dropped,
        matching the behavior of the sibling TTS classes in this file.
        """
        text, textevent = msg
        pending_start = True
        for chunk in audio_stream:
            if chunk is None or len(chunk) == 0:
                continue
            # int16 little-endian PCM -> float32 in [-1, 1], then resample.
            samples = np.frombuffer(chunk, dtype=np.int16).astype(np.float32) / 32767
            samples = resampy.resample(x=samples, sr_orig=44100, sr_new=self.sample_rate)
            offset = 0
            remaining = samples.shape[0]
            while remaining >= self.chunk:
                eventpoint = None
                if pending_start:
                    # First emitted frame carries the 'start' event for this message.
                    eventpoint = {'status': 'start', 'text': text, 'msgenvent': textevent}
                    pending_start = False
                self.parent.put_audio_frame(samples[offset:offset + self.chunk], eventpoint)
                remaining -= self.chunk
                offset += self.chunk
        # Always close with an 'end' event on a silent frame.
        eventpoint = {'status': 'end', 'text': text, 'msgenvent': textevent}
        self.parent.put_audio_frame(np.zeros(self.chunk, np.float32), eventpoint)
########################################################################################### ###########################################################################################
class VoitsTTS(BaseTTS): class VoitsTTS(BaseTTS):
def txt_to_audio(self,msg): def txt_to_audio(self,msg):

Loading…
Cancel
Save