From 9a56d7cb666a1ac69587abb95a522c7dbeef7f98 Mon Sep 17 00:00:00 2001
From: lipku
Date: Fri, 7 Feb 2025 08:01:19 +0800
Subject: [PATCH] add fish-speech tts

---
 README.md   |  3 ++-
 basereal.py |  4 ++-
 ttsreal.py  | 76 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 81 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index de3a14d..ba6c7a6 100644
--- a/README.md
+++ b/README.md
@@ -9,7 +9,8 @@ Real time interactive streaming digital human, realize audio video synchronous
 - 2024.12.8 完善多并发,显存不随并发数增加
 - 2024.12.21 添加wav2lip、musetalk模型预热,解决第一次推理卡顿问题。感谢@heimaojinzhangyz
 - 2024.12.28 添加数字人模型Ultralight-Digital-Human。 感谢@lijihua2017
-- 2025.1.26 添加wav2lip384模型 感谢@不蠢不蠢
+- 2025.1.26 添加wav2lip384开源模型 感谢@不蠢不蠢
+- 2025.2.7 添加fish-speech tts
 
 ## Features
 1. 支持多种数字人模型: ernerf、musetalk、wav2lip、Ultralight-Digital-Human
diff --git a/basereal.py b/basereal.py
index 5136aba..939217b 100644
--- a/basereal.py
+++ b/basereal.py
@@ -35,7 +35,7 @@ import soundfile as sf
 import av
 from fractions import Fraction
 
-from ttsreal import EdgeTTS,VoitsTTS,XTTS,CosyVoiceTTS
+from ttsreal import EdgeTTS,VoitsTTS,XTTS,CosyVoiceTTS,FishTTS
 from tqdm import tqdm
 
 def read_imgs(img_list):
@@ -61,6 +61,8 @@ class BaseReal:
             self.tts = XTTS(opt,self)
         elif opt.tts == "cosyvoice":
             self.tts = CosyVoiceTTS(opt,self)
+        elif opt.tts == "fishtts":
+            self.tts = FishTTS(opt,self)
 
         self.speaking = False
 
diff --git a/ttsreal.py b/ttsreal.py
index ed3a1da..7ddf12e 100644
--- a/ttsreal.py
+++ b/ttsreal.py
@@ -139,6 +139,82 @@ class EdgeTTS(BaseTTS):
         except Exception as e:
             print(e)
 
+###########################################################################################
+class FishTTS(BaseTTS):
+    def txt_to_audio(self,msg):
+        text,textevent = msg
+        self.stream_tts(
+            self.fish_speech(
+                text,
+                self.opt.REF_FILE,
+                self.opt.REF_TEXT,
+                "zh", #en args.language,
+                self.opt.TTS_SERVER, #"http://127.0.0.1:5000", #args.server_url,
+            ),
+            msg
+        )
+
+    def fish_speech(self, text, reffile, reftext,language, server_url) -> Iterator[bytes]:
+        start = time.perf_counter()
+        req={
+            'text':text,
+            'reference_id':reffile,
+            'format':'wav',
+            'streaming':True,
+            'use_memory_cache':'on'
+        }
+        try:
+            res = requests.post(
+                f"{server_url}/v1/tts",
+                json=req,
+                stream=True,
+                headers={
+                    "content-type": "application/json",
+                },
+            )
+            end = time.perf_counter()
+            print(f"fish_speech Time to make POST: {end-start}s")
+
+            if res.status_code != 200:
+                print("Error:", res.text)
+                return
+
+            first = True
+
+            for chunk in res.iter_content(chunk_size=17640): # 17640 = 44100Hz * 0.2s * 2bytes: 200ms of 16-bit mono audio
+                #print('chunk len:',len(chunk))
+                if first:
+                    end = time.perf_counter()
+                    print(f"fish_speech Time to first chunk: {end-start}s")
+                    first = False
+                if chunk and self.state==State.RUNNING:
+                    yield chunk
+            #print("fish_speech response.elapsed:", res.elapsed)
+        except Exception as e:
+            print(e)
+
+    def stream_tts(self,audio_stream,msg):
+        text,textevent = msg
+        first = True
+        for chunk in audio_stream:
+            if chunk is not None and len(chunk)>0:
+                stream = np.frombuffer(chunk, dtype=np.int16).astype(np.float32) / 32767
+                stream = resampy.resample(x=stream, sr_orig=44100, sr_new=self.sample_rate)
+                #byte_stream=BytesIO(buffer)
+                #stream = self.__create_bytes_stream(byte_stream)
+                streamlen = stream.shape[0]
+                idx=0
+                while streamlen >= self.chunk:
+                    eventpoint=None
+                    if first:
+                        eventpoint={'status':'start','text':text,'msgenvent':textevent} # NOTE(review): 'msgenvent' looks like a typo for 'msgevent' but matches the key used by the file's other TTS classes — keep consistent
+                        first = False
+                    self.parent.put_audio_frame(stream[idx:idx+self.chunk],eventpoint)
+                    streamlen -= self.chunk
+                    idx += self.chunk
+        eventpoint={'status':'end','text':text,'msgenvent':textevent}
+        self.parent.put_audio_frame(np.zeros(self.chunk,np.float32),eventpoint)
+
 ###########################################################################################
 class VoitsTTS(BaseTTS):
     def txt_to_audio(self,msg):