add fish-speech tts

main
lipku 6 months ago
parent 3d9d16a2fb
commit 9a56d7cb66

@ -9,7 +9,8 @@ Real time interactive streaming digital human realize audio video synchronous
- 2024.12.8 完善多并发,显存不随并发数增加
- 2024.12.21 添加wav2lip、musetalk模型预热解决第一次推理卡顿问题。感谢@heimaojinzhangyz
- 2024.12.28 添加数字人模型Ultralight-Digital-Human。 感谢@lijihua2017
- 2025.1.26 添加wav2lip384模型 感谢@不蠢不蠢
- 2025.1.26 添加wav2lip384开源模型 感谢@不蠢不蠢
- 2025.2.7 添加fish-speech tts
## Features
1. 支持多种数字人模型: ernerf、musetalk、wav2lip、Ultralight-Digital-Human

@ -35,7 +35,7 @@ import soundfile as sf
import av
from fractions import Fraction
from ttsreal import EdgeTTS,VoitsTTS,XTTS,CosyVoiceTTS
from ttsreal import EdgeTTS,VoitsTTS,XTTS,CosyVoiceTTS,FishTTS
from tqdm import tqdm
def read_imgs(img_list):
@ -61,6 +61,8 @@ class BaseReal:
self.tts = XTTS(opt,self)
elif opt.tts == "cosyvoice":
self.tts = CosyVoiceTTS(opt,self)
elif opt.tts == "fishtts":
self.tts = FishTTS(opt,self)
self.speaking = False

@ -139,6 +139,82 @@ class EdgeTTS(BaseTTS):
except Exception as e:
print(e)
###########################################################################################
class FishTTS(BaseTTS):
    """Streaming TTS adapter for a fish-speech server.

    Sends text to the server's ``/v1/tts`` endpoint, streams the 16-bit PCM
    response in chunks, resamples it to the pipeline sample rate, and feeds
    fixed-size audio frames to the parent player.
    """

    def txt_to_audio(self, msg):
        """Synthesize ``msg`` = (text, event) and stream the audio to the player."""
        text, textevent = msg
        self.stream_tts(
            self.fish_speech(
                text,
                self.opt.REF_FILE,
                self.opt.REF_TEXT,
                "zh",  # language is hard-coded to Chinese here (not taken from args)
                self.opt.TTS_SERVER,  # e.g. "http://127.0.0.1:5000"
            ),
            msg,
        )

    def fish_speech(self, text, reffile, reftext, language, server_url) -> Iterator[bytes]:
        """Yield raw PCM byte chunks streamed from the fish-speech server.

        ``reftext`` and ``language`` are accepted for interface parity with the
        other TTS backends but are not sent in the request — the server resolves
        the voice via ``reference_id``. On HTTP or network errors this logs and
        yields nothing.
        """
        start = time.perf_counter()
        payload = {
            'text': text,
            'reference_id': reffile,
            'format': 'wav',
            'streaming': True,
            'use_memory_cache': 'on',
        }
        try:
            # Use the response as a context manager so the streamed connection
            # is closed even if the consumer abandons this generator early or
            # the server returns an error (the original leaked it).
            with requests.post(
                f"{server_url}/v1/tts",
                json=payload,
                stream=True,
                headers={
                    "content-type": "application/json",
                },
            ) as res:
                end = time.perf_counter()
                print(f"fish_speech Time to make POST: {end-start}s")
                if res.status_code != 200:
                    print("Error:", res.text)
                    return
                first = True
                # 17640 bytes = 44100 Hz * 0.2 s * 2 bytes/sample,
                # i.e. 200 ms of mono signed-16-bit audio per chunk.
                for chunk in res.iter_content(chunk_size=17640):
                    if first:
                        end = time.perf_counter()
                        print(f"fish_speech Time to first chunk: {end-start}s")
                        first = False
                    if chunk and self.state == State.RUNNING:
                        yield chunk
        except Exception as e:
            print(e)

    def stream_tts(self, audio_stream, msg):
        """Resample incoming PCM chunks and emit fixed-size frames to the parent.

        The first emitted frame carries a 'start' event and a final zero-filled
        frame carries the 'end' event for ``msg`` = (text, event).
        """
        text, textevent = msg
        first = True
        for chunk in audio_stream:
            if chunk is not None and len(chunk) > 0:
                # Server audio is s16le at 44100 Hz; normalize to float32 [-1, 1].
                stream = np.frombuffer(chunk, dtype=np.int16).astype(np.float32) / 32767
                stream = resampy.resample(x=stream, sr_orig=44100, sr_new=self.sample_rate)
                streamlen = stream.shape[0]
                idx = 0
                # NOTE(review): a tail shorter than self.chunk is dropped; this
                # matches the other TTS backends in this file — confirm intended.
                while streamlen >= self.chunk:
                    eventpoint = None
                    if first:
                        # 'msgenvent' spelling is kept as-is: consumers elsewhere
                        # in the project read this exact key.
                        eventpoint = {'status': 'start', 'text': text, 'msgenvent': textevent}
                        first = False
                    self.parent.put_audio_frame(stream[idx:idx + self.chunk], eventpoint)
                    streamlen -= self.chunk
                    idx += self.chunk
        # Flush a silent frame carrying the end-of-utterance event.
        eventpoint = {'status': 'end', 'text': text, 'msgenvent': textevent}
        self.parent.put_audio_frame(np.zeros(self.chunk, np.float32), eventpoint)
###########################################################################################
class VoitsTTS(BaseTTS):
def txt_to_audio(self,msg):

Loading…
Cancel
Save