add fish-speech tts

main
lipku 6 months ago
parent 3d9d16a2fb
commit 9a56d7cb66

@ -9,7 +9,8 @@ Real time interactive streaming digital human realize audio video synchronous
- 2024.12.8 完善多并发,显存不随并发数增加 - 2024.12.8 完善多并发,显存不随并发数增加
- 2024.12.21 添加wav2lip、musetalk模型预热解决第一次推理卡顿问题。感谢@heimaojinzhangyz - 2024.12.21 添加wav2lip、musetalk模型预热解决第一次推理卡顿问题。感谢@heimaojinzhangyz
- 2024.12.28 添加数字人模型Ultralight-Digital-Human。 感谢@lijihua2017 - 2024.12.28 添加数字人模型Ultralight-Digital-Human。 感谢@lijihua2017
- 2025.1.26 添加wav2lip384模型 感谢@不蠢不蠢 - 2025.1.26 添加wav2lip384开源模型 感谢@不蠢不蠢
- 2025.2.7 添加fish-speech tts
## Features ## Features
1. 支持多种数字人模型: ernerf、musetalk、wav2lip、Ultralight-Digital-Human 1. 支持多种数字人模型: ernerf、musetalk、wav2lip、Ultralight-Digital-Human

@ -35,7 +35,7 @@ import soundfile as sf
import av import av
from fractions import Fraction from fractions import Fraction
from ttsreal import EdgeTTS,VoitsTTS,XTTS,CosyVoiceTTS from ttsreal import EdgeTTS,VoitsTTS,XTTS,CosyVoiceTTS,FishTTS
from tqdm import tqdm from tqdm import tqdm
def read_imgs(img_list): def read_imgs(img_list):
@ -61,6 +61,8 @@ class BaseReal:
self.tts = XTTS(opt,self) self.tts = XTTS(opt,self)
elif opt.tts == "cosyvoice": elif opt.tts == "cosyvoice":
self.tts = CosyVoiceTTS(opt,self) self.tts = CosyVoiceTTS(opt,self)
elif opt.tts == "fishtts":
self.tts = FishTTS(opt,self)
self.speaking = False self.speaking = False

@ -139,6 +139,82 @@ class EdgeTTS(BaseTTS):
except Exception as e: except Exception as e:
print(e) print(e)
###########################################################################################
class FishTTS(BaseTTS):
    """TTS backend that streams synthesized audio from a fish-speech HTTP server
    and forwards it to the parent as fixed-size float32 PCM frames."""

    def txt_to_audio(self, msg):
        """Synthesize the text in *msg* via the fish-speech endpoint and stream the result.

        msg is a (text, textevent) pair; textevent is forwarded in start/end event points.
        """
        text, textevent = msg
        audio_iter = self.fish_speech(
            text,
            self.opt.REF_FILE,
            self.opt.REF_TEXT,
            "zh",  # en args.language
            self.opt.TTS_SERVER,  # "http://127.0.0.1:5000", args.server_url
        )
        self.stream_tts(audio_iter, msg)

    def fish_speech(self, text, reffile, reftext, language, server_url) -> Iterator[bytes]:
        """Yield raw wav byte chunks from the fish-speech /v1/tts streaming endpoint.

        NOTE(review): *reffile* is sent as `reference_id` and *reftext*/*language*
        are currently unused by the request — presumably server-side caching covers
        them; confirm against the fish-speech API.
        """
        start = time.perf_counter()
        payload = {
            'text': text,
            'reference_id': reffile,
            'format': 'wav',
            'streaming': True,
            'use_memory_cache': 'on',
        }
        try:
            res = requests.post(
                f"{server_url}/v1/tts",
                json=payload,
                stream=True,
                headers={
                    "content-type": "application/json",
                },
            )
            end = time.perf_counter()
            print(f"fish_speech Time to make POST: {end-start}s")
            if res.status_code != 200:
                print("Error:", res.text)
                return
            awaiting_first = True
            # 17640 bytes = 44100 Hz * 200 ms * 2 bytes/sample (int16 mono)
            for chunk in res.iter_content(chunk_size=17640):
                if awaiting_first:
                    end = time.perf_counter()
                    print(f"fish_speech Time to first chunk: {end-start}s")
                    awaiting_first = False
                # Only forward non-empty chunks while the session is still running.
                if chunk and self.state == State.RUNNING:
                    yield chunk
        except Exception as e:
            print(e)

    def stream_tts(self, audio_stream, msg):
        """Resample incoming 44.1 kHz int16 byte chunks to self.sample_rate and
        emit self.chunk-sized frames to the parent, bracketed by start/end events.

        Samples left over after the last full frame of each chunk are dropped,
        matching the behavior of the sibling TTS classes in this file.
        """
        text, textevent = msg
        pending_start = True
        for chunk in audio_stream:
            if chunk is None or len(chunk) == 0:
                continue
            # int16 little-endian PCM -> float32 in [-1, 1], then resample.
            samples = np.frombuffer(chunk, dtype=np.int16).astype(np.float32) / 32767
            samples = resampy.resample(x=samples, sr_orig=44100, sr_new=self.sample_rate)
            offset = 0
            remaining = samples.shape[0]
            while remaining >= self.chunk:
                eventpoint = None
                if pending_start:
                    # First emitted frame carries the 'start' event for this message.
                    eventpoint = {'status': 'start', 'text': text, 'msgenvent': textevent}
                    pending_start = False
                self.parent.put_audio_frame(samples[offset:offset + self.chunk], eventpoint)
                remaining -= self.chunk
                offset += self.chunk
        # Always close with an 'end' event on a silent frame.
        eventpoint = {'status': 'end', 'text': text, 'msgenvent': textevent}
        self.parent.put_audio_frame(np.zeros(self.chunk, np.float32), eventpoint)
########################################################################################### ###########################################################################################
class VoitsTTS(BaseTTS): class VoitsTTS(BaseTTS):
def txt_to_audio(self,msg): def txt_to_audio(self,msg):

Loading…
Cancel
Save