optimize tts stream

main
lipku 9 months ago
parent 794679d6be
commit 9259754425

@ -29,7 +29,7 @@ pip install "git+https://github.com/facebookresearch/pytorch3d.git"
pip install tensorflow-gpu==2.8.0 pip install tensorflow-gpu==2.8.0
pip install --upgrade "protobuf<=3.20.1" pip install --upgrade "protobuf<=3.20.1"
``` ```
如果用pytorch2.1torchvision用0.16可以去torchvision官网根据pytorch版本找匹配的,cudatoolkit可以不用装 如果用pytorch2.1,torchvision用0.16(可以去[torchvision官网](https://github.com/pytorch/vision)根据pytorch版本找匹配的),cudatoolkit可以不用装
安装常见问题[FAQ](/assets/faq.md) 安装常见问题[FAQ](/assets/faq.md)
linux cuda环境搭建可以参考这篇文章 https://zhuanlan.zhihu.com/p/674972886 linux cuda环境搭建可以参考这篇文章 https://zhuanlan.zhihu.com/p/674972886

@ -138,7 +138,7 @@ class VoitsTTS(BaseTTS):
'ref_audio_path':reffile, 'ref_audio_path':reffile,
'prompt_text':reftext, 'prompt_text':reftext,
'prompt_lang':language, 'prompt_lang':language,
'media_type':'raw', 'media_type':'ogg',
'streaming_mode':True 'streaming_mode':True
} }
# req["text"] = text # req["text"] = text
@ -162,7 +162,8 @@ class VoitsTTS(BaseTTS):
first = True first = True
for chunk in res.iter_content(chunk_size=12800): # 1280 32K*20ms*2 for chunk in res.iter_content(chunk_size=None): #12800 1280 32K*20ms*2
print('chunk len:',len(chunk))
if first: if first:
end = time.perf_counter() end = time.perf_counter()
print(f"gpt_sovits Time to first chunk: {end-start}s") print(f"gpt_sovits Time to first chunk: {end-start}s")
@ -173,13 +174,29 @@ class VoitsTTS(BaseTTS):
except Exception as e: except Exception as e:
print(e) print(e)
def __create_bytes_stream(self, byte_stream):
    """Decode an audio byte stream into float32 samples at ``self.sample_rate``.

    Args:
        byte_stream: Source handed directly to ``sf.read`` (callers in this
            file wrap raw bytes in ``BytesIO`` before passing them in).

    Returns:
        A float32 array holding a single channel of audio. It is resampled
        to ``self.sample_rate`` when the decoded rate differs and the
        decoded audio is non-empty; otherwise it is returned at the decoded
        rate.
    """
    # Per the original author's note, sf.read yields float64 samples here.
    samples, source_rate = sf.read(byte_stream)
    print(f'[INFO]tts audio stream {source_rate}: {samples.shape}')
    samples = samples.astype(np.float32)

    # Multi-channel audio is reduced to its first channel only.
    is_multichannel = samples.ndim > 1
    if is_multichannel:
        print(f'[WARN] audio has {samples.shape[1]} channels, only use the first.')
        samples = samples[:, 0]

    # Empty audio is returned untouched; resampling is skipped for it.
    needs_resample = source_rate != self.sample_rate and samples.shape[0] > 0
    if not needs_resample:
        return samples

    print(f'[WARN] audio sample rate is {source_rate}, resampling into {self.sample_rate}.')
    return resampy.resample(x=samples, sr_orig=source_rate, sr_new=self.sample_rate)
def stream_tts(self,audio_stream): def stream_tts(self,audio_stream):
for chunk in audio_stream: for chunk in audio_stream:
if chunk is not None and len(chunk)>0: if chunk is not None and len(chunk)>0:
stream = np.frombuffer(chunk, dtype=np.int16).astype(np.float32) / 32767 #stream = np.frombuffer(chunk, dtype=np.int16).astype(np.float32) / 32767
stream = resampy.resample(x=stream, sr_orig=32000, sr_new=self.sample_rate) #stream = resampy.resample(x=stream, sr_orig=32000, sr_new=self.sample_rate)
#byte_stream=BytesIO(buffer) byte_stream=BytesIO(chunk)
#stream = self.__create_bytes_stream(byte_stream) stream = self.__create_bytes_stream(byte_stream)
streamlen = stream.shape[0] streamlen = stream.shape[0]
idx=0 idx=0
while streamlen >= self.chunk: while streamlen >= self.chunk:

Loading…
Cancel
Save