optimize tts stream

9 months ago · 9259754425
parent 794679d6be
commit 9259754425
2 changed files with 24 additions and 7 deletions
--- a/README.md
+++ b/README.md
@ -29,7 +29,7 @@ pip install "git+https://github.com/facebookresearch/pytorch3d.git"
 pip install tensorflow-gpu==2.8.0
 pip install --upgrade "protobuf<=3.20.1"
 ```
-如果用pytorch2.1，torchvision用0.16（可以去torchvision官网根据pytorch版本找匹配的）,cudatoolkit可以不用装  
+如果用pytorch2.1，torchvision用0.16（可以去[torchvision官网](https://github.com/pytorch/vision)根据pytorch版本找匹配的）,cudatoolkit可以不用装  
 安装常见问题[FAQ](/assets/faq.md)  
 linux cuda环境搭建可以参考这篇文章 https://zhuanlan.zhihu.com/p/674972886
--- a/ttsreal.py
+++ b/ttsreal.py
@ -138,7 +138,7 @@ class VoitsTTS(BaseTTS):
            'ref_audio_path':reffile,
            'prompt_text':reftext,
            'prompt_lang':language,
-            'media_type':'raw',
+            'media_type':'ogg',
            'streaming_mode':True
        }
        # req["text"] = text
@ -162,7 +162,8 @@ class VoitsTTS(BaseTTS):
            first = True
-            for chunk in res.iter_content(chunk_size=12800): # 1280 32K*20ms*2
+            for chunk in res.iter_content(chunk_size=None): #12800 1280 32K*20ms*2
                print('chunk len:',len(chunk))
                if first:
                    end = time.perf_counter()
                    print(f"gpt_sovits Time to first chunk: {end-start}s")
@ -173,13 +174,29 @@ class VoitsTTS(BaseTTS):
        except Exception as e:
            print(e)
    def __create_bytes_stream(self,byte_stream):
        """Decode an in-memory audio file into a mono float32 waveform.

        Args:
            byte_stream: File-like object passed straight to ``sf.read``
                (``stream_tts`` passes a ``BytesIO`` wrapping one response
                chunk).

        Returns:
            The decoded samples as a float32 array; only the first channel is
            kept for multi-channel audio, and the signal is resampled to
            ``self.sample_rate`` when the decoded rate differs and the audio
            is non-empty.

        NOTE(review): ``sf.read`` is given the whole buffer in one call, so
        this presumably requires each buffer to be independently decodable
        (header included) -- confirm this holds for every streamed chunk.
        """
        #byte_stream=BytesIO(buffer)
        stream, sample_rate = sf.read(byte_stream) # [T*sample_rate,] float64
        print(f'[INFO]tts audio stream {sample_rate}: {stream.shape}')
        # Downstream code works in float32; sf.read yields float64 per the note above.
        stream = stream.astype(np.float32)
        # Collapse multi-channel audio by keeping column 0 only (no mixing down).
        if stream.ndim > 1:
            print(f'[WARN] audio has {stream.shape[1]} channels, only use the first.')
            stream = stream[:, 0]
        # Skip resampling for empty audio; otherwise match the rate this TTS instance uses.
        if sample_rate != self.sample_rate and stream.shape[0]>0:
            print(f'[WARN] audio sample rate is {sample_rate}, resampling into {self.sample_rate}.')
            stream = resampy.resample(x=stream, sr_orig=sample_rate, sr_new=self.sample_rate)
        return stream
    def stream_tts(self,audio_stream):
        for chunk in audio_stream:
            if chunk is not None and len(chunk)>0:          
-                stream = np.frombuffer(chunk, dtype=np.int16).astype(np.float32) / 32767
+                #stream = np.frombuffer(chunk, dtype=np.int16).astype(np.float32) / 32767
-                stream = resampy.resample(x=stream, sr_orig=32000, sr_new=self.sample_rate)
+                #stream = resampy.resample(x=stream, sr_orig=32000, sr_new=self.sample_rate)
-                #byte_stream=BytesIO(buffer)
+                byte_stream=BytesIO(chunk)
-                #stream = self.__create_bytes_stream(byte_stream)
+                stream = self.__create_bytes_stream(byte_stream)
                streamlen = stream.shape[0]
                idx=0
                while streamlen >= self.chunk: