From 2b81272a8efe69fc8c42948996244b8c730a3cca Mon Sep 17 00:00:00 2001
From: fanpt <320622572@qq.com>
Date: Thu, 28 Aug 2025 15:50:15 +0800
Subject: [PATCH] Modify is_speaking state
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
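Notes: this patch renames BaseReal's speaking flag to liv_speaking, drives it
from TTS eventpoints in notify(), and reports it through is_speaking().
notify() now indexes eventpoint as a dict, so every eventpoint that reaches it
must carry a 'status' key; eventpoint.get('status') would be the defensive
variant if some producers omit it. Below is a minimal sketch of the producer
side this contract assumes, on the further assumption that eventpoints
attached to audio frames are delivered back to notify() when the frames are
played, which is how the 'start'/'end' statuses handled here appear to be
produced. emit_with_events and the bare {'status': ...} payloads are
illustrative, not code taken from ttsreal.py; real eventpoints may carry
extra keys.

    # Sketch only: the eventpoint shape notify() relies on after this patch.
    # put_audio_frame() is the real entry point (see basereal.py below);
    # everything else here is assumed for illustration.
    def emit_with_events(real, pcm_chunks):
        # assumes at least two 20 ms chunks; production code would guard that
        real.put_audio_frame(pcm_chunks[0], eventpoint={'status': 'start'})  # notify(): liv_speaking = True
        for chunk in pcm_chunks[1:-1]:
            real.put_audio_frame(chunk)
        real.put_audio_frame(pcm_chunks[-1], eventpoint={'status': 'end'})   # notify(): liv_speaking = False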
 app.py      |  2 +-
 basereal.py | 73 ++++++++++++++++++++++++++++++-----------------------
 ttsreal.py  |  5 ++--
 3 files changed, 45 insertions(+), 35 deletions(-)

diff --git a/app.py b/app.py
index 94c3156..2c33e80 100644
--- a/app.py
+++ b/app.py
@@ -217,7 +217,7 @@ async def human(request):
         nerfreals[sessionid].flush_talk()
 
     if params['type'] == 'echo':
-        nerfreals[sessionid].speaking = True
+        nerfreals[sessionid].liv_speaking = True
         nerfreals[sessionid].put_msg_txt(params['text'])
 
     elif params['type'] == 'chat':
diff --git a/basereal.py b/basereal.py
index 3f8b245..4ddda36 100644
--- a/basereal.py
+++ b/basereal.py
@@ -50,7 +50,7 @@ def read_imgs(img_list):
         frames.append(frame)
     return frames
 
-def play_audio(quit_event,queue): 
+def play_audio(quit_event,queue):
     import pyaudio
     p = pyaudio.PyAudio()
     stream = p.open(
@@ -88,9 +88,11 @@ class BaseReal:
             self.tts = TencentTTS(opt,self)
         elif opt.tts == "doubao":
             self.tts = DoubaoTTS(opt,self)
-        
+
         self.speaking = False
+        self.liv_speaking = False
+
         self.recording = False
         self._record_video_pipe = None
         self._record_audio_pipe = None
@@ -106,11 +108,11 @@ class BaseReal:
 
     def put_msg_txt(self,msg,eventpoint=None):
         self.tts.put_msg_txt(msg,eventpoint)
-    
+
     def put_audio_frame(self,audio_chunk,eventpoint=None): #16khz 20ms pcm
         self.asr.put_audio_frame(audio_chunk,eventpoint)
 
-    def put_audio_file(self,filebyte): 
+    def put_audio_file(self,filebyte):
         input_stream = BytesIO(filebyte)
         stream = self.__create_bytes_stream(input_stream)
         streamlen = stream.shape[0]
@@ -119,7 +121,7 @@ class BaseReal:
             self.put_audio_frame(stream[idx:idx+self.chunk])
             streamlen -= self.chunk
             idx += self.chunk
-    
+
     def __create_bytes_stream(self,byte_stream): #byte_stream=BytesIO(buffer)
         stream, sample_rate = sf.read(byte_stream) # [T*sample_rate,] float64
@@ -129,7 +131,7 @@ class BaseReal:
         if stream.ndim > 1:
             logger.info(f'[WARN] audio has {stream.shape[1]} channels, only use the first.')
             stream = stream[:, 0]
-    
+
         if sample_rate != self.sample_rate and stream.shape[0]>0:
             logger.info(f'[WARN] audio sample rate is {sample_rate}, resampling into {self.sample_rate}.')
             stream = resampy.resample(x=stream, sr_orig=sample_rate, sr_new=self.sample_rate)
@@ -141,8 +143,8 @@ class BaseReal:
         self.asr.flush_talk()
 
     def is_speaking(self)->bool:
-        return self.speaking
-    
+        return self.liv_speaking
+
     def __loadcustom(self):
         for item in self.opt.customopt:
             logger.info(item)
@@ -161,8 +163,15 @@ class BaseReal:
         for key in self.custom_index:
             self.custom_index[key]=0
 
-    def notify(self,eventpoint):
-        logger.info("notify:%s",eventpoint)
+    def notify(self, eventpoint):
+        logger.info("notify:%s", eventpoint)
+        # access eventpoint by dict key rather than as object attributes
+        if eventpoint['status'] == "start":
+            self.liv_speaking = True
+            logger.info("tts start")  # the old log text here was probably a typo; "start" fits better
+        if eventpoint['status'] == "end":
+            self.liv_speaking = False
+            logger.info("tts end")
 
     def start_recording(self):
         """Start video recording"""
@@ -177,9 +186,9 @@ class BaseReal:
                    '-s', "{}x{}".format(self.width, self.height),
                    '-r', str(25),
                    '-i', '-',
-                   '-pix_fmt', 'yuv420p', 
+                   '-pix_fmt', 'yuv420p',
                    '-vcodec', "h264",
-                   #'-f' , 'flv', 
+                   #'-f' , 'flv',
                    f'temp{self.opt.sessionid}.mp4']
         self._record_video_pipe = subprocess.Popen(command, shell=False,
                                                    stdin=subprocess.PIPE)
@@ -191,7 +200,7 @@ class BaseReal:
                    '-ar', '16000',
                    '-i', '-',
                    '-acodec', 'aac',
-                   #'-f' , 'wav', 
+                   #'-f' , 'wav',
                    f'temp{self.opt.sessionid}.aac']
         self._record_audio_pipe = subprocess.Popen(acommand, shell=False,
                                                    stdin=subprocess.PIPE)
 
         # self.recordq_video.queue.clear()
         # self.recordq_audio.queue.clear()
         # self.container = av.open(path, mode="w")
-        
+
         # process_thread = Thread(target=self.record_frame, args=())
         # process_thread.start()
-    
+
     def record_video_data(self,image):
         if self.width == 0:
             print("image.shape:",image.shape)
@@ -213,14 +222,14 @@ class BaseReal:
     def record_audio_data(self,frame):
         if self.recording:
             self._record_audio_pipe.stdin.write(frame.tostring())
-    
-    # def record_frame(self): 
+
+    # def record_frame(self):
     #     videostream = self.container.add_stream("libx264", rate=25)
     #     videostream.codec_context.time_base = Fraction(1, 25)
     #     audiostream = self.container.add_stream("aac")
     #     audiostream.codec_context.time_base = Fraction(1, 16000)
     #     init = True
-    #     framenum = 0 
+    #     framenum = 0
     #     while self.recording:
     #         try:
     #             videoframe = self.recordq_video.get(block=True, timeout=1)
@@ -253,18 +262,18 @@ class BaseReal:
     #     self.recordq_video.queue.clear()
    #     self.recordq_audio.queue.clear()
    #     print('record thread stop')
-    
+
     def stop_recording(self):
         """Stop video recording"""
         if not self.recording:
             return
-        self.recording = False 
-        self._record_video_pipe.stdin.close()  #wait() 
+        self.recording = False
+        self._record_video_pipe.stdin.close()  #wait()
         self._record_video_pipe.wait()
         self._record_audio_pipe.stdin.close()
         self._record_audio_pipe.wait()
         cmd_combine_audio = f"ffmpeg -y -i temp{self.opt.sessionid}.aac -i temp{self.opt.sessionid}.mp4 -c:v copy -c:a copy data/record.mp4"
-        os.system(cmd_combine_audio) 
+        os.system(cmd_combine_audio)
         #os.remove(output_path)
 
     def mirror_index(self,size, index):
@@ -274,8 +283,8 @@ class BaseReal:
         if turn % 2 == 0:
             return res
         else:
-            return size - res - 1 
-    
+            return size - res - 1
+
     def get_audio_stream(self,audiotype):
         idx = self.custom_audio_index[audiotype]
         stream = self.custom_audio_cycle[audiotype][idx:idx+self.chunk]
@@ -283,7 +292,7 @@ class BaseReal:
         if self.custom_audio_index[audiotype]>=self.custom_audio_cycle[audiotype].shape[0]:
             self.curr_state = 1  #the current video does not loop; switch to the silent state
         return stream
-    
+
     def set_custom_state(self,audiotype, reinit=True):
         print('set_custom_state:',audiotype)
         if self.custom_audio_index.get(audiotype) is None:
@@ -295,14 +304,14 @@ class BaseReal:
 
     def process_frames(self,quit_event,loop=None,audio_track=None,video_track=None):
         enable_transition = False  # False disables the transition effect, True enables it
-        
+
         if enable_transition:
             _last_speaking = False
             _transition_start = time.time()
             _transition_duration = 0.1  # transition duration
             _last_silent_frame = None   # cache of the last silent frame
             _last_speaking_frame = None # cache of the last speaking frame
-        
+
         if self.opt.transport=='virtualcam':
             import pyvirtualcam
             vircam = None
@@ -310,13 +319,13 @@
             audio_tmp = queue.Queue(maxsize=3000)
             audio_thread = Thread(target=play_audio, args=(quit_event,audio_tmp,), daemon=True, name="pyaudio_stream")
             audio_thread.start()
-        
+
         while not quit_event.is_set():
             try:
                 res_frame,idx,audio_frames = self.res_frame_queue.get(block=True, timeout=1)
             except queue.Empty:
                 continue
-            
+
             if enable_transition:
                 # detect a speaking-state change
                 current_speaking = not (audio_frames[0][1]!=0 and audio_frames[1][1]!=0)
@@ -334,7 +343,7 @@
                     self.custom_index[audiotype] += 1
                 else:
                     target_frame = self.frame_list_cycle[idx]
-            
+
             if enable_transition:
                 # speaking -> silent transition
                 if time.time() - _transition_start < _transition_duration and _last_speaking_frame is not None:
@@ -394,8 +403,8 @@ class BaseReal:
         if self.opt.transport=='virtualcam':
             audio_thread.join()
             vircam.close()
-        logger.info('basereal process_frames thread stop') 
-    
+        logger.info('basereal process_frames thread stop')
+
     # def process_custom(self,audiotype:int,idx:int):
     #     if self.curr_state!=audiotype:  #switch from inference to scripted narration
     #         if idx in self.switch_pos:  #switching is allowed at a cut point
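A note on the transition bookkeeping above: current_speaking treats a video
frame as speaking unless both of the 20 ms audio chunks behind it carry a
non-zero type flag. Judging from this patch alone (put_audio_frame's
"16khz 20ms pcm" comment and curr_state = 1 marking the silent state in
get_audio_stream), type 0 appears to mean TTS speech and non-zero values
silence or a custom audio state; that reading is an inference, not something
the patch states. As a standalone sketch:

    # Sketch of the per-frame silence test used in process_frames().
    # Each audio_frames entry is assumed to be a (chunk, type) pair with
    # type 0 = TTS speech and non-zero = silence/custom audio.
    def frame_is_speaking(audio_frames):
        return not (audio_frames[0][1] != 0 and audio_frames[1][1] != 0)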
diff --git a/ttsreal.py b/ttsreal.py
index 186ff59..6b75217 100644
--- a/ttsreal.py
+++ b/ttsreal.py
@@ -253,7 +253,8 @@ class SovitsTTS(BaseTTS):
             'prompt_text':reftext,
             'prompt_lang':language,
             'media_type':'ogg',
-            'streaming_mode':True
+            'streaming_mode':True,
+            "speed_factor":1.2
         }
         # req["text"] = text
         # req["text_language"] = language
@@ -467,7 +468,7 @@ class TencentTTS(BaseTTS):
 
         try:
             res = requests.post(url, headers=headers, data=json.dumps(params), stream=True)
-            
+
             end = time.perf_counter()
             logger.info(f"tencent Time to make POST: {end-start}s")
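Finally, on the consumer side: app.py's echo branch now sets liv_speaking
directly before queueing text, and BaseReal.is_speaking() returns the same
flag, so a caller can poll it to learn when TTS playback has drained. A
minimal sketch; wait_until_quiet, the 0.1 s interval, and the way the session
object is obtained are illustrative, not part of this patch:

    import asyncio

    # Sketch: poll the renamed flag through the real is_speaking() accessor.
    # 'real' stands for one session object out of app.py's nerfreals dict.
    async def wait_until_quiet(real, interval=0.1):
        # liv_speaking goes True on the TTS 'start' eventpoint, False on 'end'
        while real.is_speaking():
            await asyncio.sleep(interval)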