From 2b81272a8efe69fc8c42948996244b8c730a3cca Mon Sep 17 00:00:00 2001
From: fanpt <320622572@qq.com>
Date: Thu, 28 Aug 2025 15:50:15 +0800
Subject: [PATCH] Modify is_speaking state
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
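Notes: this patch renames BaseReal's speaking flag to liv_speaking, drives it
from TTS eventpoints in notify(), and reports it through is_speaking().
notify() now indexes eventpoint as a dict, so every eventpoint that reaches it
must carry a 'status' key; eventpoint.get('status') would be the defensive
variant if some producers omit it. Below is a minimal sketch of the producer
side this contract assumes, on the further assumption that eventpoints
attached to audio frames are delivered back to notify() when the frames are
played, which is how the 'start'/'end' statuses handled here appear to be
produced. emit_with_events and the bare {'status': ...} payloads are
illustrative, not code taken from ttsreal.py; real eventpoints may carry
extra keys.

    # Sketch only: the eventpoint shape notify() relies on after this patch.
    # put_audio_frame() is the real entry point (see basereal.py below);
    # everything else here is assumed for illustration.
    def emit_with_events(real, pcm_chunks):
        # assumes at least two 20 ms chunks; production code would guard that
        real.put_audio_frame(pcm_chunks[0], eventpoint={'status': 'start'})  # notify(): liv_speaking = True
        for chunk in pcm_chunks[1:-1]:
            real.put_audio_frame(chunk)
        real.put_audio_frame(pcm_chunks[-1], eventpoint={'status': 'end'})   # notify(): liv_speaking = False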
 app.py      |  2 +-
 basereal.py | 73 ++++++++++++++++++++++++++++++-----------------------
 ttsreal.py  |  5 ++--
 3 files changed, 45 insertions(+), 35 deletions(-)

diff --git a/app.py b/app.py
index 94c3156..2c33e80 100644
--- a/app.py
+++ b/app.py
@@ -217,7 +217,7 @@ async def human(request):
         nerfreals[sessionid].flush_talk()
 
     if params['type'] == 'echo':
-        nerfreals[sessionid].speaking = True
+        nerfreals[sessionid].liv_speaking = True
         nerfreals[sessionid].put_msg_txt(params['text'])
 
     elif params['type'] == 'chat':
diff --git a/basereal.py b/basereal.py
index 3f8b245..4ddda36 100644
--- a/basereal.py
+++ b/basereal.py
@@ -50,7 +50,7 @@ def read_imgs(img_list):
         frames.append(frame)
     return frames
 
-def play_audio(quit_event,queue): 
+def play_audio(quit_event,queue):
     import pyaudio
     p = pyaudio.PyAudio()
     stream = p.open(
@@ -88,9 +88,11 @@ class BaseReal:
             self.tts = TencentTTS(opt,self)
         elif opt.tts == "doubao":
             self.tts = DoubaoTTS(opt,self)
-        
+
         self.speaking = False
+        self.liv_speaking = False
+
         self.recording = False
         self._record_video_pipe = None
         self._record_audio_pipe = None
@@ -106,11 +108,11 @@ class BaseReal:
 
     def put_msg_txt(self,msg,eventpoint=None):
         self.tts.put_msg_txt(msg,eventpoint)
-    
+
     def put_audio_frame(self,audio_chunk,eventpoint=None): #16khz 20ms pcm
         self.asr.put_audio_frame(audio_chunk,eventpoint)
 
-    def put_audio_file(self,filebyte): 
+    def put_audio_file(self,filebyte):
         input_stream = BytesIO(filebyte)
         stream = self.__create_bytes_stream(input_stream)
         streamlen = stream.shape[0]
@@ -119,7 +121,7 @@ class BaseReal:
             self.put_audio_frame(stream[idx:idx+self.chunk])
             streamlen -= self.chunk
             idx += self.chunk
-    
+
     def __create_bytes_stream(self,byte_stream): #byte_stream=BytesIO(buffer)
         stream, sample_rate = sf.read(byte_stream) # [T*sample_rate,] float64
@@ -129,7 +131,7 @@ class BaseReal:
         if stream.ndim > 1:
             logger.info(f'[WARN] audio has {stream.shape[1]} channels, only use the first.')
             stream = stream[:, 0]
-    
+
         if sample_rate != self.sample_rate and stream.shape[0]>0:
             logger.info(f'[WARN] audio sample rate is {sample_rate}, resampling into {self.sample_rate}.')
             stream = resampy.resample(x=stream, sr_orig=sample_rate, sr_new=self.sample_rate)
@@ -141,8 +143,8 @@ class BaseReal:
         self.asr.flush_talk()
 
     def is_speaking(self)->bool:
-        return self.speaking
-    
+        return self.liv_speaking
+
     def __loadcustom(self):
         for item in self.opt.customopt:
             logger.info(item)
@@ -161,8 +163,15 @@ class BaseReal:
         for key in self.custom_index:
             self.custom_index[key]=0
 
-    def notify(self,eventpoint):
-        logger.info("notify:%s",eventpoint)
+    def notify(self, eventpoint):
+        logger.info("notify:%s", eventpoint)
+        # access eventpoint by dict key rather than as object attributes
+        if eventpoint['status'] == "start":
+            self.liv_speaking = True
+            logger.info("tts start")  # the old log text here was probably a typo; "start" fits better
+        if eventpoint['status'] == "end":
+            self.liv_speaking = False
+            logger.info("tts end")
 
     def start_recording(self):
         """Start video recording"""
@@ -177,9 +186,9 @@ class BaseReal:
                    '-s', "{}x{}".format(self.width, self.height),
                    '-r', str(25),
                    '-i', '-',
-                   '-pix_fmt', 'yuv420p', 
+                   '-pix_fmt', 'yuv420p',
                    '-vcodec', "h264",
-                   #'-f' , 'flv', 
+                   #'-f' , 'flv',
                    f'temp{self.opt.sessionid}.mp4']
         self._record_video_pipe = subprocess.Popen(command, shell=False,
                                                    stdin=subprocess.PIPE)
@@ -191,7 +200,7 @@ class BaseReal:
                    '-ar', '16000',
                    '-i', '-',
                    '-acodec', 'aac',
-                   #'-f' , 'wav', 
+                   #'-f' , 'wav',
                    f'temp{self.opt.sessionid}.aac']
         self._record_audio_pipe = subprocess.Popen(acommand, shell=False,
                                                    stdin=subprocess.PIPE)
 
         # self.recordq_video.queue.clear()
         # self.recordq_audio.queue.clear()
         # self.container = av.open(path, mode="w")
-        
+
         # process_thread = Thread(target=self.record_frame, args=())
         # process_thread.start()
-    
+
     def record_video_data(self,image):
         if self.width == 0:
             print("image.shape:",image.shape)
@@ -213,14 +222,14 @@ class BaseReal:
     def record_audio_data(self,frame):
         if self.recording:
             self._record_audio_pipe.stdin.write(frame.tostring())
-    
-    # def record_frame(self): 
+
+    # def record_frame(self):
     #     videostream = self.container.add_stream("libx264", rate=25)
     #     videostream.codec_context.time_base = Fraction(1, 25)
     #     audiostream = self.container.add_stream("aac")
     #     audiostream.codec_context.time_base = Fraction(1, 16000)
     #     init = True
-    #     framenum = 0 
+    #     framenum = 0
     #     while self.recording:
     #         try:
     #             videoframe = self.recordq_video.get(block=True, timeout=1)
@@ -253,18 +262,18 @@ class BaseReal:
     #     self.recordq_video.queue.clear()
    #     self.recordq_audio.queue.clear()
    #     print('record thread stop')
-    
+
     def stop_recording(self):
         """Stop video recording"""
         if not self.recording:
             return
-        self.recording = False 
-        self._record_video_pipe.stdin.close()  #wait() 
+        self.recording = False
+        self._record_video_pipe.stdin.close()  #wait()
         self._record_video_pipe.wait()
         self._record_audio_pipe.stdin.close()
         self._record_audio_pipe.wait()
         cmd_combine_audio = f"ffmpeg -y -i temp{self.opt.sessionid}.aac -i temp{self.opt.sessionid}.mp4 -c:v copy -c:a copy data/record.mp4"
-        os.system(cmd_combine_audio) 
+        os.system(cmd_combine_audio)
         #os.remove(output_path)
 
     def mirror_index(self,size, index):
@@ -274,8 +283,8 @@ class BaseReal:
         if turn % 2 == 0:
             return res
         else:
-            return size - res - 1 
-    
+            return size - res - 1
+
     def get_audio_stream(self,audiotype):
         idx = self.custom_audio_index[audiotype]
         stream = self.custom_audio_cycle[audiotype][idx:idx+self.chunk]
@@ -283,7 +292,7 @@ class BaseReal:
         if self.custom_audio_index[audiotype]>=self.custom_audio_cycle[audiotype].shape[0]:
             self.curr_state = 1  #the current video does not loop; switch to the silent state
         return stream
-    
+
     def set_custom_state(self,audiotype, reinit=True):
         print('set_custom_state:',audiotype)
         if self.custom_audio_index.get(audiotype) is None:
@@ -295,14 +304,14 @@ class BaseReal:
 
     def process_frames(self,quit_event,loop=None,audio_track=None,video_track=None):
         enable_transition = False  # False disables the transition effect, True enables it
-        
+
         if enable_transition:
             _last_speaking = False
             _transition_start = time.time()
             _transition_duration = 0.1  # transition duration
             _last_silent_frame = None   # cache of the last silent frame
             _last_speaking_frame = None # cache of the last speaking frame
-        
+
         if self.opt.transport=='virtualcam':
             import pyvirtualcam
             vircam = None
@@ -310,13 +319,13 @@
             audio_tmp = queue.Queue(maxsize=3000)
             audio_thread = Thread(target=play_audio, args=(quit_event,audio_tmp,), daemon=True, name="pyaudio_stream")
             audio_thread.start()
-        
+
         while not quit_event.is_set():
             try:
                 res_frame,idx,audio_frames = self.res_frame_queue.get(block=True, timeout=1)
             except queue.Empty:
                 continue
-            
+
             if enable_transition:
                 # detect a speaking-state change
                 current_speaking = not (audio_frames[0][1]!=0 and audio_frames[1][1]!=0)
@@ -334,7 +343,7 @@
                     self.custom_index[audiotype] += 1
                 else:
                     target_frame = self.frame_list_cycle[idx]
-            
+
             if enable_transition:
                 # speaking -> silent transition
                 if time.time() - _transition_start < _transition_duration and _last_speaking_frame is not None:
@@ -394,8 +403,8 @@ class BaseReal:
         if self.opt.transport=='virtualcam':
             audio_thread.join()
             vircam.close()
-        logger.info('basereal process_frames thread stop') 
-    
+        logger.info('basereal process_frames thread stop')
+
     # def process_custom(self,audiotype:int,idx:int):
     #     if self.curr_state!=audiotype:  #switch from inference to scripted narration
     #         if idx in self.switch_pos:  #switching is allowed at a cut point
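A note on the transition bookkeeping above: current_speaking treats a video
frame as speaking unless both of the 20 ms audio chunks behind it carry a
non-zero type flag. Judging from this patch alone (put_audio_frame's
"16khz 20ms pcm" comment and curr_state = 1 marking the silent state in
get_audio_stream), type 0 appears to mean TTS speech and non-zero values
silence or a custom audio state; that reading is an inference, not something
the patch states. As a standalone sketch:

    # Sketch of the per-frame silence test used in process_frames().
    # Each audio_frames entry is assumed to be a (chunk, type) pair with
    # type 0 = TTS speech and non-zero = silence/custom audio.
    def frame_is_speaking(audio_frames):
        return not (audio_frames[0][1] != 0 and audio_frames[1][1] != 0)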
diff --git a/ttsreal.py b/ttsreal.py
index 186ff59..6b75217 100644
--- a/ttsreal.py
+++ b/ttsreal.py
@@ -253,7 +253,8 @@ class SovitsTTS(BaseTTS):
             'prompt_text':reftext,
             'prompt_lang':language,
             'media_type':'ogg',
-            'streaming_mode':True
+            'streaming_mode':True,
+            "speed_factor":1.2
         }
         # req["text"] = text
         # req["text_language"] = language
@@ -467,7 +468,7 @@ class TencentTTS(BaseTTS):
 
         try:
             res = requests.post(url, headers=headers, data=json.dumps(params), stream=True)
-            
+
             end = time.perf_counter()
             logger.info(f"tencent Time to make POST: {end-start}s")
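Finally, on the consumer side: app.py's echo branch now sets liv_speaking
directly before queueing text, and BaseReal.is_speaking() returns the same
flag, so a caller can poll it to learn when TTS playback has drained. A
minimal sketch; wait_until_quiet, the 0.1 s interval, and the way the session
object is obtained are illustrative, not part of this patch:

    import asyncio

    # Sketch: poll the renamed flag through the real is_speaking() accessor.
    # 'real' stands for one session object out of app.py's nerfreals dict.
    async def wait_until_quiet(real, interval=0.1):
        # liv_speaking goes True on the TTS 'start' eventpoint, False on 'end'
        while real.is_speaking():
            await asyncio.sleep(interval)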