diff --git a/.gitignore b/.gitignore
index 28cf920..d5c3b1e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -15,4 +15,7 @@ pretrained
 *.mp4
 .DS_Store
 workspace/log_ngp.txt
-.idea
\ No newline at end of file
+.idea
+keep_gpu.py
+models/
+*.log
\ No newline at end of file
diff --git a/app.py b/app.py
index 1d44282..70a489f 100644
--- a/app.py
+++ b/app.py
@@ -473,6 +473,7 @@ if __name__ == '__main__':
     elif opt.transport=='rtcpush':
         pagename='rtcpushapi.html'
     logger.info('start http server; http://:'+str(opt.listenport)+'/'+pagename)
+    logger.info('If using webrtc, the integrated webrtc frontend is recommended: http://:'+str(opt.listenport)+'/dashboard.html')
     def run_server(runner):
         loop = asyncio.new_event_loop()
         asyncio.set_event_loop(loop)
diff --git a/musereal.py b/musereal.py
index 2e75806..447357c 100644
--- a/musereal.py
+++ b/musereal.py
@@ -267,23 +267,44 @@ class MuseReal(BaseReal):
 
     def process_frames(self,quit_event,loop=None,audio_track=None,video_track=None):
+        # new state-tracking variables
+        self.last_speaking = False
+        self.transition_start = time.time()
+        self.transition_duration = 0.1  # transition duration in seconds
+        self.last_silent_frame = None  # cached silent frame
+        self.last_speaking_frame = None  # cached speaking frame
         while not quit_event.is_set():
             try:
                 res_frame,idx,audio_frames = self.res_frame_queue.get(block=True, timeout=1)
             except queue.Empty:
                 continue
-            if audio_frames[0][1]!=0 and audio_frames[1][1]!=0: #all frames are silent, just take the full image
+
+            # detect state changes
+            current_speaking = not (audio_frames[0][1]!=0 and audio_frames[1][1]!=0)
+            if current_speaking != self.last_speaking:
+                logger.info(f"State change: {'silent' if self.last_speaking else 'speaking'} → {'speaking' if current_speaking else 'silent'}")
+                self.transition_start = time.time()
+                self.last_speaking = current_speaking
+
+            if audio_frames[0][1]!=0 and audio_frames[1][1]!=0:
                 self.speaking = False
                 audiotype = audio_frames[0][1]
-                if self.custom_index.get(audiotype) is not None: #custom video available
+                if self.custom_index.get(audiotype) is not None:
                     mirindex = self.mirror_index(len(self.custom_img_cycle[audiotype]),self.custom_index[audiotype])
-                    combine_frame = self.custom_img_cycle[audiotype][mirindex]
+                    target_frame = self.custom_img_cycle[audiotype][mirindex]
                     self.custom_index[audiotype] += 1
-                    # if not self.custom_opt[audiotype].loop and self.custom_index[audiotype]>=len(self.custom_img_cycle[audiotype]):
-                    #     self.curr_state = 1 #the current video does not loop; switch to the silent state
                 else:
-                    combine_frame = self.frame_list_cycle[idx]
+                    target_frame = self.frame_list_cycle[idx]
+
+                # speaking → silent transition
+                if time.time() - self.transition_start < self.transition_duration and self.last_speaking_frame is not None:
+                    alpha = min(1.0, (time.time() - self.transition_start) / self.transition_duration)
+                    combine_frame = cv2.addWeighted(self.last_speaking_frame, 1-alpha, target_frame, alpha, 0)
+                else:
+                    combine_frame = target_frame
+                # cache the silent frame
+                self.last_silent_frame = combine_frame.copy()
             else:
                 self.speaking = True
                 bbox = self.coord_list_cycle[idx]
@@ -291,20 +312,26 @@ class MuseReal(BaseReal):
                 x1, y1, x2, y2 = bbox
                 try:
                     res_frame = cv2.resize(res_frame.astype(np.uint8),(x2-x1,y2-y1))
-                except:
+                except Exception as e:
+                    logger.warning(f"resize error: {e}")
                     continue
                 mask = self.mask_list_cycle[idx]
                 mask_crop_box = self.mask_coords_list_cycle[idx]
-                #combine_frame = get_image(ori_frame,res_frame,bbox)
-                #t=time.perf_counter()
-                combine_frame = get_image_blending(ori_frame,res_frame,bbox,mask,mask_crop_box)
-                #print('blending time:',time.perf_counter()-t)
-            image = combine_frame #(outputs['image'] * 255).astype(np.uint8)
+                # silent → speaking transition
+                current_frame = get_image_blending(ori_frame,res_frame,bbox,mask,mask_crop_box)
+                if time.time() - self.transition_start < self.transition_duration and self.last_silent_frame is not None:
+                    alpha = min(1.0, (time.time() - self.transition_start) / self.transition_duration)
+                    combine_frame = cv2.addWeighted(self.last_silent_frame, 1-alpha, current_frame, alpha, 0)
+                else:
+                    combine_frame = current_frame
+                # cache the speaking frame
+                self.last_speaking_frame = combine_frame.copy()
+
+            image = combine_frame
             new_frame = VideoFrame.from_ndarray(image, format="bgr24")
             asyncio.run_coroutine_threadsafe(video_track._queue.put((new_frame,None)), loop)
             self.record_video_data(image)
-            #self.recordq_video.put(new_frame)
 
             for audio_frame in audio_frames:
                 frame,type,eventpoint = audio_frame
@@ -312,12 +339,8 @@ class MuseReal(BaseReal):
                 new_frame = AudioFrame(format='s16', layout='mono', samples=frame.shape[0])
                 new_frame.planes[0].update(frame.tobytes())
                 new_frame.sample_rate=16000
-                # if audio_track._queue.qsize()>10:
-                #     time.sleep(0.1)
                 asyncio.run_coroutine_threadsafe(audio_track._queue.put((new_frame,eventpoint)), loop)
                 self.record_audio_data(frame)
-                #self.notify(eventpoint)
-                #self.recordq_audio.put(new_frame)
         logger.info('musereal process_frames thread stop')
 
     def render(self,quit_event,loop=None,audio_track=None,video_track=None):
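Note on the musereal.py hunk: it crossfades between a frame cached from the previous state and the first frames of the new state over a 0.1 s window, so the avatar no longer snaps between the idle video and the lip-synced render. A minimal standalone sketch of that blend, with hypothetical frames and names (cv2.addWeighted and the window length are taken from the diff, the rest is illustration):

import time

import cv2
import numpy as np

TRANSITION_DURATION = 0.1  # seconds; mirrors self.transition_duration above

def blend_transition(prev_frame, target_frame, transition_start):
    """Crossfade from the previous state's cached frame to the target frame
    while the transition window is still open; otherwise pass through.
    Both frames must share the same shape and dtype for addWeighted."""
    elapsed = time.time() - transition_start
    if prev_frame is None or elapsed >= TRANSITION_DURATION:
        return target_frame
    alpha = min(1.0, elapsed / TRANSITION_DURATION)
    # weight shifts from the old frame (1 - alpha) to the new one (alpha)
    return cv2.addWeighted(prev_frame, 1 - alpha, target_frame, alpha, 0)

# hypothetical BGR frames standing in for the cached/new frames
old = np.zeros((480, 640, 3), dtype=np.uint8)
new = np.full((480, 640, 3), 255, dtype=np.uint8)
frame = blend_transition(old, new, time.time())  # partway through the fade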
diff --git a/musetalk/utils/face_parsing/resnet.py b/musetalk/utils/face_parsing/resnet.py
index a306abb..e2e5d87 100755
--- a/musetalk/utils/face_parsing/resnet.py
+++ b/musetalk/utils/face_parsing/resnet.py
@@ -80,7 +80,7 @@ class Resnet18(nn.Module):
         return feat8, feat16, feat32
 
     def init_weight(self, model_path):
-        state_dict = torch.load(model_path, weights_only=False) #modelzoo.load_url(resnet18_url)
+        state_dict = torch.load(model_path) #modelzoo.load_url(resnet18_url)
         self_state_dict = self.state_dict()
         for k, v in state_dict.items():
             if 'fc' in k: continue
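Context for the resnet.py hunk: torch.load's weights_only parameter has defaulted to True since PyTorch 2.6 (it was added in 1.13 with a False default), so dropping the explicit argument makes the load behave differently depending on the installed version. A version-tolerant sketch, as an illustration rather than the PR's approach:

import torch

def load_checkpoint(model_path):
    """Prefer the safe weights-only unpickler; fall back to a full load
    for trusted checkpoints that contain non-tensor objects."""
    try:
        # weights_only exists since PyTorch 1.13; default True since 2.6
        return torch.load(model_path, map_location='cpu', weights_only=True)
    except TypeError:
        # older PyTorch without the weights_only keyword
        return torch.load(model_path, map_location='cpu')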
diff --git a/web/dashboard.html b/web/dashboard.html
new file mode 100644
index 0000000..48ad471
--- /dev/null
+++ b/web/dashboard.html
@@ -0,0 +1,772 @@
+<!-- New 772-line page "livetalking Digital Human Interaction Platform"; the
+     markup was lost in extraction. Surviving UI text: a connection status
+     badge ("Not connected"), a "Recording" indicator, the system greeting
+     "Welcome to livetalking. Click the 'Start Connection' button to begin a
+     conversation.", and the push-to-talk hint "Hold to talk, release to
+     send". -->
\ No newline at end of file
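For completeness, the silent branch of the musereal.py hunk advances the custom idle video through self.mirror_index(...). A minimal sketch of the ping-pong indexing this implies (an assumed formulation; the real helper lives on the class):

def mirror_index(size, index):
    """Fold an ever-growing index into a forward-then-backward walk over a
    cycle of `size` frames, so a looping idle video reverses smoothly
    instead of jumping at the loop point."""
    turn, res = divmod(index, size)
    return res if turn % 2 == 0 else size - res - 1

# size=4 walks 0 1 2 3 3 2 1 0 0 1 2 3 ...
assert [mirror_index(4, i) for i in range(8)] == [0, 1, 2, 3, 3, 2, 1, 0]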