diff --git a/README.md b/README.md index c3f97b0..b939694 100644 --- a/README.md +++ b/README.md @@ -13,27 +13,28 @@ - 2025.3.2 添加腾讯语音合成服务 - 2025.3.16 支持mac gpu推理,感谢[@GcsSloop](https://github.com/GcsSloop) - 2025.5.1 精简运行参数,ernerf模型移至git分支ernerf-rtmp +- 2025.6.7 添加虚拟摄像头输出 ## Features 1. 支持多种数字人模型: ernerf、musetalk、wav2lip、Ultralight-Digital-Human 2. 支持声音克隆 3. 支持数字人说话被打断 4. 支持全身视频拼接 -5. 支持rtmp和webrtc -6. 支持视频编排:不说话时播放自定义视频 +5. 支持webrtc、虚拟摄像头输出 +6. 支持动作编排:不说话时播放自定义视频 7. 支持多并发 ## 1. Installation -Tested on Ubuntu 20.04, Python3.10, Pytorch 1.12 and CUDA 11.3 +Tested on Ubuntu 24.04, Python3.10, Pytorch 2.5.0 and CUDA 12.4 ### 1.1 Install dependency ```bash conda create -n nerfstream python=3.10 conda activate nerfstream -#如果cuda版本不为11.3(运行nvidia-smi确认版本),根据安装对应版本的pytorch -conda install pytorch==1.12.1 torchvision==0.13.1 cudatoolkit=11.3 -c pytorch +#如果cuda版本不为12.4(运行nvidia-smi确认版本),根据安装对应版本的pytorch +conda install pytorch==2.5.0 torchvision==0.20.0 torchaudio==2.5.0 pytorch-cuda=12.4 -c pytorch -c nvidia pip install -r requirements.txt #如果需要训练ernerf模型,安装下面的库 # pip install "git+https://github.com/facebookresearch/pytorch3d.git" diff --git a/app.py b/app.py index 4f34051..635aeea 100644 --- a/app.py +++ b/app.py @@ -284,7 +284,7 @@ if __name__ == '__main__': parser.add_argument('--model', type=str, default='musetalk') #musetalk wav2lip ultralight - parser.add_argument('--transport', type=str, default='rtcpush') #rtmp webrtc rtcpush + parser.add_argument('--transport', type=str, default='rtcpush') #webrtc rtcpush virtualcam parser.add_argument('--push_url', type=str, default='http://localhost:1985/rtc/v1/whip/?app=live&stream=livestream') #rtmp://localhost/live/livestream parser.add_argument('--max_session', type=int, default=1) #multi session count @@ -326,6 +326,11 @@ if __name__ == '__main__': # nerfreals[0] = build_nerfreal(0) # rendthrd = Thread(target=nerfreals[0].render,args=(thread_quit,)) # rendthrd.start() + if opt.transport=='virtualcam': + thread_quit = Event() + nerfreals[0] = build_nerfreal(0) + rendthrd = Thread(target=nerfreals[0].render,args=(thread_quit,)) + rendthrd.start() ############################################################################# appasync = web.Application(client_max_size=1024**2*100) diff --git a/basereal.py b/basereal.py index 61aad1c..017f5e7 100644 --- a/basereal.py +++ b/basereal.py @@ -32,6 +32,9 @@ from threading import Thread, Event from io import BytesIO import soundfile as sf +import asyncio +from av import AudioFrame, VideoFrame + import av from fractions import Fraction @@ -47,6 +50,23 @@ def read_imgs(img_list): frames.append(frame) return frames +def play_audio(quit_event,queue): + import pyaudio + p = pyaudio.PyAudio() + stream = p.open( + rate=16000, + channels=1, + format=8, + output=True, + output_device_index=1, + ) + stream.start_stream() + # while queue.qsize() <= 0: + # time.sleep(0.1) + while not quit_event.is_set(): + stream.write(queue.get(block=True)) + stream.close() + class BaseReal: def __init__(self, opt): self.opt = opt @@ -268,6 +288,109 @@ class BaseReal: if reinit: self.custom_audio_index[audiotype] = 0 self.custom_index[audiotype] = 0 + + def process_frames(self,quit_event,loop=None,audio_track=None,video_track=None): + enable_transition = False # 设置为False禁用过渡效果,True启用 + + if enable_transition: + _last_speaking = False + _transition_start = time.time() + _transition_duration = 0.1 # 过渡时间 + _last_silent_frame = None # 静音帧缓存 + _last_speaking_frame = None # 说话帧缓存 + + if self.opt.transport=='virtualcam': + import pyvirtualcam 
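+ # pyvirtualcam needs a virtual camera backend on the host (OBS Virtual Camera
+ # on Windows/macOS, v4l2loopback on Linux); the Camera is created lazily on the
+ # first frame so its resolution matches the rendered output, while audio is
+ # played locally through a pyaudio thread fed from audio_tmp.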
+ vircam = None + + audio_tmp = queue.Queue(maxsize=3000) + audio_thread = Thread(target=play_audio, args=(quit_event,audio_tmp,), daemon=True, name="pyaudio_stream") + audio_thread.start() + + while not quit_event.is_set(): + try: + res_frame,idx,audio_frames = self.res_frame_queue.get(block=True, timeout=1) + except queue.Empty: + continue + + if enable_transition: + # 检测状态变化 + current_speaking = not (audio_frames[0][1]!=0 and audio_frames[1][1]!=0) + if current_speaking != _last_speaking: + logger.info(f"状态切换:{'说话' if _last_speaking else '静音'} → {'说话' if current_speaking else '静音'}") + _transition_start = time.time() + _last_speaking = current_speaking + + if audio_frames[0][1]!=0 and audio_frames[1][1]!=0: #全为静音数据,只需要取fullimg + self.speaking = False + audiotype = audio_frames[0][1] + if self.custom_index.get(audiotype) is not None: #有自定义视频 + mirindex = self.mirror_index(len(self.custom_img_cycle[audiotype]),self.custom_index[audiotype]) + target_frame = self.custom_img_cycle[audiotype][mirindex] + self.custom_index[audiotype] += 1 + else: + target_frame = self.frame_list_cycle[idx] + + if enable_transition: + # 说话→静音过渡 + if time.time() - _transition_start < _transition_duration and _last_speaking_frame is not None: + alpha = min(1.0, (time.time() - _transition_start) / _transition_duration) + combine_frame = cv2.addWeighted(_last_speaking_frame, 1-alpha, target_frame, alpha, 0) + else: + combine_frame = target_frame + # 缓存静音帧 + _last_silent_frame = combine_frame.copy() + else: + combine_frame = target_frame + else: + self.speaking = True + try: + current_frame = self.paste_back_frame(res_frame,idx) + except Exception as e: + logger.warning(f"paste_back_frame error: {e}") + continue + if enable_transition: + # 静音→说话过渡 + if time.time() - _transition_start < _transition_duration and _last_silent_frame is not None: + alpha = min(1.0, (time.time() - _transition_start) / _transition_duration) + combine_frame = cv2.addWeighted(_last_silent_frame, 1-alpha, current_frame, alpha, 0) + else: + combine_frame = current_frame + # 缓存说话帧 + _last_speaking_frame = combine_frame.copy() + else: + combine_frame = current_frame + + if self.opt.transport=='virtualcam': + if vircam==None: + height, width,_= combine_frame.shape + vircam = pyvirtualcam.Camera(width=width, height=height, fps=25, fmt=pyvirtualcam.PixelFormat.BGR,print_fps=True) + vircam.send(combine_frame) + else: #webrtc + image = combine_frame + image[0,:] &= 0xFE + new_frame = VideoFrame.from_ndarray(image, format="bgr24") + asyncio.run_coroutine_threadsafe(video_track._queue.put((new_frame,None)), loop) + self.record_video_data(combine_frame) + + for audio_frame in audio_frames: + frame,type,eventpoint = audio_frame + frame = (frame * 32767).astype(np.int16) + + if self.opt.transport=='virtualcam': + audio_tmp.put(frame.tobytes()) #TODO + else: #webrtc + new_frame = AudioFrame(format='s16', layout='mono', samples=frame.shape[0]) + new_frame.planes[0].update(frame.tobytes()) + new_frame.sample_rate=16000 + asyncio.run_coroutine_threadsafe(audio_track._queue.put((new_frame,eventpoint)), loop) + self.record_audio_data(frame) + if self.opt.transport=='virtualcam': + vircam.sleep_until_next_frame() + if self.opt.transport=='virtualcam': + audio_thread.join() + vircam.close() + logger.info('basereal process_frames thread stop') # def process_custom(self,audiotype:int,idx:int): # if self.curr_state!=audiotype: #从推理切到口播 diff --git a/lightreal.py b/lightreal.py index 34a9b69..491df44 100644 --- a/lightreal.py +++ b/lightreal.py @@ -248,60 +248,19 @@ 
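+# Frame compositing, silence/speaking transitions and transport output now live in
+# BaseReal.process_frames(); LightReal only supplies the model-specific paste_back_frame().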
class LightReal(BaseReal): def __del__(self): logger.info(f'lightreal({self.sessionid}) delete') - - def process_frames(self,quit_event,loop=None,audio_track=None,video_track=None): - - while not quit_event.is_set(): - try: - res_frame,idx,audio_frames = self.res_frame_queue.get(block=True, timeout=1) - except queue.Empty: - continue - if audio_frames[0][1]!=0 and audio_frames[1][1]!=0: #全为静音数据,只需要取fullimg - self.speaking = False - audiotype = audio_frames[0][1] - if self.custom_index.get(audiotype) is not None: #有自定义视频 - mirindex = self.mirror_index(len(self.custom_img_cycle[audiotype]),self.custom_index[audiotype]) - combine_frame = self.custom_img_cycle[audiotype][mirindex] - self.custom_index[audiotype] += 1 - # if not self.custom_opt[audiotype].loop and self.custom_index[audiotype]>=len(self.custom_img_cycle[audiotype]): - # self.curr_state = 1 #当前视频不循环播放,切换到静音状态 - else: - combine_frame = self.frame_list_cycle[idx] - #combine_frame = self.imagecache.get_img(idx) - else: - self.speaking = True - bbox = self.coord_list_cycle[idx] - combine_frame = copy.deepcopy(self.frame_list_cycle[idx]) - x1, y1, x2, y2 = bbox - - crop_img = self.face_list_cycle[idx] - crop_img_ori = crop_img.copy() - #res_frame = np.array(res_frame, dtype=np.uint8) - try: - crop_img_ori[4:164, 4:164] = res_frame.astype(np.uint8) - crop_img_ori = cv2.resize(crop_img_ori, (x2-x1,y2-y1)) - except: - continue - combine_frame[y1:y2, x1:x2] = crop_img_ori - #print('blending time:',time.perf_counter()-t) - - combine_frame[0,:] &= 0xFE - new_frame = VideoFrame.from_ndarray(combine_frame, format="bgr24") - asyncio.run_coroutine_threadsafe(video_track._queue.put((new_frame,None)), loop) - self.record_video_data(combine_frame) - - for audio_frame in audio_frames: - frame,type_,eventpoint = audio_frame - frame = (frame * 32767).astype(np.int16) - new_frame = AudioFrame(format='s16', layout='mono', samples=frame.shape[0]) - new_frame.planes[0].update(frame.tobytes()) - new_frame.sample_rate=16000 - # if audio_track._queue.qsize()>10: - # time.sleep(0.1) - asyncio.run_coroutine_threadsafe(audio_track._queue.put((new_frame,eventpoint)), loop) - self.record_audio_data(frame) - #self.notify(eventpoint) - logger.info('lightreal process_frames thread stop') + def paste_back_frame(self,pred_frame,idx:int): + bbox = self.coord_list_cycle[idx] + combine_frame = copy.deepcopy(self.frame_list_cycle[idx]) + x1, y1, x2, y2 = bbox + + crop_img = self.face_list_cycle[idx] + crop_img_ori = crop_img.copy() + #res_frame = np.array(res_frame, dtype=np.uint8) + + crop_img_ori[4:164, 4:164] = pred_frame.astype(np.uint8) + crop_img_ori = cv2.resize(crop_img_ori, (x2-x1,y2-y1)) + combine_frame[y1:y2, x1:x2] = crop_img_ori + return combine_frame def render(self,quit_event,loop=None,audio_track=None,video_track=None): #if self.opt.asr: @@ -329,7 +288,7 @@ class LightReal(BaseReal): # if video_track._queue.qsize()>=2*self.opt.batch_size: # print('sleep qsize=',video_track._queue.qsize()) # time.sleep(0.04*video_track._queue.qsize()*0.8) - if video_track._queue.qsize()>=5: + if video_track and video_track._queue.qsize()>=5: logger.debug('sleep qsize=%d',video_track._queue.qsize()) time.sleep(0.04*video_track._queue.qsize()*0.8) diff --git a/lipreal.py b/lipreal.py index 57fc1e2..128a749 100644 --- a/lipreal.py +++ b/lipreal.py @@ -206,59 +206,16 @@ class LipReal(BaseReal): def __del__(self): logger.info(f'lipreal({self.sessionid}) delete') - - def process_frames(self,quit_event,loop=None,audio_track=None,video_track=None): - - while not 
quit_event.is_set(): - try: - res_frame,idx,audio_frames = self.res_frame_queue.get(block=True, timeout=1) - except queue.Empty: - continue - if audio_frames[0][1]!=0 and audio_frames[1][1]!=0: #全为静音数据,只需要取fullimg - self.speaking = False - audiotype = audio_frames[0][1] - if self.custom_index.get(audiotype) is not None: #有自定义视频 - mirindex = self.mirror_index(len(self.custom_img_cycle[audiotype]),self.custom_index[audiotype]) - combine_frame = self.custom_img_cycle[audiotype][mirindex] - self.custom_index[audiotype] += 1 - # if not self.custom_opt[audiotype].loop and self.custom_index[audiotype]>=len(self.custom_img_cycle[audiotype]): - # self.curr_state = 1 #当前视频不循环播放,切换到静音状态 - else: - combine_frame = self.frame_list_cycle[idx] - #combine_frame = self.imagecache.get_img(idx) - else: - self.speaking = True - bbox = self.coord_list_cycle[idx] - combine_frame = copy.deepcopy(self.frame_list_cycle[idx]) - #combine_frame = copy.deepcopy(self.imagecache.get_img(idx)) - y1, y2, x1, x2 = bbox - try: - res_frame = cv2.resize(res_frame.astype(np.uint8),(x2-x1,y2-y1)) - except: - continue - #combine_frame = get_image(ori_frame,res_frame,bbox) - #t=time.perf_counter() - combine_frame[y1:y2, x1:x2] = res_frame - #print('blending time:',time.perf_counter()-t) - - image = combine_frame #(outputs['image'] * 255).astype(np.uint8) - image[0,:] &= 0xFE - new_frame = VideoFrame.from_ndarray(image, format="bgr24") - asyncio.run_coroutine_threadsafe(video_track._queue.put((new_frame,None)), loop) - self.record_video_data(image) - - for audio_frame in audio_frames: - frame,type,eventpoint = audio_frame - frame = (frame * 32767).astype(np.int16) - new_frame = AudioFrame(format='s16', layout='mono', samples=frame.shape[0]) - new_frame.planes[0].update(frame.tobytes()) - new_frame.sample_rate=16000 - # if audio_track._queue.qsize()>10: - # time.sleep(0.1) - asyncio.run_coroutine_threadsafe(audio_track._queue.put((new_frame,eventpoint)), loop) - self.record_audio_data(frame) - #self.notify(eventpoint) - logger.info('lipreal process_frames thread stop') + def paste_back_frame(self,pred_frame,idx:int): + bbox = self.coord_list_cycle[idx] + combine_frame = copy.deepcopy(self.frame_list_cycle[idx]) + #combine_frame = copy.deepcopy(self.imagecache.get_img(idx)) + y1, y2, x1, x2 = bbox + res_frame = cv2.resize(pred_frame.astype(np.uint8),(x2-x1,y2-y1)) + #combine_frame = get_image(ori_frame,res_frame,bbox) + #t=time.perf_counter() + combine_frame[y1:y2, x1:x2] = res_frame + return combine_frame def render(self,quit_event,loop=None,audio_track=None,video_track=None): #if self.opt.asr: @@ -287,7 +244,7 @@ class LipReal(BaseReal): # if video_track._queue.qsize()>=2*self.opt.batch_size: # print('sleep qsize=',video_track._queue.qsize()) # time.sleep(0.04*video_track._queue.qsize()*0.8) - if video_track._queue.qsize()>=5: + if video_track and video_track._queue.qsize()>=5: logger.debug('sleep qsize=%d',video_track._queue.qsize()) time.sleep(0.04*video_track._queue.qsize()*0.8) diff --git a/musereal.py b/musereal.py index 33c9ee7..1f4a0ad 100644 --- a/musereal.py +++ b/musereal.py @@ -266,92 +266,17 @@ class MuseReal(BaseReal): recon = self.vae.decode_latents(pred_latents) - def process_frames(self,quit_event,loop=None,audio_track=None,video_track=None): - enable_transition = True # 设置为False禁用过渡效果,True启用 - - if enable_transition: - self.last_speaking = False - self.transition_start = time.time() - self.transition_duration = 0.1 # 过渡时间 - self.last_silent_frame = None # 静音帧缓存 - self.last_speaking_frame = None # 说话帧缓存 - - while 
not quit_event.is_set(): - try: - res_frame,idx,audio_frames = self.res_frame_queue.get(block=True, timeout=1) - except queue.Empty: - continue - - if enable_transition: - # 检测状态变化 - current_speaking = not (audio_frames[0][1]!=0 and audio_frames[1][1]!=0) - if current_speaking != self.last_speaking: - logger.info(f"状态切换:{'说话' if self.last_speaking else '静音'} → {'说话' if current_speaking else '静音'}") - self.transition_start = time.time() - self.last_speaking = current_speaking - - if audio_frames[0][1]!=0 and audio_frames[1][1]!=0: - self.speaking = False - audiotype = audio_frames[0][1] - if self.custom_index.get(audiotype) is not None: - mirindex = self.mirror_index(len(self.custom_img_cycle[audiotype]),self.custom_index[audiotype]) - target_frame = self.custom_img_cycle[audiotype][mirindex] - self.custom_index[audiotype] += 1 - else: - target_frame = self.frame_list_cycle[idx] - - if enable_transition: - # 说话→静音过渡 - if time.time() - self.transition_start < self.transition_duration and self.last_speaking_frame is not None: - alpha = min(1.0, (time.time() - self.transition_start) / self.transition_duration) - combine_frame = cv2.addWeighted(self.last_speaking_frame, 1-alpha, target_frame, alpha, 0) - else: - combine_frame = target_frame - # 缓存静音帧 - self.last_silent_frame = combine_frame.copy() - else: - combine_frame = target_frame - else: - self.speaking = True - bbox = self.coord_list_cycle[idx] - ori_frame = copy.deepcopy(self.frame_list_cycle[idx]) - x1, y1, x2, y2 = bbox - try: - res_frame = cv2.resize(res_frame.astype(np.uint8),(x2-x1,y2-y1)) - except Exception as e: - logger.warning(f"resize error: {e}") - continue - mask = self.mask_list_cycle[idx] - mask_crop_box = self.mask_coords_list_cycle[idx] - - current_frame = get_image_blending(ori_frame,res_frame,bbox,mask,mask_crop_box) - if enable_transition: - # 静音→说话过渡 - if time.time() - self.transition_start < self.transition_duration and self.last_silent_frame is not None: - alpha = min(1.0, (time.time() - self.transition_start) / self.transition_duration) - combine_frame = cv2.addWeighted(self.last_silent_frame, 1-alpha, current_frame, alpha, 0) - else: - combine_frame = current_frame - # 缓存说话帧 - self.last_speaking_frame = combine_frame.copy() - else: - combine_frame = current_frame + def paste_back_frame(self,pred_frame,idx:int): + bbox = self.coord_list_cycle[idx] + ori_frame = copy.deepcopy(self.frame_list_cycle[idx]) + x1, y1, x2, y2 = bbox - image = combine_frame - image[0,:] &= 0xFE - new_frame = VideoFrame.from_ndarray(image, format="bgr24") - asyncio.run_coroutine_threadsafe(video_track._queue.put((new_frame,None)), loop) - self.record_video_data(image) + res_frame = cv2.resize(pred_frame.astype(np.uint8),(x2-x1,y2-y1)) + mask = self.mask_list_cycle[idx] + mask_crop_box = self.mask_coords_list_cycle[idx] - for audio_frame in audio_frames: - frame,type,eventpoint = audio_frame - frame = (frame * 32767).astype(np.int16) - new_frame = AudioFrame(format='s16', layout='mono', samples=frame.shape[0]) - new_frame.planes[0].update(frame.tobytes()) - new_frame.sample_rate=16000 - asyncio.run_coroutine_threadsafe(audio_track._queue.put((new_frame,eventpoint)), loop) - self.record_audio_data(frame) - logger.info('musereal process_frames thread stop') + combine_frame = get_image_blending(ori_frame,res_frame,bbox,mask,mask_crop_box) + return combine_frame def render(self,quit_event,loop=None,audio_track=None,video_track=None): #if self.opt.asr: @@ -382,7 +307,7 @@ class MuseReal(BaseReal): # print(f"------actual avg infer 
fps:{count/totaltime:.4f}") # count=0 # totaltime=0 - if video_track._queue.qsize()>=1.5*self.opt.batch_size: + if video_track and video_track._queue.qsize()>=1.5*self.opt.batch_size: logger.debug('sleep qsize=%d',video_track._queue.qsize()) time.sleep(0.04*video_track._queue.qsize()*0.8) # if video_track._queue.qsize()>=5:
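The `transport=='virtualcam'` output path added above can be exercised in isolation with the short sketch below: the same pyvirtualcam pacing (`send()` followed by `sleep_until_next_frame()`) plus a 16 kHz mono s16 pyaudio output stream like the one `play_audio` opens. This is a hedged, standalone example, not part of the patch; the resolution, fps, placeholder frames, and the default audio output device are assumptions.

```python
# Standalone sketch (assumptions: pyvirtualcam with a working backend such as
# OBS Virtual Camera or v4l2loopback, and pyaudio with a usable default output device).
import numpy as np
import pyaudio
import pyvirtualcam

def stream_demo(width=512, height=512, fps=25, seconds=2):
    # Create the virtual camera once the frame size is known, as process_frames does.
    with pyvirtualcam.Camera(width=width, height=height, fps=fps,
                             fmt=pyvirtualcam.PixelFormat.BGR) as cam:
        pa = pyaudio.PyAudio()
        # 16 kHz mono signed 16-bit, matching the frames pushed into audio_tmp.
        stream = pa.open(rate=16000, channels=1, format=pyaudio.paInt16, output=True)
        samples_per_video_frame = 16000 // fps  # 640 samples per 40 ms video frame
        try:
            for _ in range(fps * seconds):
                frame = np.zeros((height, width, 3), dtype=np.uint8)  # black BGR frame
                cam.send(frame)
                stream.write(np.zeros(samples_per_video_frame, dtype=np.int16).tobytes())
                cam.sleep_until_next_frame()  # pace video output to the target fps
        finally:
            stream.close()
            pa.terminate()

if __name__ == '__main__':
    stream_demo()
```

The camera and audio stream are opened once and reused for every frame, mirroring the lazy `vircam` creation and the long-lived `play_audio` thread in `BaseReal.process_frames`.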