@@ -32,6 +32,9 @@ from threading import Thread, Event
from io import BytesIO
import soundfile as sf
import asyncio
from av import AudioFrame, VideoFrame
import av
from fractions import Fraction
@@ -47,6 +50,23 @@ def read_imgs(img_list):
        frames.append(frame)
    return frames

def play_audio(quit_event, queue):
    # NOTE: the `queue` parameter shadows the queue module, so Empty is
    # imported explicitly for the timeout handling below.
    from queue import Empty
    import pyaudio
    p = pyaudio.PyAudio()
    stream = p.open(
        rate=16000,
        channels=1,
        format=pyaudio.paInt16,  # 16-bit PCM; the original magic number 8 is this constant's value
        output=True,
        output_device_index=1,  # hardcoded output device
    )
    stream.start_stream()
    # while queue.qsize() <= 0:
    #     time.sleep(0.1)
    while not quit_event.is_set():
        try:
            # Block with a timeout so the loop can re-check quit_event instead
            # of hanging forever on an empty queue during shutdown.
            stream.write(queue.get(block=True, timeout=1))
        except Empty:
            continue
    stream.close()
    p.terminate()
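
# play_audio is only used on the virtualcam path: process_frames starts it as a
# daemon thread and feeds it 16 kHz mono s16 PCM through a queue.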

class BaseReal:
    def __init__(self, opt):
        self.opt = opt
@@ -268,6 +288,109 @@ class BaseReal:
        if reinit:
            self.custom_audio_index[audiotype] = 0
            self.custom_index[audiotype] = 0

    def process_frames(self, quit_event, loop=None, audio_track=None, video_track=None):
        enable_transition = False  # False disables the crossfade transition, True enables it

        if enable_transition:
            _last_speaking = False
            _transition_start = time.time()
            _transition_duration = 0.1  # transition duration in seconds
            _last_silent_frame = None    # cached frame from the silent state
            _last_speaking_frame = None  # cached frame from the speaking state
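
        # On a state flip detected below, the frame cached from the previous
        # state is alpha-blended into the new state's frames for
        # _transition_duration seconds to soften the visual jump.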

        if self.opt.transport == 'virtualcam':
            import pyvirtualcam
            vircam = None  # created lazily once the first frame's size is known

            audio_tmp = queue.Queue(maxsize=3000)
            audio_thread = Thread(target=play_audio, args=(quit_event, audio_tmp,), daemon=True, name="pyaudio_stream")
            audio_thread.start()
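
        # res_frame_queue delivers one rendered face frame per tick, together with
        # the index of the matching full frame and the audio chunks paired with it
        # (the code below indexes audio_frames[0] and audio_frames[1], i.e. two
        # chunks per video frame).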
        while not quit_event.is_set():
            try:
                res_frame, idx, audio_frames = self.res_frame_queue.get(block=True, timeout=1)
            except queue.Empty:
                continue

            if enable_transition:
                # detect a speaking/silent state change
                current_speaking = not (audio_frames[0][1] != 0 and audio_frames[1][1] != 0)
                if current_speaking != _last_speaking:
                    logger.info(f"state change: {'speaking' if _last_speaking else 'silent'} -> {'speaking' if current_speaking else 'silent'}")
                    _transition_start = time.time()
                _last_speaking = current_speaking

            if audio_frames[0][1] != 0 and audio_frames[1][1] != 0:  # both chunks are silent, so only the full image is needed
                self.speaking = False
                audiotype = audio_frames[0][1]
                if self.custom_index.get(audiotype) is not None:  # a custom video is configured for this audio type
                    mirindex = self.mirror_index(len(self.custom_img_cycle[audiotype]), self.custom_index[audiotype])
                    target_frame = self.custom_img_cycle[audiotype][mirindex]
                    self.custom_index[audiotype] += 1
                else:
                    target_frame = self.frame_list_cycle[idx]
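                # Note: mirror_index above appears to walk the custom clip back
                # and forth (ping-pong), so the idle video loops without a
                # visible jump cut.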
                if enable_transition:
                    # speaking -> silent transition
                    if time.time() - _transition_start < _transition_duration and _last_speaking_frame is not None:
                        alpha = min(1.0, (time.time() - _transition_start) / _transition_duration)
                        combine_frame = cv2.addWeighted(_last_speaking_frame, 1 - alpha, target_frame, alpha, 0)
                    else:
                        combine_frame = target_frame
                    # cache the silent frame
                    _last_silent_frame = combine_frame.copy()
                else:
                    combine_frame = target_frame
            else:
                self.speaking = True
                try:
                    # paste the rendered face region back into the full frame
                    current_frame = self.paste_back_frame(res_frame, idx)
                except Exception as e:
                    logger.warning(f"paste_back_frame error: {e}")
                    continue
                if enable_transition:
                    # silent -> speaking transition
                    if time.time() - _transition_start < _transition_duration and _last_silent_frame is not None:
                        alpha = min(1.0, (time.time() - _transition_start) / _transition_duration)
                        combine_frame = cv2.addWeighted(_last_silent_frame, 1 - alpha, current_frame, alpha, 0)
                    else:
                        combine_frame = current_frame
                    # cache the speaking frame
                    _last_speaking_frame = combine_frame.copy()
                else:
                    combine_frame = current_frame

            if self.opt.transport == 'virtualcam':
                if vircam is None:
                    height, width, _ = combine_frame.shape
                    vircam = pyvirtualcam.Camera(width=width, height=height, fps=25, fmt=pyvirtualcam.PixelFormat.BGR, print_fps=True)
                vircam.send(combine_frame)
            else:  # webrtc
                image = combine_frame
                image[0, :] &= 0xFE  # clear the least-significant bit of the first pixel row
                new_frame = VideoFrame.from_ndarray(image, format="bgr24")
                asyncio.run_coroutine_threadsafe(video_track._queue.put((new_frame, None)), loop)
            self.record_video_data(combine_frame)
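
            # process_frames runs on a worker thread, so frames are handed to the
            # aiortc track queues with run_coroutine_threadsafe on the event loop
            # instead of awaiting the put directly.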

            for audio_frame in audio_frames:
                frame, type_, eventpoint = audio_frame  # `type_` avoids shadowing the builtin
                frame = (frame * 32767).astype(np.int16)  # float [-1, 1] -> 16-bit PCM

                if self.opt.transport == 'virtualcam':
                    audio_tmp.put(frame.tobytes())  # TODO
                else:  # webrtc
                    new_frame = AudioFrame(format='s16', layout='mono', samples=frame.shape[0])
                    new_frame.planes[0].update(frame.tobytes())
                    new_frame.sample_rate = 16000
                    asyncio.run_coroutine_threadsafe(audio_track._queue.put((new_frame, eventpoint)), loop)
                self.record_audio_data(frame)
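            # On the virtualcam path the PCM bytes queued above are drained by
            # the play_audio thread started before the loop.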
            if self.opt.transport == 'virtualcam':
                vircam.sleep_until_next_frame()  # pace the loop to the camera's 25 fps

        if self.opt.transport == 'virtualcam':
            audio_thread.join()
            if vircam is not None:  # guard: the loop may never have produced a frame
                vircam.close()
        logger.info('basereal process_frames thread stop')

    # def process_custom(self, audiotype: int, idx: int):
    #     if self.curr_state != audiotype:  # switch from inference to custom playback