support Ultralight-Digital-Human

7 months ago · 30c812ef73
parent 07ed664c78
commit 30c812ef73
6 changed files with 164 additions and 64 deletions
--- a/README.md
+++ b/README.md
@ -7,10 +7,11 @@ Real time interactive streaming digital human， realize audio video synchronous
 ## News
 - 2024.12.8 完善多并发，显存不随并发数增加
- 2024.12.21 添加wav2lip、musetalk模型预热，解决第一次推理卡顿问题
+- 2024.12.21 添加wav2lip、musetalk模型预热，解决第一次推理卡顿问题。感谢@heimaojinzhangyz
 - 2024.12.28 添加数字人模型Ultralight-Digital-Human。 感谢@lijihua2017
 ## Features
-1. 支持多种数字人模型: ernerf、musetalk、wav2lip
+1. 支持多种数字人模型: ernerf、musetalk、wav2lip、Ultralight-Digital-Human
 2. 支持声音克隆
 3. 支持数字人说话被打断
 4. 支持全身视频拼接
@ -49,6 +50,7 @@ docker run --rm --env CANDIDATE=$CANDIDATE \
  registry.cn-hangzhou.aliyuncs.com/ossrs/srs:5 \
  objs/srs -c conf/rtc.conf
 ```
 备注：<font color=red>服务端需要开放端口 tcp:8000,8010,1985; udp:8000</font>
 ### 2.2 启动数字人：
@ -62,7 +64,7 @@ export HF_ENDPOINT=https://hf-mirror.com
 ```
 用浏览器打开http://serverip:8010/rtcpushapi.html, 在文本框输入任意文字，提交。数字人播报该段文字  
-备注：服务端需要开放端口 tcp:8000,8010,1985; udp:8000
+
 ## 3. More Usage
 使用说明: <https://livetalking-doc.readthedocs.io/>
@ -81,17 +83,7 @@ docker run --gpus all -it --network=host --rm registry.cn-beijing.aliyuncs.com/c
 可以开放任意端口，不需要单独运行srs服务.
-## 5. 性能分析
+## 5. TODO
 1. 帧率  
 在Tesla T4显卡上测试整体fps为18左右，如果去掉音视频编码推流，帧率在20左右。用4090显卡可以达到40多帧/秒。    
 2. 延时  
 整体延时3s左右  
 （1）tts延时1.7s左右，目前用的edgetts，需要将每句话转完后一次性输入，可以优化tts改成流式输入  
 （2）wav2vec延时0.4s，需要缓存18帧音频做计算 
 （3）srs转发延时，设置srs服务器减少缓冲延时。具体配置可看 https://ossrs.net/lts/zh-cn/docs/v5/doc/low-latency
 ## 6. TODO
 - [x] 添加chatgpt实现数字人对话
 - [x] 声音克隆
 - [x] 数字人静音时用一段视频代替
--- a/app.py
+++ b/app.py
@ -488,7 +488,7 @@ if __name__ == '__main__':
        print(opt)
        model = load_model(opt)
        avatar = load_avatar(opt.avatar_id)
-        warm_up(opt.batch_size,model,160)
+        warm_up(opt.batch_size,avatar,160)
    if opt.transport=='rtmp':
        thread_quit = Event()
--- a/lightasr.py
+++ b/lightasr.py
@ -30,5 +30,5 @@ class LightASR(BaseASR):
        self.feat_queue.put(mel_chunks)
        self.frames = self.frames[-(self.stride_left_size + self.stride_right_size):]
-        print(f"Processing audio costs {(time.time() - start_time) * 1000}ms")
+        #print(f"Processing audio costs {(time.time() - start_time) * 1000}ms")
--- a/lightreal.py
+++ b/lightreal.py
@ -61,39 +61,39 @@ device = 'cuda' if torch.cuda.is_available() else 'cpu'
 print('Using {} for inference.'.format(device))
 def load_model(opt):
    audio_processor = Audio2Feature()
-    model = Model(6, 'hubert').to(device)  # 假设Model是你自定义的类
+    return audio_processor
    model.load_state_dict(torch.load('./models/ultralight.pth'))
    model.eval()
    return model,audio_processor
 def load_avatar(avatar_id):
    avatar_path = f"./data/avatars/{avatar_id}"
-    full_imgs_path = f"{avatar_path}/full_body_img" 
+    full_imgs_path = f"{avatar_path}/full_imgs" 
-    land_marks_path = f"{avatar_path}/landmarks" 
+    face_imgs_path = f"{avatar_path}/face_imgs" 
    coords_path = f"{avatar_path}/coords.pkl" 
    model = Model(6, 'hubert').to(device)  # 假设Model是你自定义的类
    model.load_state_dict(torch.load(f"{avatar_path}/ultralight.pth"))
    with open(coords_path, 'rb') as f:
        coord_list_cycle = pickle.load(f)
    input_img_list = glob.glob(os.path.join(full_imgs_path, '*.[jpJP][pnPN]*[gG]'))
    input_img_list = sorted(input_img_list, key=lambda x: int(os.path.splitext(os.path.basename(x))[0]))
    frame_list_cycle = read_imgs(input_img_list)
    #self.imagecache = ImgCache(len(self.coord_list_cycle),self.full_imgs_path,1000)
-    land_marks_list = glob.glob(os.path.join(land_marks_path, '*.lms'))
+    input_face_list = glob.glob(os.path.join(face_imgs_path, '*.[jpJP][pnPN]*[gG]'))
-    land_marks_list = sorted(land_marks_list, key=lambda x: int(os.path.splitext(os.path.basename(x))[0]))
+    input_face_list = sorted(input_face_list, key=lambda x: int(os.path.splitext(os.path.basename(x))[0]))
-    lms_list_cycle = read_lms(land_marks_list)
+    face_list_cycle = read_imgs(input_face_list)
-    lms_list_cycle = np.array(lms_list_cycle, dtype=np.int32)
+
-    return frame_list_cycle,lms_list_cycle
+    return model.eval(),frame_list_cycle,face_list_cycle,coord_list_cycle
@torch.no_grad()
-def warm_up(batch_size,model,modelres):
+def warm_up(batch_size,avatar,modelres):
    # warm-up: run one dummy batch through the model before real inference
    print('warmup model...')
-    model1, audio_processor = model
+    model,_,_,_ = avatar
    img_batch = torch.ones(batch_size, 6, modelres, modelres).to(device)
    mel_batch = torch.ones(batch_size, 32, 32, 32).to(device)
-    model1(img_batch, mel_batch)
+    model(img_batch, mel_batch)
 def read_imgs(img_list):
    frames = []
@ -147,8 +147,8 @@ def __mirror_index(size, index):
        return size - res - 1 
-def inference(quit_event, batch_size, frame_list_cycle, lms_list_cycle, audio_feat_queue, audio_out_queue, res_frame_queue, model):
+def inference(quit_event, batch_size, face_list_cycle, audio_feat_queue, audio_out_queue, res_frame_queue, model):
-    length = len(lms_list_cycle)
+    length = len(face_list_cycle)
    index = 0
    count = 0
    counttime = 0
@ -177,16 +177,11 @@ def inference(quit_event, batch_size, frame_list_cycle, lms_list_cycle, audio_fe
            for i in range(batch_size):
                idx = __mirror_index(length, index + i)
-                face = frame_list_cycle[idx]
+                #face = face_list_cycle[idx]
-                lms = lms_list_cycle[idx]
+                crop_img = face_list_cycle[idx] #face[ymin:ymax, xmin:xmax]
                xmin, ymin = lms[1][0], lms[52][1]
                xmax = lms[31][0]
                width = xmax - xmin
                ymax = ymin + width
                crop_img = face[ymin:ymax, xmin:xmax]
 #                h, w = crop_img.shape[:2]
-                crop_img = cv2.resize(crop_img, (168, 168), cv2.INTER_AREA)
+                #crop_img = cv2.resize(crop_img, (168, 168), cv2.INTER_AREA)
-                crop_img_ori = crop_img.copy()
+                #crop_img_ori = crop_img.copy()
                img_real_ex = crop_img[4:164, 4:164].copy()
                img_real_ex_ori = img_real_ex.copy()
                img_masked = cv2.rectangle(img_real_ex_ori,(5,5,150,145),(0,0,0),-1)
@ -243,8 +238,8 @@ class LightReal(BaseReal):
        self.idx = 0
        self.res_frame_queue = Queue(self.batch_size*2)  #mp.Queue
        #self.__loadavatar()
-        self.model,audio_processor = model
+        audio_processor = model
-        self.frame_list_cycle,self.lms_list_cycle = avatar
+        self.model,self.frame_list_cycle,self.face_list_cycle,self.coord_list_cycle = avatar
        self.asr = LightASR(opt,self,audio_processor)
        self.asr.warm_up()
@ -277,22 +272,19 @@ class LightReal(BaseReal):
                    #combine_frame = self.imagecache.get_img(idx)
            else:
                self.speaking = True
-                lms = self.lms_list_cycle[idx]
+                bbox = self.coord_list_cycle[idx]
                combine_frame = copy.deepcopy(self.frame_list_cycle[idx])
-                xmin = lms[1][0]
+                x1, y1, x2, y2 = bbox
-                ymin = lms[52][1]
+
-
+                crop_img = self.face_list_cycle[idx]
-                xmax = lms[31][0]
+                crop_img_ori = crop_img.copy()
-                width = xmax - xmin
+                #res_frame = np.array(res_frame, dtype=np.uint8)
-                ymax = ymin + width
+                try:
-                crop_img = combine_frame[ymin:ymax, xmin:xmax]
+                    crop_img_ori[4:164, 4:164] = res_frame.astype(np.uint8)
-                h, w = crop_img.shape[:2]
+                    crop_img_ori = cv2.resize(crop_img_ori, (x2-x1,y2-y1))
-                crop_img_ori = cv2.resize(crop_img, (168, 168), cv2.INTER_AREA).copy()
+                except:
-                #combine_frame = copy.deepcopy(self.imagecache.get_img(idx))
+                    continue
-                res_frame = np.array(res_frame, dtype=np.uint8)
+                combine_frame[y1:y2, x1:x2] = crop_img_ori
                crop_img_ori[4:164, 4:164] = res_frame
                crop_img_ori = cv2.resize(crop_img_ori, (w, h))
                combine_frame[ymin:ymax, xmin:xmax] = crop_img_ori
                #print('blending time:',time.perf_counter()-t)
            new_frame = VideoFrame.from_ndarray(combine_frame, format="bgr24")
@ -319,7 +311,7 @@ class LightReal(BaseReal):
        self.init_customindex()
        process_thread = Thread(target=self.process_frames, args=(quit_event,loop,audio_track,video_track))
        process_thread.start()
-        Thread(target=inference, args=(quit_event,self.batch_size,self.frame_list_cycle,self.lms_list_cycle,self.asr.feat_queue,self.asr.output_queue,self.res_frame_queue,
+        Thread(target=inference, args=(quit_event,self.batch_size,self.face_list_cycle,self.asr.feat_queue,self.asr.output_queue,self.res_frame_queue,
                                           self.model,)).start()  #mp.Process
--- a/ultralight/audio2feature.py
+++ b/ultralight/audio2feature.py
@ -6,8 +6,8 @@ import numpy as np
 class Audio2Feature():
    def __init__(self):
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
-        self.processor = Wav2Vec2Processor.from_pretrained('./models/hubert-large-ls960-ft')
+        self.processor = Wav2Vec2Processor.from_pretrained("facebook/hubert-large-ls960-ft")
-        self.model = HubertModel.from_pretrained('./models/hubert-large-ls960-ft').to(self.device)
+        self.model = HubertModel.from_pretrained("facebook/hubert-large-ls960-ft").to(self.device)
    @torch.no_grad()
--- a/ultralight/genavatar.py
+++ b/ultralight/genavatar.py
@ -0,0 +1,116 @@
 import argparse
 import os
 import cv2
 import torch
 import numpy as np
 import torch.nn as nn
 from torch import optim
 from tqdm import tqdm
 from torch.utils.data import DataLoader
 from unet import Model
 import pickle
 # from unet2 import Model
 # from unet_att import Model
 import time
 def osmakedirs(path_list):
    """Create every directory in *path_list*, including missing parents.

    Directories that already exist are left untouched. A path that exists
    but is not a directory raises ``FileExistsError`` instead of being
    silently skipped, so a bad output location is reported here rather than
    as failed image writes later on.

    Args:
        path_list: iterable of directory paths to create.

    Returns:
        None.
    """
    for path in path_list:
        # exist_ok=True avoids the check-then-create race of
        # "if not os.path.exists(path): os.makedirs(path)".
        os.makedirs(path, exist_ok=True)
 # ---------------------------------------------------------------------------
 # Script setup: parse the command line, derive input/output paths, create the
 # output directories and load the checkpoint onto the GPU.
 #
 # Inputs  : <dataset>/full_body_img/<n>.jpg and <dataset>/landmarks/<n>.lms
 # Outputs : ./results/avatars/<avatar_id>/{full_imgs/, face_imgs/, coords.pkl,
 #           ultralight.pth}
 # NOTE(review): the description string below says 'Train', but nothing in this
 # script trains a model; it only runs inference to build avatar assets.
 # ---------------------------------------------------------------------------
 parser = argparse.ArgumentParser(description='Train',
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
 # Root directory of the dataset (must contain full_body_img/ and landmarks/).
 parser.add_argument('--dataset', type=str, default="")  
 #parser.add_argument('--save_path', type=str, default="")     # end with .mp4 please
 # Path of the checkpoint whose state dict is loaded into the network below.
 parser.add_argument('--checkpoint', type=str, default="")
 # Name of the output sub-directory under ./results/avatars/.
 parser.add_argument('--avatar_id', default='ultralight_avatar1', type=str)
 args = parser.parse_args()
 checkpoint = args.checkpoint
 dataset_dir = args.dataset
 # The trailing slashes matter: file names are appended later by plain string
 # concatenation (img_dir + "0.jpg"), not by os.path.join.
 img_dir = os.path.join(dataset_dir, "full_body_img/")
 lms_dir = os.path.join(dataset_dir, "landmarks/")
 avatar_path = f"./results/avatars/{args.avatar_id}"
 full_imgs_path = f"{avatar_path}/full_imgs" 
 face_imgs_path = f"{avatar_path}/face_imgs" 
 coords_path = f"{avatar_path}/coords.pkl"
 pth_path = f"{avatar_path}/ultralight.pth"
 osmakedirs([avatar_path,full_imgs_path,face_imgs_path])
 # One less than the number of entries in img_dir; used as the iteration count
 # of the frame loop below.
 len_img = len(os.listdir(img_dir)) - 1
 # Reads frame 0 only to take its shape. h and w are reassigned inside the
 # frame loop before they are read, so these values are effectively unused;
 # the lines still fail early (AttributeError on None) if 0.jpg is unreadable.
 exm_img = cv2.imread(img_dir+"0.jpg")
 h, w = exm_img.shape[:2]
 # State for the frame loop: current step direction, current frame index and
 # the accumulated list of face boxes.
 step_stride = 0
 img_idx = 0
 coord_list = []
 # .cuda() is unconditional, so this script needs a CUDA-capable device.
 net = Model(6, 'hubert').cuda()
 net.load_state_dict(torch.load(checkpoint))
 net.eval()
 # Per-frame processing: for each dataset frame, crop the face from landmarks,
 # run the network once with an all-zero audio feature, paste the prediction
 # back into the frame, and write the full frame, the face crop and the crop
 # box to the avatar directory.
 for i in range(len_img):
    # Ping-pong index stepping. Because img_idx starts at 0 and the loop runs
    # exactly len_img times, the first branch never fires here: the loop visits
    # indices 1..len_img in ascending order and frame 0 is never processed.
    if img_idx>len_img - 1:
        step_stride = -1
    if img_idx<1:
        step_stride = 1
    img_idx += step_stride
    img_path = img_dir + str(img_idx)+'.jpg'
    lms_path = lms_dir + str(img_idx)+'.lms'
    img = cv2.imread(img_path)
    # Parse the landmark file: one space-separated row of numbers per line,
    # read as float32 and then truncated to int32 for use as pixel indices.
    lms_list = []
    with open(lms_path, "r") as f:
        lines = f.read().splitlines()
        for line in lines:
            arr = line.split(" ")
            arr = np.array(arr, dtype=np.float32)
            lms_list.append(arr)
    lms = np.array(lms_list, dtype=np.int32)
    # Square face box: horizontal extent from landmark 1 to landmark 31, top
    # edge at landmark 52, height equal to the width.
    xmin = lms[1][0]
    ymin = lms[52][1]
    xmax = lms[31][0]
    width = xmax - xmin
    ymax = ymin + width
    crop_img = img[ymin:ymax, xmin:xmax]
    # Remember the crop's real size so the prediction can be scaled back.
    h, w = crop_img.shape[:2]
    # NOTE(review): the third positional parameter of cv2.resize is `dst`, not
    # `interpolation`, so INTER_AREA is probably not applied as the
    # interpolation mode here. Confirm before changing: the checkpoint was
    # presumably produced with this same preprocessing.
    crop_img = cv2.resize(crop_img, (168, 168), cv2.INTER_AREA)
    crop_img_ori = crop_img.copy()
    # The network works on the inner 160x160 region of the 168x168 crop.
    img_real_ex = crop_img[4:164, 4:164].copy()
    img_real_ex_ori = img_real_ex.copy()
    # Second input: the same region with a filled black rectangle drawn on it.
    img_masked = cv2.rectangle(img_real_ex_ori,(5,5,150,145),(0,0,0),-1)
    # Move the last axis first and scale to [0, 1]
    # (assumes 3-channel HxWxC frames from cv2.imread; TODO confirm).
    img_masked = img_masked.transpose(2,0,1).astype(np.float32)
    img_real_ex = img_real_ex.transpose(2,0,1).astype(np.float32)
    img_real_ex_T = torch.from_numpy(img_real_ex / 255.0)
    img_masked_T = torch.from_numpy(img_masked / 255.0)
    # Stack reference and masked images along the channel axis and add a
    # batch dimension of 1.
    img_concat_T = torch.cat([img_real_ex_T, img_masked_T], axis=0)[None]
    # All-zero audio feature (presumably stands in for silence; verify against
    # the feature extractor used at runtime).
    audio_feat = torch.zeros(1, 32, 32, 32)
    #print('audio_feat:',audio_feat.shape)
    audio_feat = audio_feat.cuda()
    img_concat_T = img_concat_T.cuda()
    #print('img_concat_T:',img_concat_T.shape)
    with torch.no_grad():
        pred = net(img_concat_T, audio_feat)[0]
    # Back to a uint8 image with the channel axis last.
    pred = pred.cpu().numpy().transpose(1,2,0)*255
    pred = np.array(pred, dtype=np.uint8)
    # Paste the prediction into the 168x168 crop, scale the crop back to its
    # original size and write it into the full frame.
    crop_img_ori[4:164, 4:164] = pred
    crop_img_ori = cv2.resize(crop_img_ori, (w, h))
    img[ymin:ymax, xmin:xmax] = crop_img_ori
    # Stamp the text "LiveTalking" near the top-left corner of the full frame.
    cv2.putText(img, "LiveTalking", (10, 20), cv2.FONT_HERSHEY_SIMPLEX, 0.3, (128,128,128), 1)
    cv2.imwrite(f"{full_imgs_path}/{img_idx:08d}.png", img)
    # The saved face image is the resized ORIGINAL crop (crop_img), not the
    # crop containing the network prediction (crop_img_ori).
    cv2.imwrite(f"{face_imgs_path}/{img_idx:08d}.png", crop_img)
    # Box stored as (x1, y1, x2, y2) using the actual crop size.
    coord_list.append((xmin, ymin, xmin+w, ymin+h))
 # Finalise the avatar directory: store the per-frame face boxes collected by
 # the frame loop and place a copy of the checkpoint next to them.
 import shutil  # local import: only needed for the checkpoint copy below

 with open(coords_path, 'wb') as f:
    pickle.dump(coord_list, f)
 # shutil.copy replaces os.system("cp ..."): it works on paths containing
 # spaces or shell metacharacters, does not depend on a POSIX `cp` binary, and
 # raises on failure instead of silently ignoring a non-zero exit status.
 try:
    shutil.copy(checkpoint, pth_path)
 except shutil.SameFileError:
    # The checkpoint is already at the destination; nothing to copy.
    pass
 # ffmpeg -i test_video.mp4 -i test_audio.pcm -c:v libx264 -c:a aac result_test.mp4