diff --git a/README.md b/README.md
index ef67847..a8a86c7 100644
--- a/README.md
+++ b/README.md
@@ -7,10 +7,11 @@ Real time interactive streaming digital human, realize audio video synchronous
 ## News
 - 2024.12.8 Improved multi-concurrency; GPU memory no longer grows with the number of concurrent sessions
-- 2024.12.21 Added model warm-up for wav2lip and musetalk to fix the stutter on the first inference
+- 2024.12.21 Added model warm-up for wav2lip and musetalk to fix the stutter on the first inference. Thanks to @heimaojinzhangyz
+- 2024.12.28 Added the Ultralight-Digital-Human digital human model. Thanks to @lijihua2017
 
 ## Features
-1. Supports multiple digital human models: ernerf, musetalk, wav2lip
+1. Supports multiple digital human models: ernerf, musetalk, wav2lip, Ultralight-Digital-Human
 2. Voice cloning
 3. The digital human can be interrupted while speaking
 4. Full-body video stitching
@@ -49,6 +50,7 @@ docker run --rm --env CANDIDATE=$CANDIDATE \
   registry.cn-hangzhou.aliyuncs.com/ossrs/srs:5 \
   objs/srs -c conf/rtc.conf
 ```
+Note: the server must open ports tcp:8000,8010,1985 and udp:8000
 
 ### 2.2 Start the digital human:
 
@@ -62,7 +64,7 @@ export HF_ENDPOINT=https://hf-mirror.com
 ```
 Open http://serverip:8010/rtcpushapi.html in a browser, type any text into the text box and submit it; the digital human reads the text aloud.
-Note: the server must open ports tcp:8000,8010,1985 and udp:8000
+
 
 ## 3. More Usage
 Usage notes:
@@ -81,17 +83,7 @@ docker run --gpus all -it --network=host --rm registry.cn-beijing.aliyuncs.com/c
 
 Any port can be used; there is no need to run the srs service separately.
 
-## 5. Performance
-1. Frame rate
-Tested on a Tesla T4 the overall fps is about 18; without audio/video encoding and streaming it is about 20. A 4090 reaches 40+ fps.
-2. Latency
-Overall latency is about 3s
-(1) TTS latency is about 1.7s. edgetts is currently used and each sentence must be fully synthesized before being fed in; this could be improved by switching to streaming TTS input
-(2) wav2vec latency is about 0.4s; 18 audio frames have to be buffered for the computation
-(3) srs forwarding latency; configure the srs server to reduce buffering. See https://ossrs.net/lts/zh-cn/docs/v5/doc/low-latency for details
-
-
-## 6. TODO
+## 5. TODO
 - [x] Add chatgpt for digital human dialogue
 - [x] Voice cloning
 - [x] Play a video clip while the digital human is silent
diff --git a/app.py b/app.py
index f665c7c..a87cd6d 100644
--- a/app.py
+++ b/app.py
@@ -488,7 +488,7 @@ if __name__ == '__main__':
     print(opt)
     model = load_model(opt)
     avatar = load_avatar(opt.avatar_id)
-    warm_up(opt.batch_size,model,160)
+    warm_up(opt.batch_size,avatar,160)
 
     if opt.transport=='rtmp':
         thread_quit = Event()
diff --git a/lightasr.py b/lightasr.py
index 08020f3..b3df50f 100644
--- a/lightasr.py
+++ b/lightasr.py
@@ -30,5 +30,5 @@ class LightASR(BaseASR):
         self.feat_queue.put(mel_chunks)
 
         self.frames = self.frames[-(self.stride_left_size + self.stride_right_size):]
-        print(f"Processing audio costs {(time.time() - start_time) * 1000}ms")
+        #print(f"Processing audio costs {(time.time() - start_time) * 1000}ms")
 
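The app.py hunk above only swaps the second argument of warm_up from model to avatar; the reason is the new load_model/load_avatar split in lightreal.py shown below, where the Ultralight network now travels inside the avatar tuple. A minimal sketch of the resulting startup order, assuming lightreal's helpers are imported directly and using an illustrative avatar id and batch size (app.py wires this up through its parsed options instead):

```python
# Sketch of the new startup order; the avatar id and batch size are illustrative.
from lightreal import load_model, load_avatar, warm_up

batch_size = 16
audio_processor = load_model(None)          # load_model(opt) now returns only the Audio2Feature processor
avatar = load_avatar("ultralight_avatar1")  # (model, frame_list_cycle, face_list_cycle, coord_list_cycle)
warm_up(batch_size, avatar, 160)            # unpacks the model from the avatar tuple and runs a dummy forward pass
```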
diff --git a/lightreal.py b/lightreal.py
index 61e02c4..443424f 100644
--- a/lightreal.py
+++ b/lightreal.py
@@ -61,39 +61,39 @@ device = 'cuda' if torch.cuda.is_available() else 'cpu'
 print('Using {} for inference.'.format(device))
 
-
 def load_model(opt):
     audio_processor = Audio2Feature()
-    model = Model(6, 'hubert').to(device)  # assumes Model is your custom class
-    model.load_state_dict(torch.load('./models/ultralight.pth'))
-    model.eval()
-
-    return model,audio_processor
+    return audio_processor
 
 def load_avatar(avatar_id):
     avatar_path = f"./data/avatars/{avatar_id}"
-    full_imgs_path = f"{avatar_path}/full_body_img"
-    land_marks_path = f"{avatar_path}/landmarks"
+    full_imgs_path = f"{avatar_path}/full_imgs"
+    face_imgs_path = f"{avatar_path}/face_imgs"
+    coords_path = f"{avatar_path}/coords.pkl"
+
+    model = Model(6, 'hubert').to(device)  # assumes Model is your custom class
+    model.load_state_dict(torch.load(f"{avatar_path}/ultralight.pth"))
+
+    with open(coords_path, 'rb') as f:
+        coord_list_cycle = pickle.load(f)
     input_img_list = glob.glob(os.path.join(full_imgs_path, '*.[jpJP][pnPN]*[gG]'))
     input_img_list = sorted(input_img_list, key=lambda x: int(os.path.splitext(os.path.basename(x))[0]))
     frame_list_cycle = read_imgs(input_img_list)
     #self.imagecache = ImgCache(len(self.coord_list_cycle),self.full_imgs_path,1000)
-    land_marks_list = glob.glob(os.path.join(land_marks_path, '*.lms'))
-    land_marks_list = sorted(land_marks_list, key=lambda x: int(os.path.splitext(os.path.basename(x))[0]))
-    lms_list_cycle = read_lms(land_marks_list)
-    lms_list_cycle = np.array(lms_list_cycle, dtype=np.int32)
-    return frame_list_cycle,lms_list_cycle
+    input_face_list = glob.glob(os.path.join(face_imgs_path, '*.[jpJP][pnPN]*[gG]'))
+    input_face_list = sorted(input_face_list, key=lambda x: int(os.path.splitext(os.path.basename(x))[0]))
+    face_list_cycle = read_imgs(input_face_list)
+
+    return model.eval(),frame_list_cycle,face_list_cycle,coord_list_cycle
 
 @torch.no_grad()
-def warm_up(batch_size,model,modelres):
-    # warm-up function
+def warm_up(batch_size,avatar,modelres):
     print('warmup model...')
-    model1, audio_processor = model
+    model,_,_,_ = avatar
     img_batch = torch.ones(batch_size, 6, modelres, modelres).to(device)
     mel_batch = torch.ones(batch_size, 32, 32, 32).to(device)
-    model1(img_batch, mel_batch)
+    model(img_batch, mel_batch)
 
 def read_imgs(img_list):
     frames = []
@@ -147,8 +147,8 @@ def __mirror_index(size, index):
         return size - res - 1
 
-def inference(quit_event, batch_size, frame_list_cycle, lms_list_cycle, audio_feat_queue, audio_out_queue, res_frame_queue, model):
-    length = len(lms_list_cycle)
+def inference(quit_event, batch_size, face_list_cycle, audio_feat_queue, audio_out_queue, res_frame_queue, model):
+    length = len(face_list_cycle)
     index = 0
     count = 0
     counttime = 0
@@ -177,16 +177,11 @@ def inference(quit_event, batch_size, frame_list_cycle, lms_list_cycle, audio_fe
 
             for i in range(batch_size):
                 idx = __mirror_index(length, index + i)
-                face = frame_list_cycle[idx]
-                lms = lms_list_cycle[idx]
-                xmin, ymin = lms[1][0], lms[52][1]
-                xmax = lms[31][0]
-                width = xmax - xmin
-                ymax = ymin + width
-                crop_img = face[ymin:ymax, xmin:xmax]
+                #face = face_list_cycle[idx]
+                crop_img = face_list_cycle[idx]  #face[ymin:ymax, xmin:xmax]
                 # h, w = crop_img.shape[:2]
-                crop_img = cv2.resize(crop_img, (168, 168), cv2.INTER_AREA)
-                crop_img_ori = crop_img.copy()
+                #crop_img = cv2.resize(crop_img, (168, 168), cv2.INTER_AREA)
+                #crop_img_ori = crop_img.copy()
                 img_real_ex = crop_img[4:164, 4:164].copy()
                 img_real_ex_ori = img_real_ex.copy()
                 img_masked = cv2.rectangle(img_real_ex_ori,(5,5,150,145),(0,0,0),-1)
@@ -243,8 +238,8 @@ class LightReal(BaseReal):
         self.idx = 0
         self.res_frame_queue = Queue(self.batch_size*2)  #mp.Queue
         #self.__loadavatar()
-        self.model,audio_processor = model
-        self.frame_list_cycle,self.lms_list_cycle = avatar
+        audio_processor = model
+        self.model,self.frame_list_cycle,self.face_list_cycle,self.coord_list_cycle = avatar
 
         self.asr = LightASR(opt,self,audio_processor)
         self.asr.warm_up()
@@ -277,22 +272,19 @@ class LightReal(BaseReal):
                 #combine_frame = self.imagecache.get_img(idx)
             else:
                 self.speaking = True
-                lms = self.lms_list_cycle[idx]
+                bbox = self.coord_list_cycle[idx]
                 combine_frame = copy.deepcopy(self.frame_list_cycle[idx])
-                xmin = lms[1][0]
-                ymin = lms[52][1]
-
-                xmax = lms[31][0]
-                width = xmax - xmin
-                ymax = ymin + width
-                crop_img = combine_frame[ymin:ymax, xmin:xmax]
-                h, w = crop_img.shape[:2]
-                crop_img_ori = cv2.resize(crop_img, (168, 168), cv2.INTER_AREA).copy()
-                #combine_frame = copy.deepcopy(self.imagecache.get_img(idx))
-                res_frame = np.array(res_frame, dtype=np.uint8)
-                crop_img_ori[4:164, 4:164] = res_frame
-                crop_img_ori = cv2.resize(crop_img_ori, (w, h))
-                combine_frame[ymin:ymax, xmin:xmax] = crop_img_ori
+                x1, y1, x2, y2 = bbox
+
+                crop_img = self.face_list_cycle[idx]
+                crop_img_ori = crop_img.copy()
+                #res_frame = np.array(res_frame, dtype=np.uint8)
+                try:
+                    crop_img_ori[4:164, 4:164] = res_frame.astype(np.uint8)
+                    crop_img_ori = cv2.resize(crop_img_ori, (x2-x1,y2-y1))
+                except:
+                    continue
+                combine_frame[y1:y2, x1:x2] = crop_img_ori
                 #print('blending time:',time.perf_counter()-t)
 
             new_frame = VideoFrame.from_ndarray(combine_frame, format="bgr24")
@@ -319,7 +311,7 @@ class LightReal(BaseReal):
         self.init_customindex()
         process_thread = Thread(target=self.process_frames, args=(quit_event,loop,audio_track,video_track))
         process_thread.start()
-        Thread(target=inference, args=(quit_event,self.batch_size,self.frame_list_cycle,self.lms_list_cycle,self.asr.feat_queue,self.asr.output_queue,self.res_frame_queue,
+        Thread(target=inference, args=(quit_event,self.batch_size,self.face_list_cycle,self.asr.feat_queue,self.asr.output_queue,self.res_frame_queue,
                                        self.model,)).start()  #mp.Process
 
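With the per-frame landmark math removed, process_frames above only has to paste the 160x160 prediction back into the cached 168x168 face crop and then into the full frame at the bbox read from coords.pkl. A self-contained sketch of that paste-back step, using zero-filled stand-ins for the frame, the face crop, the bbox and the network output:

```python
import copy

import cv2
import numpy as np

# Stand-ins: one cached full frame, its 168x168 face crop, the bbox saved by
# genavatar.py in coords.pkl, and a fake 160x160 network output.
frame = np.zeros((720, 1280, 3), dtype=np.uint8)
face_crop = np.zeros((168, 168, 3), dtype=np.uint8)
x1, y1, x2, y2 = 500, 200, 668, 368
res_frame = np.full((160, 160, 3), 127, dtype=np.float32)

combine_frame = copy.deepcopy(frame)
crop_img_ori = face_crop.copy()
crop_img_ori[4:164, 4:164] = res_frame.astype(np.uint8)      # put the prediction back into the crop
crop_img_ori = cv2.resize(crop_img_ori, (x2 - x1, y2 - y1))  # scale the crop to the original face box
combine_frame[y1:y2, x1:x2] = crop_img_ori                   # blend it into the full-body frame
```

Precomputing the crops and boxes offline in genavatar.py (added at the end of this diff) is what lets the runtime path drop the landmark files entirely.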
diff --git a/ultralight/audio2feature.py b/ultralight/audio2feature.py
index dae7b08..0cc777f 100644
--- a/ultralight/audio2feature.py
+++ b/ultralight/audio2feature.py
@@ -6,8 +6,8 @@ import numpy as np
 class Audio2Feature():
     def __init__(self):
         self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
-        self.processor = Wav2Vec2Processor.from_pretrained('./models/hubert-large-ls960-ft')
-        self.model = HubertModel.from_pretrained('./models/hubert-large-ls960-ft').to(self.device)
+        self.processor = Wav2Vec2Processor.from_pretrained("facebook/hubert-large-ls960-ft")
+        self.model = HubertModel.from_pretrained("facebook/hubert-large-ls960-ft").to(self.device)
 
     @torch.no_grad()
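Audio2Feature now downloads hubert-large-ls960-ft from the Hugging Face Hub instead of reading a local ./models copy, which is why the README suggests setting HF_ENDPOINT=https://hf-mirror.com. A minimal sketch of pulling HuBERT features from 16 kHz audio with that checkpoint (one second of silence stands in for real audio; the feature slicing Audio2Feature does afterwards is not shown):

```python
import numpy as np
import torch
from transformers import HubertModel, Wav2Vec2Processor

device = "cuda" if torch.cuda.is_available() else "cpu"
processor = Wav2Vec2Processor.from_pretrained("facebook/hubert-large-ls960-ft")
model = HubertModel.from_pretrained("facebook/hubert-large-ls960-ft").to(device)

speech = np.zeros(16000, dtype=np.float32)  # one second of 16 kHz audio as a stand-in
input_values = processor(speech, sampling_rate=16000, return_tensors="pt").input_values.to(device)

with torch.no_grad():
    hidden = model(input_values).last_hidden_state  # roughly (1, 49, 1024) for one second
print(hidden.shape)
```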
diff --git a/ultralight/genavatar.py b/ultralight/genavatar.py
new file mode 100644
index 0000000..6d7c8de
--- /dev/null
+++ b/ultralight/genavatar.py
@@ -0,0 +1,116 @@
+import argparse
+import os
+import cv2
+import torch
+import numpy as np
+import torch.nn as nn
+from torch import optim
+from tqdm import tqdm
+from torch.utils.data import DataLoader
+from unet import Model
+import pickle
+# from unet2 import Model
+# from unet_att import Model
+
+import time
+def osmakedirs(path_list):
+    for path in path_list:
+        os.makedirs(path) if not os.path.exists(path) else None
+
+parser = argparse.ArgumentParser(description='Train',
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+
+parser.add_argument('--dataset', type=str, default="")
+#parser.add_argument('--save_path', type=str, default="")     # end with .mp4 please
+parser.add_argument('--checkpoint', type=str, default="")
+parser.add_argument('--avatar_id', default='ultralight_avatar1', type=str)
+args = parser.parse_args()
+
+checkpoint = args.checkpoint
+dataset_dir = args.dataset
+
+img_dir = os.path.join(dataset_dir, "full_body_img/")
+lms_dir = os.path.join(dataset_dir, "landmarks/")
+
+avatar_path = f"./results/avatars/{args.avatar_id}"
+full_imgs_path = f"{avatar_path}/full_imgs"
+face_imgs_path = f"{avatar_path}/face_imgs"
+coords_path = f"{avatar_path}/coords.pkl"
+pth_path = f"{avatar_path}/ultralight.pth"
+osmakedirs([avatar_path,full_imgs_path,face_imgs_path])
+
+len_img = len(os.listdir(img_dir)) - 1
+exm_img = cv2.imread(img_dir+"0.jpg")
+h, w = exm_img.shape[:2]
+
+step_stride = 0
+img_idx = 0
+coord_list = []
+
+net = Model(6, 'hubert').cuda()
+net.load_state_dict(torch.load(checkpoint))
+net.eval()
+for i in range(len_img):
+    if img_idx>len_img - 1:
+        step_stride = -1
+    if img_idx<1:
+        step_stride = 1
+    img_idx += step_stride
+    img_path = img_dir + str(img_idx)+'.jpg'
+    lms_path = lms_dir + str(img_idx)+'.lms'
+
+    img = cv2.imread(img_path)
+    lms_list = []
+    with open(lms_path, "r") as f:
+        lines = f.read().splitlines()
+        for line in lines:
+            arr = line.split(" ")
+            arr = np.array(arr, dtype=np.float32)
+            lms_list.append(arr)
+    lms = np.array(lms_list, dtype=np.int32)
+    xmin = lms[1][0]
+    ymin = lms[52][1]
+
+    xmax = lms[31][0]
+    width = xmax - xmin
+    ymax = ymin + width
+    crop_img = img[ymin:ymax, xmin:xmax]
+    h, w = crop_img.shape[:2]
+    crop_img = cv2.resize(crop_img, (168, 168), cv2.INTER_AREA)
+    crop_img_ori = crop_img.copy()
+    img_real_ex = crop_img[4:164, 4:164].copy()
+    img_real_ex_ori = img_real_ex.copy()
+    img_masked = cv2.rectangle(img_real_ex_ori,(5,5,150,145),(0,0,0),-1)
+
+    img_masked = img_masked.transpose(2,0,1).astype(np.float32)
+    img_real_ex = img_real_ex.transpose(2,0,1).astype(np.float32)
+
+    img_real_ex_T = torch.from_numpy(img_real_ex / 255.0)
+    img_masked_T = torch.from_numpy(img_masked / 255.0)
+    img_concat_T = torch.cat([img_real_ex_T, img_masked_T], axis=0)[None]
+
+    audio_feat = torch.zeros(1, 32, 32, 32)
+    #print('audio_feat:',audio_feat.shape)
+    audio_feat = audio_feat.cuda()
+    img_concat_T = img_concat_T.cuda()
+    #print('img_concat_T:',img_concat_T.shape)
+
+    with torch.no_grad():
+        pred = net(img_concat_T, audio_feat)[0]
+
+    pred = pred.cpu().numpy().transpose(1,2,0)*255
+    pred = np.array(pred, dtype=np.uint8)
+    crop_img_ori[4:164, 4:164] = pred
+    crop_img_ori = cv2.resize(crop_img_ori, (w, h))
+    img[ymin:ymax, xmin:xmax] = crop_img_ori
+
+    cv2.putText(img, "LiveTalking", (10, 20), cv2.FONT_HERSHEY_SIMPLEX, 0.3, (128,128,128), 1)
+    cv2.imwrite(f"{full_imgs_path}/{img_idx:08d}.png", img)
+    cv2.imwrite(f"{face_imgs_path}/{img_idx:08d}.png", crop_img)
+    coord_list.append((xmin, ymin, xmin+w, ymin+h))
+
+with open(coords_path, 'wb') as f:
+    pickle.dump(coord_list, f)
+os.system(f"cp {checkpoint} {pth_path}")
+
+# ffmpeg -i test_video.mp4 -i test_audio.pcm -c:v libx264 -c:a aac result_test.mp4
\ No newline at end of file
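genavatar.py writes the new avatar assets (full_imgs, face_imgs, coords.pkl and a copy of the checkpoint as ultralight.pth) under ./results/avatars/<avatar_id>, while load_avatar in lightreal.py reads from ./data/avatars/<avatar_id>, so the generated folder presumably has to be copied or linked there. A small, hypothetical sanity check (not part of the patch) for a copied avatar folder, mirroring what load_avatar expects:

```python
import os
import pickle


def check_avatar(avatar_id: str) -> None:
    """Hypothetical helper: verify an Ultralight avatar folder matches what load_avatar reads."""
    avatar_path = f"./data/avatars/{avatar_id}"
    full_imgs = os.listdir(f"{avatar_path}/full_imgs")
    face_imgs = os.listdir(f"{avatar_path}/face_imgs")
    with open(f"{avatar_path}/coords.pkl", "rb") as f:
        coords = pickle.load(f)
    assert os.path.isfile(f"{avatar_path}/ultralight.pth"), "missing ultralight.pth"
    # one full frame, one face crop and one bbox per index
    assert len(full_imgs) == len(face_imgs) == len(coords), "frame/crop/coord counts differ"


check_avatar("ultralight_avatar1")
```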