support Ultralight-Digital-Human

main
lipku 7 months ago
parent 07ed664c78
commit 30c812ef73

@ -7,10 +7,11 @@ Real time interactive streaming digital human realize audio video synchronous
## News
- 2024.12.8 Improved multi-session concurrency; GPU memory no longer grows with the number of concurrent sessions
- 2024.12.21 Added wav2lip and musetalk model warm-up to fix the stutter on the first inference
- 2024.12.21 Added wav2lip and musetalk model warm-up to fix the stutter on the first inference. Thanks to @heimaojinzhangyz
- 2024.12.28 Added the Ultralight-Digital-Human model. Thanks to @lijihua2017
## Features
1. Supports multiple digital human models: ernerf, musetalk, wav2lip
1. Supports multiple digital human models: ernerf, musetalk, wav2lip, Ultralight-Digital-Human
2. Supports voice cloning
3. Supports interrupting the digital human while it is speaking
4. Supports full-body video stitching
@ -49,6 +50,7 @@ docker run --rm --env CANDIDATE=$CANDIDATE \
registry.cn-hangzhou.aliyuncs.com/ossrs/srs:5 \
objs/srs -c conf/rtc.conf
```
Note: <font color=red>the server must open ports tcp:8000,8010,1985; udp:8000</font>
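A quick way to verify the TCP ports from another machine (a minimal sketch; `serverip` is a placeholder, and udp:8000 cannot be probed this way):

```python
import socket

# probe the TCP ports listed above; replace "serverip" with your server address
for port in (8000, 8010, 1985):
    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    s.settimeout(2)
    status = "open" if s.connect_ex(("serverip", port)) == 0 else "closed/unreachable"
    print(f"tcp {port}: {status}")
    s.close()
```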
### 2.2 Start the digital human:
@ -62,7 +64,7 @@ export HF_ENDPOINT=https://hf-mirror.com
```
Open http://serverip:8010/rtcpushapi.html in a browser, type any text into the text box, and submit it; the digital human will read the text aloud (the same can be done from a script, see the sketch below)
Note: the server must open ports tcp:8000,8010,1985; udp:8000
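For scripting the broadcast instead of using the web page, here is a minimal sketch. The `/human` endpoint and its JSON fields (`text`, `type`, `sessionid`) are assumptions based on what rtcpushapi.html submits; check the page source if your version differs.

```python
import requests

# hypothetical payload mirroring the rtcpushapi.html form; field names are assumptions
payload = {"sessionid": 0, "type": "echo", "text": "Hello, this is a test broadcast."}
resp = requests.post("http://serverip:8010/human", json=payload, timeout=10)
print(resp.status_code, resp.text)
```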
## 3. More Usage
Usage guide: <https://livetalking-doc.readthedocs.io/>
@ -81,17 +83,7 @@ docker run --gpus all -it --network=host --rm registry.cn-beijing.aliyuncs.com/c
Any port can be opened; there is no need to run the srs service separately.
## 5. Performance Analysis
1. Frame rate
Tested on a Tesla T4, the overall fps is around 18; with audio/video encoding and streaming removed, the frame rate is around 20. A 4090 can reach 40+ frames per second.
2. Latency
Overall latency is about 3s:
(1) TTS latency is about 1.7s. The current edgetts needs each sentence to be fully synthesized before it is fed in at once; this can be optimized by switching the TTS to streaming input (see the sketch after this list).
(2) wav2vec latency is about 0.4s; 18 frames of audio must be buffered for the computation.
(3) srs forwarding latency: configure the srs server to reduce buffering latency. See https://ossrs.net/lts/zh-cn/docs/v5/doc/low-latency for details.
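For item (1), a sketch of streaming synthesis with edge-tts: audio chunks are consumed as soon as they arrive instead of waiting for the whole sentence. This only illustrates the idea; wiring the chunks into the player/ASR pipeline is project-specific.

```python
import asyncio
import edge_tts

async def stream_tts(text, voice="zh-CN-XiaoxiaoNeural"):
    communicate = edge_tts.Communicate(text, voice)
    async for chunk in communicate.stream():
        if chunk["type"] == "audio":
            # hand each audio chunk downstream as soon as it arrives
            yield chunk["data"]

async def main():
    total = 0
    async for data in stream_tts("Streaming synthesis reduces first-chunk latency."):
        total += len(data)
    print(f"received {total} bytes of audio")

asyncio.run(main())
```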
## 6. TODO
## 5. TODO
- [x] Add chatgpt for digital human conversation
- [x] Voice cloning
- [x] Play a video clip when the digital human is silent

@ -488,7 +488,7 @@ if __name__ == '__main__':
print(opt)
model = load_model(opt)
avatar = load_avatar(opt.avatar_id)
warm_up(opt.batch_size,model,160)
warm_up(opt.batch_size,avatar,160)
if opt.transport=='rtmp':
thread_quit = Event()

@ -30,5 +30,5 @@ class LightASR(BaseASR):
self.feat_queue.put(mel_chunks)
self.frames = self.frames[-(self.stride_left_size + self.stride_right_size):]
print(f"Processing audio costs {(time.time() - start_time) * 1000}ms")
#print(f"Processing audio costs {(time.time() - start_time) * 1000}ms")

@ -61,39 +61,39 @@ device = 'cuda' if torch.cuda.is_available() else 'cpu'
print('Using {} for inference.'.format(device))
def load_model(opt):
audio_processor = Audio2Feature()
model = Model(6, 'hubert').to(device) # assuming Model is your custom class
model.load_state_dict(torch.load('./models/ultralight.pth'))
model.eval()
return model,audio_processor
return audio_processor
def load_avatar(avatar_id):
avatar_path = f"./data/avatars/{avatar_id}"
full_imgs_path = f"{avatar_path}/full_body_img"
land_marks_path = f"{avatar_path}/landmarks"
full_imgs_path = f"{avatar_path}/full_imgs"
face_imgs_path = f"{avatar_path}/face_imgs"
coords_path = f"{avatar_path}/coords.pkl"
model = Model(6, 'hubert').to(device) # assuming Model is your custom class
model.load_state_dict(torch.load(f"{avatar_path}/ultralight.pth"))
with open(coords_path, 'rb') as f:
coord_list_cycle = pickle.load(f)
input_img_list = glob.glob(os.path.join(full_imgs_path, '*.[jpJP][pnPN]*[gG]'))
input_img_list = sorted(input_img_list, key=lambda x: int(os.path.splitext(os.path.basename(x))[0]))
frame_list_cycle = read_imgs(input_img_list)
#self.imagecache = ImgCache(len(self.coord_list_cycle),self.full_imgs_path,1000)
land_marks_list = glob.glob(os.path.join(land_marks_path, '*.lms'))
land_marks_list = sorted(land_marks_list, key=lambda x: int(os.path.splitext(os.path.basename(x))[0]))
lms_list_cycle = read_lms(land_marks_list)
lms_list_cycle = np.array(lms_list_cycle, dtype=np.int32)
return frame_list_cycle,lms_list_cycle
input_face_list = glob.glob(os.path.join(face_imgs_path, '*.[jpJP][pnPN]*[gG]'))
input_face_list = sorted(input_face_list, key=lambda x: int(os.path.splitext(os.path.basename(x))[0]))
face_list_cycle = read_imgs(input_face_list)
return model.eval(),frame_list_cycle,face_list_cycle,coord_list_cycle
@torch.no_grad()
def warm_up(batch_size,model,modelres):
# warm up the model
def warm_up(batch_size,avatar,modelres):
print('warmup model...')
model1, audio_processor = model
model,_,_,_ = avatar
img_batch = torch.ones(batch_size, 6, modelres, modelres).to(device)
mel_batch = torch.ones(batch_size, 32, 32, 32).to(device)
model1(img_batch, mel_batch)
model(img_batch, mel_batch)
def read_imgs(img_list):
frames = []
@ -147,8 +147,8 @@ def __mirror_index(size, index):
return size - res - 1
def inference(quit_event, batch_size, frame_list_cycle, lms_list_cycle, audio_feat_queue, audio_out_queue, res_frame_queue, model):
length = len(lms_list_cycle)
def inference(quit_event, batch_size, face_list_cycle, audio_feat_queue, audio_out_queue, res_frame_queue, model):
length = len(face_list_cycle)
index = 0
count = 0
counttime = 0
@ -177,16 +177,11 @@ def inference(quit_event, batch_size, frame_list_cycle, lms_list_cycle, audio_fe
for i in range(batch_size):
idx = __mirror_index(length, index + i)
face = frame_list_cycle[idx]
lms = lms_list_cycle[idx]
xmin, ymin = lms[1][0], lms[52][1]
xmax = lms[31][0]
width = xmax - xmin
ymax = ymin + width
crop_img = face[ymin:ymax, xmin:xmax]
#face = face_list_cycle[idx]
crop_img = face_list_cycle[idx] #face[ymin:ymax, xmin:xmax]
# h, w = crop_img.shape[:2]
crop_img = cv2.resize(crop_img, (168, 168), cv2.INTER_AREA)
crop_img_ori = crop_img.copy()
#crop_img = cv2.resize(crop_img, (168, 168), cv2.INTER_AREA)
#crop_img_ori = crop_img.copy()
img_real_ex = crop_img[4:164, 4:164].copy()
img_real_ex_ori = img_real_ex.copy()
img_masked = cv2.rectangle(img_real_ex_ori,(5,5,150,145),(0,0,0),-1)
@ -243,8 +238,8 @@ class LightReal(BaseReal):
self.idx = 0
self.res_frame_queue = Queue(self.batch_size*2) #mp.Queue
#self.__loadavatar()
self.model,audio_processor = model
self.frame_list_cycle,self.lms_list_cycle = avatar
audio_processor = model
self.model,self.frame_list_cycle,self.face_list_cycle,self.coord_list_cycle = avatar
self.asr = LightASR(opt,self,audio_processor)
self.asr.warm_up()
@ -277,22 +272,19 @@ class LightReal(BaseReal):
#combine_frame = self.imagecache.get_img(idx)
else:
self.speaking = True
lms = self.lms_list_cycle[idx]
bbox = self.coord_list_cycle[idx]
combine_frame = copy.deepcopy(self.frame_list_cycle[idx])
xmin = lms[1][0]
ymin = lms[52][1]
xmax = lms[31][0]
width = xmax - xmin
ymax = ymin + width
crop_img = combine_frame[ymin:ymax, xmin:xmax]
h, w = crop_img.shape[:2]
crop_img_ori = cv2.resize(crop_img, (168, 168), cv2.INTER_AREA).copy()
#combine_frame = copy.deepcopy(self.imagecache.get_img(idx))
res_frame = np.array(res_frame, dtype=np.uint8)
crop_img_ori[4:164, 4:164] = res_frame
crop_img_ori = cv2.resize(crop_img_ori, (w, h))
combine_frame[ymin:ymax, xmin:xmax] = crop_img_ori
x1, y1, x2, y2 = bbox
crop_img = self.face_list_cycle[idx]
crop_img_ori = crop_img.copy()
#res_frame = np.array(res_frame, dtype=np.uint8)
try:
crop_img_ori[4:164, 4:164] = res_frame.astype(np.uint8)
crop_img_ori = cv2.resize(crop_img_ori, (x2-x1,y2-y1))
except:
continue
combine_frame[y1:y2, x1:x2] = crop_img_ori
#print('blending time:',time.perf_counter()-t)
new_frame = VideoFrame.from_ndarray(combine_frame, format="bgr24")
@ -319,7 +311,7 @@ class LightReal(BaseReal):
self.init_customindex()
process_thread = Thread(target=self.process_frames, args=(quit_event,loop,audio_track,video_track))
process_thread.start()
Thread(target=inference, args=(quit_event,self.batch_size,self.frame_list_cycle,self.lms_list_cycle,self.asr.feat_queue,self.asr.output_queue,self.res_frame_queue,
Thread(target=inference, args=(quit_event,self.batch_size,self.face_list_cycle,self.asr.feat_queue,self.asr.output_queue,self.res_frame_queue,
self.model,)).start() #mp.Process

@ -6,8 +6,8 @@ import numpy as np
class Audio2Feature():
def __init__(self):
self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
self.processor = Wav2Vec2Processor.from_pretrained('./models/hubert-large-ls960-ft')
self.model = HubertModel.from_pretrained('./models/hubert-large-ls960-ft').to(self.device)
self.processor = Wav2Vec2Processor.from_pretrained("facebook/hubert-large-ls960-ft")
self.model = HubertModel.from_pretrained("facebook/hubert-large-ls960-ft").to(self.device)
@torch.no_grad()
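The hub model `facebook/hubert-large-ls960-ft` replaces the local checkpoint here. A minimal sketch of how such a HuBERT model turns 16 kHz audio into per-frame hidden features (the exact chunking and windowing used by Audio2Feature may differ):

```python
import numpy as np
import torch
from transformers import HubertModel, Wav2Vec2Processor

device = "cuda" if torch.cuda.is_available() else "cpu"
processor = Wav2Vec2Processor.from_pretrained("facebook/hubert-large-ls960-ft")
model = HubertModel.from_pretrained("facebook/hubert-large-ls960-ft").to(device).eval()

audio = np.zeros(16000, dtype=np.float32)  # 1 second of 16 kHz audio as a stand-in
inputs = processor(audio, sampling_rate=16000, return_tensors="pt").input_values.to(device)
with torch.no_grad():
    feats = model(inputs).last_hidden_state  # shape (1, ~49 frames, 1024)
print(feats.shape)
```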

@ -0,0 +1,116 @@
import argparse
import os
import cv2
import torch
import numpy as np
import torch.nn as nn
from torch import optim
from tqdm import tqdm
from torch.utils.data import DataLoader
from unet import Model
import pickle
# from unet2 import Model
# from unet_att import Model
import time
def osmakedirs(path_list):
for path in path_list:
os.makedirs(path) if not os.path.exists(path) else None
parser = argparse.ArgumentParser(description='Train',
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--dataset', type=str, default="")
#parser.add_argument('--save_path', type=str, default="") # end with .mp4 please
parser.add_argument('--checkpoint', type=str, default="")
parser.add_argument('--avatar_id', default='ultralight_avatar1', type=str)
args = parser.parse_args()
checkpoint = args.checkpoint
dataset_dir = args.dataset
img_dir = os.path.join(dataset_dir, "full_body_img/")
lms_dir = os.path.join(dataset_dir, "landmarks/")
avatar_path = f"./results/avatars/{args.avatar_id}"
full_imgs_path = f"{avatar_path}/full_imgs"
face_imgs_path = f"{avatar_path}/face_imgs"
coords_path = f"{avatar_path}/coords.pkl"
pth_path = f"{avatar_path}/ultralight.pth"
osmakedirs([avatar_path,full_imgs_path,face_imgs_path])
len_img = len(os.listdir(img_dir)) - 1
exm_img = cv2.imread(img_dir+"0.jpg")
h, w = exm_img.shape[:2]
step_stride = 0
img_idx = 0
coord_list = []
net = Model(6, 'hubert').cuda()
net.load_state_dict(torch.load(checkpoint))
net.eval()
for i in range(len_img):
if img_idx>len_img - 1:
step_stride = -1
if img_idx<1:
step_stride = 1
img_idx += step_stride
img_path = img_dir + str(img_idx)+'.jpg'
lms_path = lms_dir + str(img_idx)+'.lms'
img = cv2.imread(img_path)
lms_list = []
with open(lms_path, "r") as f:
lines = f.read().splitlines()
for line in lines:
arr = line.split(" ")
arr = np.array(arr, dtype=np.float32)
lms_list.append(arr)
lms = np.array(lms_list, dtype=np.int32)
xmin = lms[1][0]
ymin = lms[52][1]
xmax = lms[31][0]
width = xmax - xmin
ymax = ymin + width
crop_img = img[ymin:ymax, xmin:xmax]
h, w = crop_img.shape[:2]
crop_img = cv2.resize(crop_img, (168, 168), cv2.INTER_AREA)
crop_img_ori = crop_img.copy()
img_real_ex = crop_img[4:164, 4:164].copy()
img_real_ex_ori = img_real_ex.copy()
img_masked = cv2.rectangle(img_real_ex_ori,(5,5,150,145),(0,0,0),-1)
img_masked = img_masked.transpose(2,0,1).astype(np.float32)
img_real_ex = img_real_ex.transpose(2,0,1).astype(np.float32)
img_real_ex_T = torch.from_numpy(img_real_ex / 255.0)
img_masked_T = torch.from_numpy(img_masked / 255.0)
img_concat_T = torch.cat([img_real_ex_T, img_masked_T], axis=0)[None]
audio_feat = torch.zeros(1, 32, 32, 32)
#print('audio_feat:',audio_feat.shape)
audio_feat = audio_feat.cuda()
img_concat_T = img_concat_T.cuda()
#print('img_concat_T:',img_concat_T.shape)
with torch.no_grad():
pred = net(img_concat_T, audio_feat)[0]
pred = pred.cpu().numpy().transpose(1,2,0)*255
pred = np.array(pred, dtype=np.uint8)
crop_img_ori[4:164, 4:164] = pred
crop_img_ori = cv2.resize(crop_img_ori, (w, h))
img[ymin:ymax, xmin:xmax] = crop_img_ori
cv2.putText(img, "LiveTalking", (10, 20), cv2.FONT_HERSHEY_SIMPLEX, 0.3, (128,128,128), 1)
cv2.imwrite(f"{full_imgs_path}/{img_idx:08d}.png", img)
cv2.imwrite(f"{face_imgs_path}/{img_idx:08d}.png", crop_img)
coord_list.append((xmin, ymin, xmin+w, ymin+h))
with open(coords_path, 'wb') as f:
pickle.dump(coord_list, f)
os.system(f"cp {checkpoint} {pth_path}")
# ffmpeg -i test_video.mp4 -i test_audio.pcm -c:v libx264 -c:a aac result_test.mp4
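After the script finishes, the avatar directory should contain `full_imgs`, `face_imgs`, `coords.pkl`, and `ultralight.pth`, which is what `load_avatar` above expects; note that `load_avatar` reads from `./data/avatars/`, so the generated folder presumably needs to be copied there before use. A minimal sanity check (hypothetical helper, not part of the repo):

```python
import glob
import os
import pickle

avatar_path = "./results/avatars/ultralight_avatar1"  # matches the default --avatar_id
with open(os.path.join(avatar_path, "coords.pkl"), "rb") as f:
    coords = pickle.load(f)
faces = glob.glob(os.path.join(avatar_path, "face_imgs", "*.png"))
fulls = glob.glob(os.path.join(avatar_path, "full_imgs", "*.png"))
print(f"{len(coords)} bboxes, {len(faces)} face crops, {len(fulls)} full frames")
assert os.path.exists(os.path.join(avatar_path, "ultralight.pth")), "checkpoint not copied"
```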