add wav2lip customvideo

main
lipku 12 months ago
parent 0c63e9a11b
commit 391512f68c

@@ -150,6 +150,19 @@ async def human(request):
         ),
     )
 
+async def set_audiotype(request):
+    params = await request.json()
+
+    sessionid = params.get('sessionid',0)
+    nerfreals[sessionid].set_curr_state(params['audiotype'],params['reinit'])
+
+    return web.Response(
+        content_type="application/json",
+        text=json.dumps(
+            {"code": 0, "data":"ok"}
+        ),
+    )
+
 async def on_shutdown(app):
     # close peer connections
     coros = [pc.close() for pc in pcs]
@@ -307,6 +320,8 @@ if __name__ == '__main__':
     parser.add_argument('--customvideo_img', type=str, default='data/customvideo/img')
     parser.add_argument('--customvideo_imgnum', type=int, default=1)
+    parser.add_argument('--customvideo_config', type=str, default='')
+
     parser.add_argument('--tts', type=str, default='edgetts') #xtts gpt-sovits
     parser.add_argument('--REF_FILE', type=str, default=None)
     parser.add_argument('--REF_TEXT', type=str, default=None)
@@ -325,6 +340,10 @@ if __name__ == '__main__':
     opt = parser.parse_args()
     #app.config.from_object(opt)
     #print(app.config)
+    opt.customopt = []
+    if opt.customvideo_config!='':
+        with open(opt.customvideo_config,'r') as file:
+            opt.customopt = json.load(file)
 
     if opt.model == 'ernerf':
         from ernerf.nerf_triplane.provider import NeRFDataset_Test
@@ -402,6 +421,7 @@ if __name__ == '__main__':
     appasync.on_shutdown.append(on_shutdown)
     appasync.router.add_post("/offer", offer)
     appasync.router.add_post("/human", human)
+    appasync.router.add_post("/set_audiotype", set_audiotype)
     appasync.router.add_static('/',path='web')
 
     # Configure default CORS settings.

@@ -7,8 +7,9 @@ import multiprocessing as mp
 
 class BaseASR:
-    def __init__(self, opt):
+    def __init__(self, opt, parent=None):
         self.opt = opt
+        self.parent = parent
 
         self.fps = opt.fps # 20 ms per frame
         self.sample_rate = 16000
@@ -38,8 +39,12 @@ class BaseASR:
             type = 0
             #print(f'[INFO] get frame {frame.shape}')
         except queue.Empty:
-            frame = np.zeros(self.chunk, dtype=np.float32)
-            type = 1
+            if self.parent and self.parent.curr_state>1: # play the custom audio clip
+                frame = self.parent.get_audio_stream(self.parent.curr_state)
+                type = self.parent.curr_state
+            else:
+                frame = np.zeros(self.chunk, dtype=np.float32)
+                type = 1
 
         return frame,type
 
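With the parent hook in place, the (frame, type) pair returned by get_audio_frame now distinguishes three cases instead of two. The sketch below only illustrates that convention; describe_chunk is a hypothetical helper, not part of the patch.

# type == 0 -> TTS speech pulled from the queue (drives lip-sync inference)
# type == 1 -> silence (zeros), the default full-body frames keep looping
# type >  1 -> a chunk of the custom clip whose audiotype equals this value
def describe_chunk(frame, type):
    if type == 0:
        return "speech: run wav2lip on this chunk"
    elif type == 1:
        return "silence: show the idle full-body frame"
    else:
        return f"custom clip {type}: show custom_img_cycle[{type}] on the video side"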

@@ -0,0 +1,81 @@
import math
import torch
import numpy as np
import os
import time
import cv2
import glob
import pickle
import copy
import queue
from queue import Queue
from threading import Thread, Event
from io import BytesIO
import soundfile as sf
from tqdm import tqdm

def read_imgs(img_list):
    frames = []
    print('reading images...')
    for img_path in tqdm(img_list):
        frame = cv2.imread(img_path)
        frames.append(frame)
    return frames

class BaseReal:
    def __init__(self, opt):
        self.opt = opt
        self.sample_rate = 16000
        self.chunk = self.sample_rate // opt.fps # 320 samples per chunk (20ms * 16000 / 1000)

        self.curr_state = 0
        self.custom_img_cycle = {}
        self.custom_audio_cycle = {}
        self.custom_audio_index = {}
        self.custom_index = {}
        self.custom_opt = {}
        self.__loadcustom()

    def __loadcustom(self):
        for item in self.opt.customopt:
            print(item)
            input_img_list = glob.glob(os.path.join(item['imgpath'], '*.[jpJP][pnPN]*[gG]'))
            input_img_list = sorted(input_img_list, key=lambda x: int(os.path.splitext(os.path.basename(x))[0]))
            self.custom_img_cycle[item['audiotype']] = read_imgs(input_img_list)
            self.custom_audio_cycle[item['audiotype']], sample_rate = sf.read(item['audiopath'], dtype='float32')
            self.custom_audio_index[item['audiotype']] = 0
            self.custom_index[item['audiotype']] = 0
            self.custom_opt[item['audiotype']] = item

    def mirror_index(self, size, index):
        #size = len(self.coord_list_cycle)
        turn = index // size
        res = index % size
        if turn % 2 == 0:
            return res
        else:
            return size - res - 1

    def get_audio_stream(self, audiotype):
        idx = self.custom_audio_index[audiotype]
        stream = self.custom_audio_cycle[audiotype][idx:idx+self.chunk]
        self.custom_audio_index[audiotype] += self.chunk
        if self.custom_audio_index[audiotype] >= self.custom_audio_cycle[audiotype].shape[0]:
            self.curr_state = 1 # the custom clip does not loop; switch back to the silent state
        return stream

    def set_curr_state(self, audiotype, reinit):
        self.curr_state = audiotype
        if reinit:
            self.custom_audio_index[audiotype] = 0
            self.custom_index[audiotype] = 0

    # def process_custom(self, audiotype:int, idx:int):
    #     if self.curr_state!=audiotype: # switching from inference to the scripted clip
    #         if idx in self.switch_pos: # switching is only allowed at a cue point
    #             self.curr_state=audiotype
    #             self.custom_index=0
    #     else:
    #         self.custom_index+=1
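mirror_index plays the custom image list forward and then backward, so a short clip loops without a visible jump at the wrap-around. A standalone check of the pattern (illustration only, not part of the file):

def mirror_index(size, index):
    turn = index // size
    res = index % size
    return res if turn % 2 == 0 else size - res - 1

# a 4-frame clip is traversed as 0,1,2,3,3,2,1,0,0,1,...
print([mirror_index(4, i) for i in range(10)])  # [0, 1, 2, 3, 3, 2, 1, 0, 0, 1]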

@@ -0,0 +1,7 @@
[
    {
        "audiotype":2,
        "imgpath":"data/customvideo/image",
        "audiopath":"data/customvideo/audio.wav"
    }
]
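Each entry maps an audiotype (an integer greater than 1) to a directory of image frames and a wav file. Because __loadcustom sorts images by the integer part of the file name and slices the audio in chunks sized for 16 kHz, the assets are expected to look like image/0.jpg, 1.jpg, ... plus a 16 kHz mono wav. The helper below is a hypothetical preparation step, not part of the commit; the source video path is an example.

import os
import cv2

def extract_frames(video_path="data/customvideo/source.mp4", out_dir="data/customvideo/image"):
    # writes 0.jpg, 1.jpg, ... so that BaseReal.__loadcustom() sorts them correctly
    os.makedirs(out_dir, exist_ok=True)
    cap = cv2.VideoCapture(video_path)
    i = 0
    while True:
        ok, frame = cap.read()
        if not ok:
            break
        cv2.imwrite(os.path.join(out_dir, f"{i}.jpg"), frame)
        i += 1
    cap.release()
    print(f"wrote {i} frames to {out_dir}")

# audio side, e.g.: ffmpeg -i source.mp4 -ar 16000 -ac 1 data/customvideo/audio.wav
extract_frames()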

@@ -23,8 +23,8 @@ from ttsreal import EdgeTTS,VoitsTTS,XTTS
 from lipasr import LipASR
 import asyncio
 from av import AudioFrame, VideoFrame
 
 from wav2lip.models import Wav2Lip
-
+from basereal import BaseReal
 
 from tqdm import tqdm
@@ -143,9 +143,10 @@ def inference(render_event,batch_size,face_imgs_path,audio_feat_queue,audio_out_
     print('musereal inference processor stop')
 
 @torch.no_grad()
-class LipReal:
+class LipReal(BaseReal):
     def __init__(self, opt):
-        self.opt = opt # shared with the trainer's opt to support in-place modification of rendering parameters.
+        super().__init__(opt)
+        #self.opt = opt # shared with the trainer's opt to support in-place modification of rendering parameters.
 
         self.W = opt.W
         self.H = opt.H
@@ -163,7 +164,7 @@ class LipReal:
         #self.__loadmodels()
         self.__loadavatar()
 
-        self.asr = LipASR(opt)
+        self.asr = LipASR(opt,self)
         self.asr.warm_up()
         if opt.tts == "edgetts":
             self.tts = EdgeTTS(opt,self)
@@ -213,8 +214,16 @@ class LipReal:
                 res_frame,idx,audio_frames = self.res_frame_queue.get(block=True, timeout=1)
             except queue.Empty:
                 continue
-            if audio_frames[0][1]==1 and audio_frames[1][1]==1: # all silent data, just take the full img
-                combine_frame = self.frame_list_cycle[idx]
+            if audio_frames[0][1]!=0 and audio_frames[1][1]!=0: # all silent data, just take the full img
+                audiotype = audio_frames[0][1]
+                if self.custom_index.get(audiotype) is not None: # a custom clip is configured for this audiotype
+                    mirindex = self.mirror_index(len(self.custom_img_cycle[audiotype]),self.custom_index[audiotype])
+                    combine_frame = self.custom_img_cycle[audiotype][mirindex]
+                    self.custom_index[audiotype] += 1
+                    # if not self.custom_opt[audiotype].loop and self.custom_index[audiotype]>=len(self.custom_img_cycle[audiotype]):
+                    #     self.curr_state = 1 # the clip does not loop; switch back to the silent state
+                else:
+                    combine_frame = self.frame_list_cycle[idx]
             else:
                 bbox = self.coord_list_cycle[idx]
                 combine_frame = copy.deepcopy(self.frame_list_cycle[idx])
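For context on the branch above: process_frames pairs each output video frame with two short audio chunks, so audio_frames holds two (chunk, type) tuples. The full-body or custom image is used only when both chunks are non-speech; if either chunk carries TTS speech (type 0), the wav2lip result is pasted back into the face bbox as before. A minimal illustration (the values are hypothetical, not taken from the patch):

import numpy as np

chunk = np.zeros(320, dtype=np.float32)  # one 20 ms chunk at 16 kHz
audio_frames = [(chunk, 2), (chunk, 2)]  # e.g. both chunks come from the custom clip with audiotype 2

skip_lipsync = audio_frames[0][1] != 0 and audio_frames[1][1] != 0
print(skip_lipsync)  # True -> show the custom (or idle) full image instead of blending wav2lip output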

@@ -15,7 +15,7 @@ class VllmGPT:
         self.__URL = "http://{}:{}/v1/completions".format(self.host, self.port)
         self.__URL2 = "http://{}:{}/v1/chat/completions".format(self.host, self.port)
 
-    def question(self,cont):
+    def chat(self,cont):
         chat_list = []
         # contentdb = content_db.new_instance()
         # list = contentdb.get_list('all','desc',11)
@@ -77,5 +77,5 @@ class VllmGPT:
 
 if __name__ == "__main__":
     vllm = VllmGPT('192.168.1.3','8101')
-    req = vllm.question("你叫什么名字啊今年多大了")
+    req = vllm.chat("你叫什么名字啊今年多大了")
     print(req)

@@ -0,0 +1,113 @@
<!DOCTYPE html>
<html>
<head>
    <meta charset="UTF-8"/>
    <meta name="viewport" content="width=device-width, initial-scale=1.0" />
    <title>WebRTC webcam</title>
    <style>
    button {
        padding: 8px 16px;
    }
    video {
        width: 100%;
    }
    .option {
        margin-bottom: 8px;
    }
    #media {
        max-width: 1280px;
    }
    </style>
</head>
<body>
    <div class="option">
        <input id="use-stun" type="checkbox"/>
        <label for="use-stun">Use STUN server</label>
    </div>
    <button id="start" onclick="start()">Start</button>
    <button id="stop" style="display: none" onclick="stop()">Stop</button>
    <input type="hidden" id="sessionid" value="0">

    <form class="form-inline" id="echo-form">
        <div class="form-group">
            <p>input text</p>
            <textarea cols="2" rows="3" style="width:600px;height:50px;" class="form-control" id="message">test</textarea>
        </div>
        <button type="submit" class="btn btn-default">Send</button>
    </form>

    <div id="media">
        <h2>Media</h2>
        <audio id="audio" autoplay="true"></audio>
        <video id="video" style="width:600px;" autoplay="true" playsinline="true"></video>
    </div>

    <button id="custom" onclick="custom()">Switch video</button>
    <input type="text" id="audiotype" value="0">

    <script src="client.js"></script>
    <script type="text/javascript" src="http://cdn.sockjs.org/sockjs-0.3.4.js"></script>
    <script type="text/javascript" src="https://ajax.aspnetcdn.com/ajax/jquery/jquery-2.1.1.min.js"></script>
</body>
<script type="text/javascript" charset="utf-8">
    $(document).ready(function() {
        // var host = window.location.hostname
        // var ws = new WebSocket("ws://"+host+":8000/humanecho");
        // //document.getElementsByTagName("video")[0].setAttribute("src", aa["video"]);
        // ws.onopen = function() {
        //     console.log('Connected');
        // };
        // ws.onmessage = function(e) {
        //     console.log('Received: ' + e.data);
        //     data = e
        //     var vid = JSON.parse(data.data);
        //     console.log(typeof(vid),vid)
        //     //document.getElementsByTagName("video")[0].setAttribute("src", vid["video"]);
        // };
        // ws.onclose = function(e) {
        //     console.log('Closed');
        // };

        $('#echo-form').on('submit', function(e) {
            e.preventDefault();
            var message = $('#message').val();
            console.log('Sending: ' + message);
            console.log('sessionid: ', document.getElementById('sessionid').value);
            fetch('/human', {
                body: JSON.stringify({
                    text: message,
                    type: 'echo',
                    interrupt: true,
                    sessionid: parseInt(document.getElementById('sessionid').value),
                }),
                headers: {
                    'Content-Type': 'application/json'
                },
                method: 'POST'
            });
            //ws.send(message);
            $('#message').val('');
        });

        function custom() {
            fetch('/set_audiotype', {
                body: JSON.stringify({
                    audiotype: parseInt(document.getElementById('audiotype').value),
                    reinit: false,
                    sessionid: parseInt(document.getElementById('sessionid').value),
                }),
                headers: {
                    'Content-Type': 'application/json'
                },
                method: 'POST'
            });
        }
        window.custom = custom; // expose the handler so the inline onclick="custom()" can find it
    });
</script>
</html>

@@ -30,7 +30,7 @@
         </div>
         <button id="start" onclick="start()">Start</button>
         <button id="stop" style="display: none" onclick="stop()">Stop</button>
-        <input type="hidden" id="sessionid" value="1234">
+        <input type="hidden" id="sessionid" value="0">
         <form class="form-inline" id="echo-form">
             <div class="form-group">
                 <p>input text</p>
