add wav2lip customvideo

main
lipku 12 months ago
parent 0c63e9a11b
commit 391512f68c

@@ -150,6 +150,19 @@ async def human(request):
         ),
     )
 
+async def set_audiotype(request):
+    params = await request.json()
+
+    sessionid = params.get('sessionid',0)
+    nerfreals[sessionid].set_curr_state(params['audiotype'],params['reinit'])
+
+    return web.Response(
+        content_type="application/json",
+        text=json.dumps(
+            {"code": 0, "data":"ok"}
+        ),
+    )
+
 async def on_shutdown(app):
     # close peer connections
     coros = [pc.close() for pc in pcs]
@@ -307,6 +320,8 @@ if __name__ == '__main__':
     parser.add_argument('--customvideo_img', type=str, default='data/customvideo/img')
     parser.add_argument('--customvideo_imgnum', type=int, default=1)
+    parser.add_argument('--customvideo_config', type=str, default='')
+
     parser.add_argument('--tts', type=str, default='edgetts') #xtts gpt-sovits
     parser.add_argument('--REF_FILE', type=str, default=None)
     parser.add_argument('--REF_TEXT', type=str, default=None)
@@ -325,6 +340,10 @@ if __name__ == '__main__':
     opt = parser.parse_args()
     #app.config.from_object(opt)
     #print(app.config)
+    opt.customopt = []
+    if opt.customvideo_config!='':
+        with open(opt.customvideo_config,'r') as file:
+            opt.customopt = json.load(file)
 
     if opt.model == 'ernerf':
         from ernerf.nerf_triplane.provider import NeRFDataset_Test
@@ -402,6 +421,7 @@ if __name__ == '__main__':
     appasync.on_shutdown.append(on_shutdown)
     appasync.router.add_post("/offer", offer)
     appasync.router.add_post("/human", human)
+    appasync.router.add_post("/set_audiotype", set_audiotype)
     appasync.router.add_static('/',path='web')
 
     # Configure default CORS settings.

@@ -7,8 +7,9 @@ import multiprocessing as mp
 
 class BaseASR:
-    def __init__(self, opt):
+    def __init__(self, opt, parent=None):
         self.opt = opt
+        self.parent = parent
 
         self.fps = opt.fps # 20 ms per frame
         self.sample_rate = 16000
@@ -38,8 +39,12 @@ class BaseASR:
             type = 0
             #print(f'[INFO] get frame {frame.shape}')
         except queue.Empty:
-            frame = np.zeros(self.chunk, dtype=np.float32)
-            type = 1
+            if self.parent and self.parent.curr_state>1: # play the custom audio clip
+                frame = self.parent.get_audio_stream(self.parent.curr_state)
+                type = self.parent.curr_state
+            else:
+                frame = np.zeros(self.chunk, dtype=np.float32)
+                type = 1
 
         return frame,type
 
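With the parent hook in place, the (frame, type) pair returned by get_audio_frame now distinguishes three cases instead of two. The sketch below only illustrates that convention; describe_chunk is a hypothetical helper, not part of the patch.

# type == 0 -> TTS speech pulled from the queue (drives lip-sync inference)
# type == 1 -> silence (zeros), the default full-body frames keep looping
# type >  1 -> a chunk of the custom clip whose audiotype equals this value
def describe_chunk(frame, type):
    if type == 0:
        return "speech: run wav2lip on this chunk"
    elif type == 1:
        return "silence: show the idle full-body frame"
    else:
        return f"custom clip {type}: show custom_img_cycle[{type}] on the video side"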

@@ -0,0 +1,81 @@
import math
import torch
import numpy as np
import os
import time
import cv2
import glob
import pickle
import copy
import queue
from queue import Queue
from threading import Thread, Event
from io import BytesIO
import soundfile as sf
from tqdm import tqdm

def read_imgs(img_list):
    frames = []
    print('reading images...')
    for img_path in tqdm(img_list):
        frame = cv2.imread(img_path)
        frames.append(frame)
    return frames

class BaseReal:
    def __init__(self, opt):
        self.opt = opt
        self.sample_rate = 16000
        self.chunk = self.sample_rate // opt.fps # 320 samples per chunk (20ms * 16000 / 1000)

        self.curr_state = 0
        self.custom_img_cycle = {}
        self.custom_audio_cycle = {}
        self.custom_audio_index = {}
        self.custom_index = {}
        self.custom_opt = {}
        self.__loadcustom()

    def __loadcustom(self):
        for item in self.opt.customopt:
            print(item)
            input_img_list = glob.glob(os.path.join(item['imgpath'], '*.[jpJP][pnPN]*[gG]'))
            input_img_list = sorted(input_img_list, key=lambda x: int(os.path.splitext(os.path.basename(x))[0]))
            self.custom_img_cycle[item['audiotype']] = read_imgs(input_img_list)
            self.custom_audio_cycle[item['audiotype']], sample_rate = sf.read(item['audiopath'], dtype='float32')
            self.custom_audio_index[item['audiotype']] = 0
            self.custom_index[item['audiotype']] = 0
            self.custom_opt[item['audiotype']] = item

    def mirror_index(self, size, index):
        #size = len(self.coord_list_cycle)
        turn = index // size
        res = index % size
        if turn % 2 == 0:
            return res
        else:
            return size - res - 1

    def get_audio_stream(self, audiotype):
        idx = self.custom_audio_index[audiotype]
        stream = self.custom_audio_cycle[audiotype][idx:idx+self.chunk]
        self.custom_audio_index[audiotype] += self.chunk
        if self.custom_audio_index[audiotype] >= self.custom_audio_cycle[audiotype].shape[0]:
            self.curr_state = 1 # the custom clip does not loop; switch back to the silent state
        return stream

    def set_curr_state(self, audiotype, reinit):
        self.curr_state = audiotype
        if reinit:
            self.custom_audio_index[audiotype] = 0
            self.custom_index[audiotype] = 0

    # def process_custom(self, audiotype:int, idx:int):
    #     if self.curr_state!=audiotype: # switching from inference to the scripted clip
    #         if idx in self.switch_pos: # switching is only allowed at a cue point
    #             self.curr_state=audiotype
    #             self.custom_index=0
    #     else:
    #         self.custom_index+=1
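mirror_index plays the custom image list forward and then backward, so a short clip loops without a visible jump at the wrap-around. A standalone check of the pattern (illustration only, not part of the file):

def mirror_index(size, index):
    turn = index // size
    res = index % size
    return res if turn % 2 == 0 else size - res - 1

# a 4-frame clip is traversed as 0,1,2,3,3,2,1,0,0,1,...
print([mirror_index(4, i) for i in range(10)])  # [0, 1, 2, 3, 3, 2, 1, 0, 0, 1]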

@@ -0,0 +1,7 @@
[
    {
        "audiotype":2,
        "imgpath":"data/customvideo/image",
        "audiopath":"data/customvideo/audio.wav"
    }
]
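Each entry maps an audiotype (an integer greater than 1) to a directory of image frames and a wav file. Because __loadcustom sorts images by the integer part of the file name and slices the audio in chunks sized for 16 kHz, the assets are expected to look like image/0.jpg, 1.jpg, ... plus a 16 kHz mono wav. The helper below is a hypothetical preparation step, not part of the commit; the source video path is an example.

import os
import cv2

def extract_frames(video_path="data/customvideo/source.mp4", out_dir="data/customvideo/image"):
    # writes 0.jpg, 1.jpg, ... so that BaseReal.__loadcustom() sorts them correctly
    os.makedirs(out_dir, exist_ok=True)
    cap = cv2.VideoCapture(video_path)
    i = 0
    while True:
        ok, frame = cap.read()
        if not ok:
            break
        cv2.imwrite(os.path.join(out_dir, f"{i}.jpg"), frame)
        i += 1
    cap.release()
    print(f"wrote {i} frames to {out_dir}")

# audio side, e.g.: ffmpeg -i source.mp4 -ar 16000 -ac 1 data/customvideo/audio.wav
extract_frames()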

@@ -23,8 +23,8 @@ from ttsreal import EdgeTTS,VoitsTTS,XTTS
 from lipasr import LipASR
 import asyncio
 from av import AudioFrame, VideoFrame
 
 from wav2lip.models import Wav2Lip
-
+from basereal import BaseReal
 
 from tqdm import tqdm
@@ -143,9 +143,10 @@ def inference(render_event,batch_size,face_imgs_path,audio_feat_queue,audio_out_
     print('musereal inference processor stop')
 
 @torch.no_grad()
-class LipReal:
+class LipReal(BaseReal):
     def __init__(self, opt):
-        self.opt = opt # shared with the trainer's opt to support in-place modification of rendering parameters.
+        super().__init__(opt)
+        #self.opt = opt # shared with the trainer's opt to support in-place modification of rendering parameters.
 
         self.W = opt.W
         self.H = opt.H
@@ -163,7 +164,7 @@ class LipReal:
         #self.__loadmodels()
         self.__loadavatar()
 
-        self.asr = LipASR(opt)
+        self.asr = LipASR(opt,self)
         self.asr.warm_up()
         if opt.tts == "edgetts":
             self.tts = EdgeTTS(opt,self)
@@ -213,8 +214,16 @@ class LipReal:
                 res_frame,idx,audio_frames = self.res_frame_queue.get(block=True, timeout=1)
             except queue.Empty:
                 continue
-            if audio_frames[0][1]==1 and audio_frames[1][1]==1: # all silent data, just take the full img
-                combine_frame = self.frame_list_cycle[idx]
+            if audio_frames[0][1]!=0 and audio_frames[1][1]!=0: # all silent data, just take the full img
+                audiotype = audio_frames[0][1]
+                if self.custom_index.get(audiotype) is not None: # a custom clip is configured for this audiotype
+                    mirindex = self.mirror_index(len(self.custom_img_cycle[audiotype]),self.custom_index[audiotype])
+                    combine_frame = self.custom_img_cycle[audiotype][mirindex]
+                    self.custom_index[audiotype] += 1
+                    # if not self.custom_opt[audiotype].loop and self.custom_index[audiotype]>=len(self.custom_img_cycle[audiotype]):
+                    #     self.curr_state = 1 # the clip does not loop; switch back to the silent state
+                else:
+                    combine_frame = self.frame_list_cycle[idx]
             else:
                 bbox = self.coord_list_cycle[idx]
                 combine_frame = copy.deepcopy(self.frame_list_cycle[idx])
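For context on the branch above: process_frames pairs each output video frame with two short audio chunks, so audio_frames holds two (chunk, type) tuples. The full-body or custom image is used only when both chunks are non-speech; if either chunk carries TTS speech (type 0), the wav2lip result is pasted back into the face bbox as before. A minimal illustration (the values are hypothetical, not taken from the patch):

import numpy as np

chunk = np.zeros(320, dtype=np.float32)  # one 20 ms chunk at 16 kHz
audio_frames = [(chunk, 2), (chunk, 2)]  # e.g. both chunks come from the custom clip with audiotype 2

skip_lipsync = audio_frames[0][1] != 0 and audio_frames[1][1] != 0
print(skip_lipsync)  # True -> show the custom (or idle) full image instead of blending wav2lip output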

@@ -15,7 +15,7 @@ class VllmGPT:
         self.__URL = "http://{}:{}/v1/completions".format(self.host, self.port)
         self.__URL2 = "http://{}:{}/v1/chat/completions".format(self.host, self.port)
 
-    def question(self,cont):
+    def chat(self,cont):
         chat_list = []
         # contentdb = content_db.new_instance()
         # list = contentdb.get_list('all','desc',11)
@@ -77,5 +77,5 @@ class VllmGPT:
 
 if __name__ == "__main__":
     vllm = VllmGPT('192.168.1.3','8101')
-    req = vllm.question("你叫什么名字啊今年多大了")
+    req = vllm.chat("你叫什么名字啊今年多大了")
     print(req)

@@ -0,0 +1,113 @@
<!DOCTYPE html>
<html>
<head>
    <meta charset="UTF-8"/>
    <meta name="viewport" content="width=device-width, initial-scale=1.0" />
    <title>WebRTC webcam</title>
    <style>
    button {
        padding: 8px 16px;
    }
    video {
        width: 100%;
    }
    .option {
        margin-bottom: 8px;
    }
    #media {
        max-width: 1280px;
    }
    </style>
</head>
<body>
    <div class="option">
        <input id="use-stun" type="checkbox"/>
        <label for="use-stun">Use STUN server</label>
    </div>
    <button id="start" onclick="start()">Start</button>
    <button id="stop" style="display: none" onclick="stop()">Stop</button>
    <input type="hidden" id="sessionid" value="0">

    <form class="form-inline" id="echo-form">
        <div class="form-group">
            <p>input text</p>
            <textarea cols="2" rows="3" style="width:600px;height:50px;" class="form-control" id="message">test</textarea>
        </div>
        <button type="submit" class="btn btn-default">Send</button>
    </form>

    <div id="media">
        <h2>Media</h2>
        <audio id="audio" autoplay="true"></audio>
        <video id="video" style="width:600px;" autoplay="true" playsinline="true"></video>
    </div>

    <button id="custom" onclick="custom()">Switch video</button>
    <input type="text" id="audiotype" value="0">

    <script src="client.js"></script>
    <script type="text/javascript" src="http://cdn.sockjs.org/sockjs-0.3.4.js"></script>
    <script type="text/javascript" src="https://ajax.aspnetcdn.com/ajax/jquery/jquery-2.1.1.min.js"></script>
</body>
<script type="text/javascript" charset="utf-8">
    $(document).ready(function() {
        // var host = window.location.hostname
        // var ws = new WebSocket("ws://"+host+":8000/humanecho");
        // //document.getElementsByTagName("video")[0].setAttribute("src", aa["video"]);
        // ws.onopen = function() {
        //     console.log('Connected');
        // };
        // ws.onmessage = function(e) {
        //     console.log('Received: ' + e.data);
        //     data = e
        //     var vid = JSON.parse(data.data);
        //     console.log(typeof(vid),vid)
        //     //document.getElementsByTagName("video")[0].setAttribute("src", vid["video"]);
        // };
        // ws.onclose = function(e) {
        //     console.log('Closed');
        // };

        $('#echo-form').on('submit', function(e) {
            e.preventDefault();
            var message = $('#message').val();
            console.log('Sending: ' + message);
            console.log('sessionid: ', document.getElementById('sessionid').value);
            fetch('/human', {
                body: JSON.stringify({
                    text: message,
                    type: 'echo',
                    interrupt: true,
                    sessionid: parseInt(document.getElementById('sessionid').value),
                }),
                headers: {
                    'Content-Type': 'application/json'
                },
                method: 'POST'
            });
            //ws.send(message);
            $('#message').val('');
        });

        function custom() {
            fetch('/set_audiotype', {
                body: JSON.stringify({
                    audiotype: parseInt(document.getElementById('audiotype').value),
                    reinit: false,
                    sessionid: parseInt(document.getElementById('sessionid').value),
                }),
                headers: {
                    'Content-Type': 'application/json'
                },
                method: 'POST'
            });
        }
        window.custom = custom; // expose the handler so the inline onclick="custom()" can find it
    });
</script>
</html>

@@ -30,7 +30,7 @@
         </div>
         <button id="start" onclick="start()">Start</button>
         <button id="stop" style="display: none" onclick="stop()">Stop</button>
-        <input type="hidden" id="sessionid" value="1234">
+        <input type="hidden" id="sessionid" value="0">
         <form class="form-inline" id="echo-form">
             <div class="form-group">
                 <p>input text</p>
