Modify the is_speaking state

main
fanpt 2 weeks ago
parent f65f0b31ac
commit 2b81272a8e
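
This commit introduces a separate liv_speaking flag on BaseReal for the TTS-driven speaking state: the human handler's echo branch sets liv_speaking instead of speaking, is_speaking() now reports it, and notify() toggles it on the TTS start/end eventpoints. The SovitsTTS request additionally gains a speed_factor of 1.2.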

@@ -217,7 +217,7 @@ async def human(request):
             nerfreals[sessionid].flush_talk()
         if params['type'] == 'echo':
-            nerfreals[sessionid].speaking = True
+            nerfreals[sessionid].liv_speaking = True
             nerfreals[sessionid].put_msg_txt(params['text'])
         elif params['type'] == 'chat':
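
For context, a minimal client call exercising the echo branch above, as a sketch; the route name comes from the hunk header, while the host, port, and sessionid value are assumptions:

    import requests

    # Hypothetical call into the echo branch; host and port are assumed defaults.
    requests.post('http://127.0.0.1:8010/human',
                  json={'sessionid': 0, 'type': 'echo', 'text': 'hello'})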

@@ -50,7 +50,7 @@ def read_imgs(img_list):
         frames.append(frame)
     return frames
 
 def play_audio(quit_event,queue):
     import pyaudio
     p = pyaudio.PyAudio()
     stream = p.open(
@@ -88,9 +88,11 @@ class BaseReal:
             self.tts = TencentTTS(opt,self)
         elif opt.tts == "doubao":
             self.tts = DoubaoTTS(opt,self)
 
         self.speaking = False
+        self.liv_speaking = False
         self.recording = False
         self._record_video_pipe = None
         self._record_audio_pipe = None
@@ -106,11 +108,11 @@ class BaseReal:
     def put_msg_txt(self,msg,eventpoint=None):
         self.tts.put_msg_txt(msg,eventpoint)
 
     def put_audio_frame(self,audio_chunk,eventpoint=None): #16khz 20ms pcm
         self.asr.put_audio_frame(audio_chunk,eventpoint)
 
     def put_audio_file(self,filebyte):
         input_stream = BytesIO(filebyte)
         stream = self.__create_bytes_stream(input_stream)
         streamlen = stream.shape[0]
@@ -119,7 +121,7 @@ class BaseReal:
             self.put_audio_frame(stream[idx:idx+self.chunk])
             streamlen -= self.chunk
            idx += self.chunk
 
     def __create_bytes_stream(self,byte_stream):
         #byte_stream=BytesIO(buffer)
         stream, sample_rate = sf.read(byte_stream) # [T*sample_rate,] float64
@@ -129,7 +131,7 @@ class BaseReal:
         if stream.ndim > 1:
             logger.info(f'[WARN] audio has {stream.shape[1]} channels, only use the first.')
             stream = stream[:, 0]
 
         if sample_rate != self.sample_rate and stream.shape[0]>0:
             logger.info(f'[WARN] audio sample rate is {sample_rate}, resampling into {self.sample_rate}.')
             stream = resampy.resample(x=stream, sr_orig=sample_rate, sr_new=self.sample_rate)
@@ -141,8 +143,8 @@ class BaseReal:
         self.asr.flush_talk()
 
     def is_speaking(self)->bool:
-        return self.speaking
+        return self.liv_speaking
 
     def __loadcustom(self):
         for item in self.opt.customopt:
             logger.info(item)
@@ -161,8 +163,15 @@ class BaseReal:
         for key in self.custom_index:
             self.custom_index[key]=0
 
-    def notify(self,eventpoint):
-        logger.info("notify:%s",eventpoint)
+    def notify(self, eventpoint):
+        logger.info("notify:%s", eventpoint)
+        # access the event status as a dict key, not an object attribute
+        if eventpoint['status'] == "start":
+            self.liv_speaking = True
+            logger.info("tts start")  # likely a typo before; "start" fits better here
+        if eventpoint['status'] == "end":
+            self.liv_speaking = False
+            logger.info("tts end")
 
     def start_recording(self):
         """Start recording video"""
@@ -177,9 +186,9 @@ class BaseReal:
             '-s', "{}x{}".format(self.width, self.height),
             '-r', str(25),
             '-i', '-',
             '-pix_fmt', 'yuv420p',
             '-vcodec', "h264",
             #'-f' , 'flv',
             f'temp{self.opt.sessionid}.mp4']
         self._record_video_pipe = subprocess.Popen(command, shell=False, stdin=subprocess.PIPE)
@@ -191,7 +200,7 @@ class BaseReal:
             '-ar', '16000',
             '-i', '-',
             '-acodec', 'aac',
             #'-f' , 'wav',
             f'temp{self.opt.sessionid}.aac']
         self._record_audio_pipe = subprocess.Popen(acommand, shell=False, stdin=subprocess.PIPE)
@@ -199,10 +208,10 @@ class BaseReal:
         # self.recordq_video.queue.clear()
         # self.recordq_audio.queue.clear()
         # self.container = av.open(path, mode="w")
         # process_thread = Thread(target=self.record_frame, args=())
         # process_thread.start()
 
     def record_video_data(self,image):
         if self.width == 0:
             print("image.shape:",image.shape)
@@ -213,14 +222,14 @@ class BaseReal:
     def record_audio_data(self,frame):
         if self.recording:
             self._record_audio_pipe.stdin.write(frame.tostring())
 
     # def record_frame(self):
     #     videostream = self.container.add_stream("libx264", rate=25)
     #     videostream.codec_context.time_base = Fraction(1, 25)
     #     audiostream = self.container.add_stream("aac")
     #     audiostream.codec_context.time_base = Fraction(1, 16000)
     #     init = True
     #     framenum = 0
     #     while self.recording:
     #         try:
     #             videoframe = self.recordq_video.get(block=True, timeout=1)
@@ -253,18 +262,18 @@ class BaseReal:
     #         self.recordq_video.queue.clear()
     #         self.recordq_audio.queue.clear()
     #     print('record thread stop')
 
     def stop_recording(self):
         """Stop recording video"""
         if not self.recording:
             return
         self.recording = False
         self._record_video_pipe.stdin.close()  #wait()
         self._record_video_pipe.wait()
         self._record_audio_pipe.stdin.close()
         self._record_audio_pipe.wait()
         cmd_combine_audio = f"ffmpeg -y -i temp{self.opt.sessionid}.aac -i temp{self.opt.sessionid}.mp4 -c:v copy -c:a copy data/record.mp4"
         os.system(cmd_combine_audio)
         #os.remove(output_path)
 
     def mirror_index(self,size, index):
@@ -274,8 +283,8 @@ class BaseReal:
         if turn % 2 == 0:
             return res
         else:
             return size - res - 1
 
     def get_audio_stream(self,audiotype):
         idx = self.custom_audio_index[audiotype]
         stream = self.custom_audio_cycle[audiotype][idx:idx+self.chunk]
@@ -283,7 +292,7 @@ class BaseReal:
         if self.custom_audio_index[audiotype]>=self.custom_audio_cycle[audiotype].shape[0]:
             self.curr_state = 1  # the current video does not loop; switch to the silent state
         return stream
 
     def set_custom_state(self,audiotype, reinit=True):
         print('set_custom_state:',audiotype)
         if self.custom_audio_index.get(audiotype) is None:
@@ -295,14 +304,14 @@ class BaseReal:
     def process_frames(self,quit_event,loop=None,audio_track=None,video_track=None):
         enable_transition = False  # False disables the transition effect; True enables it
         if enable_transition:
             _last_speaking = False
             _transition_start = time.time()
             _transition_duration = 0.1  # transition duration
             _last_silent_frame = None  # cached silent frame
             _last_speaking_frame = None  # cached speaking frame
 
         if self.opt.transport=='virtualcam':
             import pyvirtualcam
             vircam = None
@@ -310,13 +319,13 @@ class BaseReal:
             audio_tmp = queue.Queue(maxsize=3000)
             audio_thread = Thread(target=play_audio, args=(quit_event,audio_tmp,), daemon=True, name="pyaudio_stream")
             audio_thread.start()
 
         while not quit_event.is_set():
             try:
                 res_frame,idx,audio_frames = self.res_frame_queue.get(block=True, timeout=1)
             except queue.Empty:
                 continue
             if enable_transition:
                 # detect speaking-state changes
                 current_speaking = not (audio_frames[0][1]!=0 and audio_frames[1][1]!=0)
@@ -334,7 +343,7 @@ class BaseReal:
                         self.custom_index[audiotype] += 1
                 else:
                     target_frame = self.frame_list_cycle[idx]
 
                 if enable_transition:
                     # speaking → silent transition
                     if time.time() - _transition_start < _transition_duration and _last_speaking_frame is not None:
@@ -394,8 +403,8 @@ class BaseReal:
         if self.opt.transport=='virtualcam':
             audio_thread.join()
             vircam.close()
         logger.info('basereal process_frames thread stop')
 
     # def process_custom(self,audiotype:int,idx:int):
     #     if self.curr_state!=audiotype:  # switch from inference to custom speech
     #         if idx in self.switch_pos:  # switching is allowed at a cut point

@@ -253,7 +253,8 @@ class SovitsTTS(BaseTTS):
             'prompt_text':reftext,
             'prompt_lang':language,
             'media_type':'ogg',
-            'streaming_mode':True
+            'streaming_mode':True,
+            "speed_factor":1.2
         }
         # req["text"] = text
         # req["text_language"] = language
@@ -467,7 +468,7 @@ class TencentTTS(BaseTTS):
         try:
             res = requests.post(url, headers=headers,
                                 data=json.dumps(params), stream=True)
             end = time.perf_counter()
             logger.info(f"tencent Time to make POST: {end-start}s")
