diff --git a/README.md b/README.md
index 4812355..48fcddd 100644
--- a/README.md
+++ b/README.md
@@ -11,6 +11,7 @@ Real time interactive streaming digital human, realize audio video synchronous
 - 2024.12.28 Added the Ultralight-Digital-Human digital-human model. Thanks @lijihua2017
 - 2025.2.7 Added fish-speech tts
 - 2025.2.21 Added the open-source wav2lip256 model. Thanks @不蠢不蠢
+- 2025.3.2 Added Tencent text-to-speech (TTS) service
 
 ## Features
 1. Supports multiple digital-human models: ernerf, musetalk, wav2lip, Ultralight-Digital-Human
diff --git a/basereal.py b/basereal.py
index 0f66ad6..edb722c 100644
--- a/basereal.py
+++ b/basereal.py
@@ -35,7 +35,7 @@ import soundfile as sf
 import av
 from fractions import Fraction
 
-from ttsreal import EdgeTTS,VoitsTTS,XTTS,CosyVoiceTTS,FishTTS
+from ttsreal import EdgeTTS,VoitsTTS,XTTS,CosyVoiceTTS,FishTTS,TencentTTS
 
 from logger import logger
 from tqdm import tqdm
@@ -64,6 +64,8 @@ class BaseReal:
             self.tts = CosyVoiceTTS(opt,self)
         elif opt.tts == "fishtts":
             self.tts = FishTTS(opt,self)
+        elif opt.tts == "tencent":
+            self.tts = TencentTTS(opt,self)
 
         self.speaking = False
 
diff --git a/ttsreal.py b/ttsreal.py
index e0b35a3..fa98a7b 100644
--- a/ttsreal.py
+++ b/ttsreal.py
@@ -22,6 +22,13 @@ import resampy
 import asyncio
 import edge_tts
 
+import os
+import hmac
+import hashlib
+import base64
+import json
+import uuid
+
 from typing import Iterator
 
 import requests
@@ -351,7 +358,7 @@ class CosyVoiceTTS(BaseTTS):
 
             first = True
 
-            for chunk in res.iter_content(chunk_size=8820): # 882 22.05K*20ms*2
+            for chunk in res.iter_content(chunk_size=9600): # 960 24K*20ms*2
                 if first:
                     end = time.perf_counter()
                     logger.info(f"cosy_voice Time to first chunk: {end-start}s")
@@ -367,7 +374,130 @@ class CosyVoiceTTS(BaseTTS):
         for chunk in audio_stream:
             if chunk is not None and len(chunk)>0:
                 stream = np.frombuffer(chunk, dtype=np.int16).astype(np.float32) / 32767
-                stream = resampy.resample(x=stream, sr_orig=22050, sr_new=self.sample_rate)
+                stream = resampy.resample(x=stream, sr_orig=24000, sr_new=self.sample_rate)
+                #byte_stream=BytesIO(buffer)
+                #stream = self.__create_bytes_stream(byte_stream)
+                streamlen = stream.shape[0]
+                idx=0
+                while streamlen >= self.chunk:
+                    eventpoint=None
+                    if first:
+                        eventpoint={'status':'start','text':text,'msgenvent':textevent}
+                        first = False
+                    self.parent.put_audio_frame(stream[idx:idx+self.chunk],eventpoint)
+                    streamlen -= self.chunk
+                    idx += self.chunk
+        eventpoint={'status':'end','text':text,'msgenvent':textevent}
+        self.parent.put_audio_frame(np.zeros(self.chunk,np.float32),eventpoint)
+
+###########################################################################################
+_PROTOCOL = "https://"
+_HOST = "tts.cloud.tencent.com"
+_PATH = "/stream"
+_ACTION = "TextToStreamAudio"
+
+class TencentTTS(BaseTTS):
+    def __init__(self, opt, parent):
+        super().__init__(opt,parent)
+        self.appid = os.getenv("TENCENT_APPID")
+        self.secret_key = os.getenv("TENCENT_SECRET_KEY")
+        self.secret_id = os.getenv("TENCENT_SECRET_ID")
+        self.voice_type = int(opt.REF_FILE)
+        self.codec = "pcm"
+        self.sample_rate = 16000
+        self.volume = 0
+        self.speed = 0
+
+    def __gen_signature(self, params):
+        sort_dict = sorted(params.keys())
+        sign_str = "POST" + _HOST + _PATH + "?"
+        for key in sort_dict:
+            sign_str = sign_str + key + "=" + str(params[key]) + '&'
+        sign_str = sign_str[:-1]
+        hmacstr = hmac.new(self.secret_key.encode('utf-8'),
+                           sign_str.encode('utf-8'), hashlib.sha1).digest()
+        s = base64.b64encode(hmacstr)
+        s = s.decode('utf-8')
+        return s
+
+    def __gen_params(self, session_id, text):
+        params = dict()
+        params['Action'] = _ACTION
+        params['AppId'] = int(self.appid)
+        params['SecretId'] = self.secret_id
+        params['ModelType'] = 1
+        params['VoiceType'] = self.voice_type
+        params['Codec'] = self.codec
+        params['SampleRate'] = self.sample_rate
+        params['Speed'] = self.speed
+        params['Volume'] = self.volume
+        params['SessionId'] = session_id
+        params['Text'] = text
+
+        timestamp = int(time.time())
+        params['Timestamp'] = timestamp
+        params['Expired'] = timestamp + 24 * 60 * 60
+        return params
+
+    def txt_to_audio(self,msg):
+        text,textevent = msg
+        self.stream_tts(
+            self.tencent_voice(
+                text,
+                self.opt.REF_FILE,
+                self.opt.REF_TEXT,
+                "zh", #en args.language,
+                self.opt.TTS_SERVER, #"http://127.0.0.1:5000", #args.server_url,
+            ),
+            msg
+        )
+
+    def tencent_voice(self, text, reffile, reftext,language, server_url) -> Iterator[bytes]:
+        start = time.perf_counter()
+        session_id = str(uuid.uuid1())
+        params = self.__gen_params(session_id, text)
+        signature = self.__gen_signature(params)
+        headers = {
+            "Content-Type": "application/json",
+            "Authorization": str(signature)
+        }
+        url = _PROTOCOL + _HOST + _PATH
+        try:
+            res = requests.post(url, headers=headers,
+                                data=json.dumps(params), stream=True)
+
+            end = time.perf_counter()
+            logger.info(f"tencent Time to make POST: {end-start}s")
+
+            first = True
+
+            for chunk in res.iter_content(chunk_size=6400): # 640 16K*20ms*2
+                #logger.info('chunk len:%d',len(chunk))
+                if first:
+                    try:
+                        rsp = json.loads(chunk)
+                        #response["Code"] = rsp["Response"]["Error"]["Code"]
+                        #response["Message"] = rsp["Response"]["Error"]["Message"]
+                        logger.error("tencent tts:%s",rsp["Response"]["Error"]["Message"])
+                        return
+                    except:
+                        end = time.perf_counter()
+                        logger.info(f"tencent Time to first chunk: {end-start}s")
+                        first = False
+                if chunk and self.state==State.RUNNING:
+                    yield chunk
+        except Exception as e:
+            logger.exception('tencent')
+
+    def stream_tts(self,audio_stream,msg):
+        text,textevent = msg
+        first = True
+        last_stream = np.array([],dtype=np.float32)
+        for chunk in audio_stream:
+            if chunk is not None and len(chunk)>0:
+                stream = np.frombuffer(chunk, dtype=np.int16).astype(np.float32) / 32767
+                stream = np.concatenate((last_stream,stream))
+                #stream = resampy.resample(x=stream, sr_orig=24000, sr_new=self.sample_rate)
                 #byte_stream=BytesIO(buffer)
                 #stream = self.__create_bytes_stream(byte_stream)
                 streamlen = stream.shape[0]
                 idx=0
@@ -380,10 +510,12 @@ class CosyVoiceTTS(BaseTTS):
                 self.parent.put_audio_frame(stream[idx:idx+self.chunk],eventpoint)
                 streamlen -= self.chunk
                 idx += self.chunk
+                last_stream = stream[idx:] #get the remain stream
         eventpoint={'status':'end','text':text,'msgenvent':textevent}
         self.parent.put_audio_frame(np.zeros(self.chunk,np.float32),eventpoint)
 
 ###########################################################################################
+
 class XTTS(BaseTTS):
     def __init__(self, opt, parent):
         super().__init__(opt,parent)
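
Note on the new TencentTTS backend: it reads TENCENT_APPID, TENCENT_SECRET_ID and TENCENT_SECRET_KEY from the environment, reuses opt.REF_FILE as the numeric Tencent VoiceType, and requests 16 kHz PCM, which matches the pipeline sample rate, so the resample call in its stream_tts is commented out. Each POST to https://tts.cloud.tencent.com/stream is signed with HMAC-SHA1: the string to sign is "POST" + host + path + "?" followed by the request parameters as key=value pairs joined with "&" in sorted key order, and the Authorization header carries the base64 of the digest. The snippet below is a minimal standalone sketch of that signing scheme, mirroring __gen_signature, with placeholder credentials and voice type; it can help debug authentication failures outside the full pipeline.

import base64
import hashlib
import hmac
import time
import uuid

_HOST = "tts.cloud.tencent.com"
_PATH = "/stream"

def gen_signature(params, secret_key):
    # Sign "POST" + host + path + "?" + sorted key=value pairs with HMAC-SHA1, then base64-encode.
    query = "&".join(f"{k}={params[k]}" for k in sorted(params))
    sign_str = "POST" + _HOST + _PATH + "?" + query
    digest = hmac.new(secret_key.encode("utf-8"), sign_str.encode("utf-8"), hashlib.sha1).digest()
    return base64.b64encode(digest).decode("utf-8")

if __name__ == "__main__":
    now = int(time.time())
    params = {
        "Action": "TextToStreamAudio",
        "AppId": 1300000000,          # placeholder, use your TENCENT_APPID
        "SecretId": "AKIDxxxxxxxx",   # placeholder, use your TENCENT_SECRET_ID
        "ModelType": 1,
        "VoiceType": 101001,          # placeholder voice id
        "Codec": "pcm",
        "SampleRate": 16000,
        "Speed": 0,
        "Volume": 0,
        "SessionId": str(uuid.uuid1()),
        "Text": "hello",
        "Timestamp": now,
        "Expired": now + 24 * 60 * 60,
    }
    print(gen_signature(params, "dummy-secret-key"))

On the streaming path itself, tencent_voice probes the first chunk with json.loads: if it parses, the body is treated as an API error response and logged, otherwise the stream is raw PCM. stream_tts then carries the unconsumed tail of each chunk forward in last_stream so 20 ms frame boundaries stay aligned across chunks.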