|
|
|
@ -22,6 +22,13 @@ import resampy
|
|
|
|
|
import asyncio
|
|
|
|
|
import edge_tts
|
|
|
|
|
|
|
|
|
|
import os
|
|
|
|
|
import hmac
|
|
|
|
|
import hashlib
|
|
|
|
|
import base64
|
|
|
|
|
import json
|
|
|
|
|
import uuid
|
|
|
|
|
|
|
|
|
|
from typing import Iterator
|
|
|
|
|
|
|
|
|
|
import requests
|
|
|
|
@ -351,7 +358,7 @@ class CosyVoiceTTS(BaseTTS):
|
|
|
|
|
|
|
|
|
|
first = True
|
|
|
|
|
|
|
|
|
|
for chunk in res.iter_content(chunk_size=8820): # 882 22.05K*20ms*2
|
|
|
|
|
for chunk in res.iter_content(chunk_size=9600): # 960 24K*20ms*2
|
|
|
|
|
if first:
|
|
|
|
|
end = time.perf_counter()
|
|
|
|
|
logger.info(f"cosy_voice Time to first chunk: {end-start}s")
|
|
|
|
@ -367,7 +374,7 @@ class CosyVoiceTTS(BaseTTS):
|
|
|
|
|
for chunk in audio_stream:
|
|
|
|
|
if chunk is not None and len(chunk)>0:
|
|
|
|
|
stream = np.frombuffer(chunk, dtype=np.int16).astype(np.float32) / 32767
|
|
|
|
|
stream = resampy.resample(x=stream, sr_orig=22050, sr_new=self.sample_rate)
|
|
|
|
|
stream = resampy.resample(x=stream, sr_orig=24000, sr_new=self.sample_rate)
|
|
|
|
|
#byte_stream=BytesIO(buffer)
|
|
|
|
|
#stream = self.__create_bytes_stream(byte_stream)
|
|
|
|
|
streamlen = stream.shape[0]
|
|
|
|
@ -384,6 +391,131 @@ class CosyVoiceTTS(BaseTTS):
|
|
|
|
|
self.parent.put_audio_frame(np.zeros(self.chunk,np.float32),eventpoint)
|
|
|
|
|
|
|
|
|
|
###########################################################################################
|
|
|
|
|
_PROTOCOL = "https://"
|
|
|
|
|
_HOST = "tts.cloud.tencent.com"
|
|
|
|
|
_PATH = "/stream"
|
|
|
|
|
_ACTION = "TextToStreamAudio"
|
|
|
|
|
|
|
|
|
|
class TencentTTS(BaseTTS):
    """Streaming text-to-speech backend using the Tencent Cloud ``TextToStreamAudio`` API.

    Credentials are read from the ``TENCENT_APPID``, ``TENCENT_SECRET_KEY`` and
    ``TENCENT_SECRET_ID`` environment variables. Audio is requested as 16 kHz
    PCM, decoded as 16-bit samples and handed to ``parent.put_audio_frame`` in
    frames of ``self.chunk`` samples (``self.chunk`` is presumably set by
    ``BaseTTS`` -- confirm against the base class).
    """

    # (connect, read) timeouts in seconds, so that a stalled server cannot
    # block the calling thread forever.
    _REQUEST_TIMEOUT = (10, 30)

    def __init__(self, opt, parent):
        """Read credentials from the environment and set fixed synthesis options.

        Args:
            opt: Options object; ``opt.REF_FILE`` must be convertible to ``int``
                and is used as the Tencent voice type.
            parent: Object receiving audio through ``put_audio_frame``.
        """
        super().__init__(opt, parent)
        self.appid = os.getenv("TENCENT_APPID")
        self.secret_key = os.getenv("TENCENT_SECRET_KEY")
        self.secret_id = os.getenv("TENCENT_SECRET_ID")
        self.voice_type = int(opt.REF_FILE)
        self.codec = "pcm"
        self.sample_rate = 16000
        self.volume = 0
        self.speed = 0

    def __gen_signature(self, params):
        """Return the base64-encoded HMAC-SHA1 signature for ``params``.

        The signed string is ``"POST" + host + path + "?"`` followed by the
        ``key=value`` pairs sorted by key and joined with ``&``.
        """
        query = "&".join(f"{key}={params[key]}" for key in sorted(params))
        sign_str = "POST" + _HOST + _PATH + "?" + query
        digest = hmac.new(self.secret_key.encode('utf-8'),
                          sign_str.encode('utf-8'), hashlib.sha1).digest()
        return base64.b64encode(digest).decode('utf-8')

    def __gen_params(self, session_id, text):
        """Build the request parameters for one synthesis call.

        Args:
            session_id: Identifier sent as ``SessionId``.
            text: Text to synthesize.

        Returns:
            dict: Parameters, including ``Timestamp`` (now) and ``Expired``
            (now + 24 hours), both in whole seconds.
        """
        timestamp = int(time.time())
        return {
            'Action': _ACTION,
            'AppId': int(self.appid),
            'SecretId': self.secret_id,
            'ModelType': 1,
            'VoiceType': self.voice_type,
            'Codec': self.codec,
            'SampleRate': self.sample_rate,
            'Speed': self.speed,
            'Volume': self.volume,
            'SessionId': session_id,
            'Text': text,
            'Timestamp': timestamp,
            'Expired': timestamp + 24 * 60 * 60,
        }

    def txt_to_audio(self, msg):
        """Synthesize ``msg`` and push the resulting audio frames to the parent.

        Args:
            msg: ``(text, textevent)`` pair.
        """
        text, textevent = msg
        self.stream_tts(
            self.tencent_voice(
                text,
                self.opt.REF_FILE,
                self.opt.REF_TEXT,
                "zh",
                self.opt.TTS_SERVER,
            ),
            msg
        )

    def tencent_voice(self, text, reffile, reftext, language, server_url) -> Iterator[bytes]:
        """Yield raw audio chunks streamed from the Tencent TTS endpoint.

        Only ``text`` is used; ``reffile``, ``reftext``, ``language`` and
        ``server_url`` are accepted but ignored by this backend.

        Failures (missing credentials, network errors, an error payload from
        the service) are logged and end the stream instead of propagating.
        """
        start = time.perf_counter()
        try:
            session_id = str(uuid.uuid1())
            params = self.__gen_params(session_id, text)
            signature = self.__gen_signature(params)
            headers = {
                "Content-Type": "application/json",
                "Authorization": str(signature)
            }
            url = _PROTOCOL + _HOST + _PATH
            # The context manager releases the connection even when the
            # generator stops early or an exception is raised.
            with requests.post(url, headers=headers, data=json.dumps(params),
                               stream=True, timeout=self._REQUEST_TIMEOUT) as res:
                end = time.perf_counter()
                logger.info(f"tencent Time to make POST: {end-start}s")

                first = True
                # 6400 bytes = ten 20 ms frames of 16 kHz 16-bit audio (640 bytes each).
                for chunk in res.iter_content(chunk_size=6400):
                    if first:
                        first = False
                        # On failure the service answers with a JSON error
                        # document instead of audio, so probe the first chunk.
                        try:
                            rsp = json.loads(chunk)
                            logger.error("tencent tts:%s", rsp["Response"]["Error"]["Message"])
                            return
                        except (ValueError, KeyError, TypeError):
                            # Not an error payload: this is the first audio chunk.
                            end = time.perf_counter()
                            logger.info(f"tencent Time to first chunk: {end-start}s")
                    if chunk and self.state == State.RUNNING:
                        yield chunk
        except Exception:
            logger.exception('tencent')

    def stream_tts(self, audio_stream, msg):
        """Convert streamed 16-bit PCM bytes to float32 frames and emit them.

        Frames of ``self.chunk`` samples are passed to
        ``parent.put_audio_frame``. The first frame carries a ``'start'``
        event; after the stream ends a silent frame carries the ``'end'``
        event. Samples left over at the end that do not fill a whole frame
        are dropped.

        Args:
            audio_stream: Iterable of ``bytes`` chunks.
            msg: ``(text, textevent)`` pair attached to the events.
        """
        text, textevent = msg
        first = True
        last_stream = np.array([], dtype=np.float32)
        pending = b''  # trailing odd byte carried over to the next chunk
        for chunk in audio_stream:
            if chunk is None or len(chunk) == 0:
                continue
            # Network chunks may split a 2-byte sample; np.frombuffer would
            # raise on an odd-length buffer, so keep the odd byte for later.
            data = pending + bytes(chunk)
            usable = len(data) - (len(data) % 2)
            pending = data[usable:]
            if usable == 0:
                continue
            stream = np.frombuffer(data[:usable], dtype=np.int16).astype(np.float32) / 32767
            stream = np.concatenate((last_stream, stream))
            streamlen = stream.shape[0]
            idx = 0
            while streamlen >= self.chunk:
                eventpoint = None
                if first:
                    eventpoint = {'status': 'start', 'text': text, 'msgenvent': textevent}
                    first = False
                self.parent.put_audio_frame(stream[idx:idx + self.chunk], eventpoint)
                streamlen -= self.chunk
                idx += self.chunk
            last_stream = stream[idx:]  # keep the remainder for the next chunk
        eventpoint = {'status': 'end', 'text': text, 'msgenvent': textevent}
        self.parent.put_audio_frame(np.zeros(self.chunk, np.float32), eventpoint)
|
|
|
|
|
|
|
|
|
|
###########################################################################################
|
|
|
|
|
|
|
|
|
|
class XTTS(BaseTTS):
|
|
|
|
|
def __init__(self, opt, parent):
|
|
|
|
|
super().__init__(opt,parent)
|
|
|
|
|