add tencent tts

5 months ago · 12e6727b10
parent 8fa4cd4a3c
commit 12e6727b10
3 changed files with 138 additions and 3 deletions
--- a/README.md
+++ b/README.md
@ -11,6 +11,7 @@ Real time interactive streaming digital human， realize audio video synchronous
 - 2024.12.28 添加数字人模型Ultralight-Digital-Human。 感谢@lijihua2017
 - 2025.2.7 添加fish-speech tts
 - 2025.2.21 添加wav2lip256开源模型 感谢@不蠢不蠢
+- 2025.3.2 添加腾讯语音合成服务

 ## Features
 1. 支持多种数字人模型: ernerf、musetalk、wav2lip、Ultralight-Digital-Human
--- a/basereal.py
+++ b/basereal.py
@ -35,7 +35,7 @@ import soundfile as sf
 import av
 from fractions import Fraction

-from ttsreal import EdgeTTS,VoitsTTS,XTTS,CosyVoiceTTS,FishTTS
+from ttsreal import EdgeTTS,VoitsTTS,XTTS,CosyVoiceTTS,FishTTS,TencentTTS
 from logger import logger

 from tqdm import tqdm
@ -64,6 +64,8 @@ class BaseReal:
            self.tts = CosyVoiceTTS(opt,self)
        elif opt.tts == "fishtts":
            self.tts = FishTTS(opt,self)
+        elif opt.tts == "tencent":
+            self.tts = TencentTTS(opt,self)
        
        self.speaking = False

--- a/ttsreal.py
+++ b/ttsreal.py
@ -22,6 +22,13 @@ import resampy
 import asyncio
 import edge_tts

+import os
+import hmac
+import hashlib
+import base64
+import json
+import uuid
+
 from typing import Iterator

 import requests
@ -351,7 +358,7 @@ class CosyVoiceTTS(BaseTTS):
                
            first = True
        
-            for chunk in res.iter_content(chunk_size=8820): # 882 22.05K*20ms*2
+            for chunk in res.iter_content(chunk_size=9600): # 960 24K*20ms*2
                if first:
                    end = time.perf_counter()
                    logger.info(f"cosy_voice Time to first chunk: {end-start}s")
@ -367,7 +374,7 @@ class CosyVoiceTTS(BaseTTS):
        for chunk in audio_stream:
            if chunk is not None and len(chunk)>0:          
                stream = np.frombuffer(chunk, dtype=np.int16).astype(np.float32) / 32767
-                stream = resampy.resample(x=stream, sr_orig=22050, sr_new=self.sample_rate)
+                stream = resampy.resample(x=stream, sr_orig=24000, sr_new=self.sample_rate)
                #byte_stream=BytesIO(buffer)
                #stream = self.__create_bytes_stream(byte_stream)
                streamlen = stream.shape[0]
@ -384,6 +391,131 @@ class CosyVoiceTTS(BaseTTS):
        self.parent.put_audio_frame(np.zeros(self.chunk,np.float32),eventpoint) 

 ###########################################################################################
+# Pieces of the Tencent streaming TTS endpoint. TencentTTS joins
+# _PROTOCOL + _HOST + _PATH into the request URL, and _HOST + _PATH are also
+# embedded in the string that gets signed for the Authorization header.
+_PROTOCOL = "https://"
+_HOST = "tts.cloud.tencent.com"
+_PATH = "/stream"
+# Value sent as the "Action" request parameter.
+_ACTION = "TextToStreamAudio"
+
+class TencentTTS(BaseTTS):
+    """TTS backend that streams PCM audio from Tencent's ``TextToStreamAudio`` API.
+
+    Credentials are read from the ``TENCENT_APPID``, ``TENCENT_SECRET_ID`` and
+    ``TENCENT_SECRET_KEY`` environment variables. ``opt.REF_FILE`` is parsed as
+    the integer voice type sent with every request.
+    """
+
+    # (connect, read) timeouts in seconds, so a stalled service cannot block
+    # the calling thread forever.
+    _REQUEST_TIMEOUT = (5, 30)
+
+    def __init__(self, opt, parent):
+        super().__init__(opt,parent)
+        self.appid = os.getenv("TENCENT_APPID")
+        self.secret_key = os.getenv("TENCENT_SECRET_KEY")
+        self.secret_id = os.getenv("TENCENT_SECRET_ID")
+        self.voice_type = int(opt.REF_FILE)
+        self.codec = "pcm"
+        self.sample_rate = 16000
+        self.volume = 0
+        self.speed = 0
+
+    def __gen_signature(self, params):
+        """Return the base64-encoded HMAC-SHA1 signature for ``params``.
+
+        The signed string is ``"POST" + host + path + "?"`` followed by the
+        parameters as ``key=value`` pairs, sorted by key and joined with ``&``.
+        """
+        query = "&".join(key + "=" + str(params[key]) for key in sorted(params))
+        sign_str = "POST" + _HOST + _PATH + "?" + query
+        digest = hmac.new(self.secret_key.encode('utf-8'),
+                          sign_str.encode('utf-8'), hashlib.sha1).digest()
+        return base64.b64encode(digest).decode('utf-8')
+
+    def __gen_params(self, session_id, text):
+        """Build the request parameter dict for one synthesis call.
+
+        ``Timestamp`` is the current Unix time and ``Expired`` is set 24 hours
+        after it.
+        """
+        timestamp = int(time.time())
+        return {
+            'Action': _ACTION,
+            'AppId': int(self.appid),
+            'SecretId': self.secret_id,
+            'ModelType': 1,
+            'VoiceType': self.voice_type,
+            'Codec': self.codec,
+            'SampleRate': self.sample_rate,
+            'Speed': self.speed,
+            'Volume': self.volume,
+            'SessionId': session_id,
+            'Text': text,
+            'Timestamp': timestamp,
+            'Expired': timestamp + 24 * 60 * 60,
+        }
+
+    def txt_to_audio(self,msg):
+        """Synthesize ``msg`` (a ``(text, textevent)`` pair) and push the audio to the parent."""
+        text,textevent = msg
+        self.stream_tts(
+            self.tencent_voice(
+                text,
+                # The remaining arguments are accepted by tencent_voice for
+                # signature parity but are not used by it.
+                self.opt.REF_FILE,
+                self.opt.REF_TEXT,
+                "zh",
+                self.opt.TTS_SERVER,
+            ),
+            msg
+        )
+
+    def tencent_voice(self, text, reffile, reftext,language, server_url) -> Iterator[bytes]:
+        """Yield raw audio chunks returned by the service for ``text``.
+
+        ``reffile``, ``reftext``, ``language`` and ``server_url`` are unused.
+        Request failures and service-reported errors are logged and end the
+        stream instead of raising. Chunks are only yielded while
+        ``self.state`` is ``State.RUNNING``.
+        """
+        # Fail with a clear message instead of a TypeError/AttributeError from
+        # int(None) / None.encode() when the environment is not configured.
+        if not (self.appid and self.secret_id and self.secret_key):
+            logger.error("tencent tts: TENCENT_APPID, TENCENT_SECRET_ID and TENCENT_SECRET_KEY must be set")
+            return
+
+        start = time.perf_counter()
+        session_id = str(uuid.uuid1())
+        params = self.__gen_params(session_id, text)
+        signature = self.__gen_signature(params)
+        headers = {
+            "Content-Type": "application/json",
+            "Authorization": str(signature)
+        }
+        url = _PROTOCOL + _HOST + _PATH
+        try:
+            # The context manager releases the connection on every exit path,
+            # including the early return below and generator close.
+            with requests.post(url, headers=headers, data=json.dumps(params),
+                               stream=True, timeout=self._REQUEST_TIMEOUT) as res:
+                end = time.perf_counter()
+                logger.info(f"tencent Time to make POST: {end-start}s")
+
+                first = True
+                for chunk in res.iter_content(chunk_size=6400): # 640 16K*20ms*2
+                    if first:
+                        # An error reply is a JSON document; audio data fails
+                        # to parse (or lacks the error keys) and falls through.
+                        try:
+                            rsp = json.loads(chunk)
+                            logger.error("tencent tts:%s",rsp["Response"]["Error"]["Message"])
+                            return
+                        except (ValueError, KeyError, TypeError):
+                            end = time.perf_counter()
+                            logger.info(f"tencent Time to first chunk: {end-start}s")
+                            first = False
+                    if chunk and self.state==State.RUNNING:
+                        yield chunk
+        except Exception:
+            logger.exception('tencent')
+
+    def stream_tts(self,audio_stream,msg):
+        """Cut 16-bit PCM byte chunks into ``self.chunk``-sample frames for the parent.
+
+        The first frame carries a ``start`` event. Samples left over from one
+        chunk are prepended to the next. A final partial frame is zero-padded
+        and sent while still running, then a silent frame carrying the ``end``
+        event is always sent.
+        """
+        text,textevent = msg
+        first = True
+        last_stream = np.array([],dtype=np.float32)
+        pending = b''  # dangling byte when a chunk splits a 16-bit sample
+        for chunk in audio_stream:
+            if chunk is None or len(chunk) == 0:
+                continue
+            data = pending + chunk
+            usable = len(data) - (len(data) % 2)
+            pending = data[usable:]
+            if usable == 0:
+                continue
+            stream = np.frombuffer(data[:usable], dtype=np.int16).astype(np.float32) / 32767
+            stream = np.concatenate((last_stream,stream))
+            streamlen = stream.shape[0]
+            idx=0
+            while streamlen >= self.chunk:
+                eventpoint=None
+                if first:
+                    eventpoint={'status':'start','text':text,'msgenvent':textevent}
+                    first = False
+                self.parent.put_audio_frame(stream[idx:idx+self.chunk],eventpoint)
+                streamlen -= self.chunk
+                idx += self.chunk
+            last_stream = stream[idx:] # samples that did not fill a whole frame
+
+        # Flush the trailing partial frame so the end of the utterance is not cut off.
+        remaining = last_stream.shape[0]
+        if remaining > 0 and self.state==State.RUNNING:
+            tail = np.zeros(self.chunk,np.float32)
+            tail[:remaining] = last_stream
+            eventpoint=None
+            if first:
+                eventpoint={'status':'start','text':text,'msgenvent':textevent}
+                first = False
+            self.parent.put_audio_frame(tail,eventpoint)
+
+        eventpoint={'status':'end','text':text,'msgenvent':textevent}
+        self.parent.put_audio_frame(np.zeros(self.chunk,np.float32),eventpoint)
+
+###########################################################################################
+
 class XTTS(BaseTTS):
    def __init__(self, opt, parent):
        super().__init__(opt,parent)