diff --git a/app.py b/app.py
index 65ddb8a..5dc4665 100644
--- a/app.py
+++ b/app.py
@@ -157,6 +157,38 @@ async def human(request):
         ),
     )
 
+async def humanaudio(request):
+    """Feed an uploaded audio file to the session's real-time pipeline.
+
+    Expects a form POST with a ``file`` upload field and an optional
+    integer ``sessionid`` field (defaults to 0).
+
+    Responds with JSON: ``{"code": 0, "msg": "ok"}`` on success, or
+    ``{"code": -1, "msg": "err", "data": <detail>}`` when an Exception is raised.
+    """
+    try:
+        form = await request.post()
+        sessionid = int(form.get('sessionid', 0))
+        filebytes = form["file"].file.read()
+        nerfreals[sessionid].put_audio_file(filebytes)
+    except Exception as e:
+        # e.args may be empty or hold a non-string (a KeyError carries the
+        # missing key as-is), so convert explicitly instead of concatenating.
+        detail = str(e.args[0]) if e.args else str(e)
+        return web.Response(
+            content_type="application/json",
+            text=json.dumps(
+                {"code": -1, "msg": "err", "data": detail}
+            ),
+        )
+
+    return web.Response(
+        content_type="application/json",
+        text=json.dumps(
+            {"code": 0, "msg": "ok"}
+        ),
+    )
+
 async def set_audiotype(request):
     params = await request.json()
 
@@ -455,6 +487,7 @@ if __name__ == '__main__':
     appasync.on_shutdown.append(on_shutdown)
     appasync.router.add_post("/offer", offer)
     appasync.router.add_post("/human", human)
+    appasync.router.add_post("/humanaudio", humanaudio)
     appasync.router.add_post("/set_audiotype", set_audiotype)
     appasync.router.add_post("/record", record)
     appasync.router.add_post("/is_speaking", is_speaking)
diff --git a/basereal.py b/basereal.py
index e21e3a7..8836531 100644
--- a/basereal.py
+++ b/basereal.py
@@ -8,6 +8,7 @@ import cv2
 import glob
 import pickle
 import copy
+import resampy
 
 import queue
 from queue import Queue
@@ -64,6 +65,46 @@ class BaseReal:
     def put_audio_frame(self,audio_chunk): #16khz 20ms pcm
         self.asr.put_audio_frame(audio_chunk)
 
+    def put_audio_file(self,filebyte):
+        """Decode an in-memory audio file and feed it to the ASR chunk by chunk.
+
+        ``filebyte`` holds the raw bytes of an audio file. The decoded samples
+        are passed to ``put_audio_frame`` in slices of ``self.chunk`` samples;
+        a trailing remainder shorter than one chunk is not sent.
+        """
+        # NOTE(review): this patch adds no import for BytesIO or sf -- confirm
+        # basereal.py already has both in scope.
+        input_stream = BytesIO(filebyte)
+        stream = self.__create_bytes_stream(input_stream)
+        streamlen = stream.shape[0]
+        idx=0
+        while streamlen >= self.chunk:  #and self.state==State.RUNNING
+            self.put_audio_frame(stream[idx:idx+self.chunk])
+            streamlen -= self.chunk
+            idx += self.chunk
+
+    def __create_bytes_stream(self,byte_stream):
+        """Decode a file-like audio object into mono float32 at ``self.sample_rate``.
+
+        Multi-channel audio keeps only its first channel; audio at a different
+        sample rate is resampled with resampy. Returns the 1-D sample array.
+        """
+        #byte_stream=BytesIO(buffer)
+        stream, sample_rate = sf.read(byte_stream) # [T*sample_rate,] float64
+        print(f'[INFO]put audio stream {sample_rate}: {stream.shape}')
+        stream = stream.astype(np.float32)
+
+        if stream.ndim > 1:
+            print(f'[WARN] audio has {stream.shape[1]} channels, only use the first.')
+            stream = stream[:, 0]
+
+        # Skip resampling for empty input: there is nothing to convert.
+        if sample_rate != self.sample_rate and stream.shape[0]>0:
+            print(f'[WARN] audio sample rate is {sample_rate}, resampling into {self.sample_rate}.')
+            stream = resampy.resample(x=stream, sr_orig=sample_rate, sr_new=self.sample_rate)
+
+        return stream
+
     def pause_talk(self):
         self.tts.pause_talk()
         self.asr.pause_talk()