import time

import torch
import numpy as np

from baseasr import BaseASR


class HubertASR(BaseASR):
    """Turns incoming 16 kHz speech frames into HuBERT audio feature chunks."""

    # audio_feat_length: number of feature frames to select before and after the current frame
    def __init__(self, opt, parent, audio_processor, audio_feat_length=[8, 8]):
        super().__init__(opt, parent)
        self.audio_processor = audio_processor
        # self.stride_left_size = 32
        # self.stride_right_size = 32
        self.audio_feat_length = audio_feat_length

    def run_step(self):
        start_time = time.time()

        # Pull the next batch of audio frames; forward each one to the output
        # queue and keep a copy locally for feature extraction.
        for _ in range(self.batch_size * 2):
            audio_frame, type_ = self.get_audio_frame()
            self.frames.append(audio_frame)
            self.output_queue.put((audio_frame, type_))

        # Wait until enough frames have accumulated to cover the left and right context.
        if len(self.frames) <= self.stride_left_size + self.stride_right_size:
            return

        inputs = np.concatenate(self.frames)  # [N * chunk]

        # Extract HuBERT features from the 16 kHz waveform and slice them into
        # per-batch chunks aligned with the (half-rate) video fps.
        mel = self.audio_processor.get_hubert_from_16k_speech(inputs)
        mel_chunks = self.audio_processor.feature2chunks(
            feature_array=mel,
            fps=self.fps / 2,
            batch_size=self.batch_size,
            audio_feat_length=self.audio_feat_length,
            start=self.stride_left_size / 2,
        )

        self.feat_queue.put(mel_chunks)
        # Keep only the overlap needed as context for the next step.
        self.frames = self.frames[-(self.stride_left_size + self.stride_right_size):]
        # print(f"Processing audio costs {(time.time() - start_time) * 1000}ms")