Add automatic post-processing of generated audio (cap the peak level, reduce noise)

main
fanpt 4 weeks ago
parent 33eeba6873
commit 69863337bc

@@ -461,6 +461,134 @@ def check_params(req: dict):
    return None


# dB adjustment strategy (loudness normalization)
def _to_float32_mono(x: np.ndarray) -> np.ndarray:
    if x.dtype == np.int16:
        return x.astype(np.float32) / 32768.0
    return x.astype(np.float32)


def _peak_dbfs(xf: np.ndarray) -> float:
    if xf.size == 0:
        return -float("inf")
    peak = float(np.max(np.abs(xf)))
    if peak <= 1e-12:
        return -float("inf")
    return 20.0 * np.log10(peak)


def _rms_dbfs(xf: np.ndarray) -> float:
    if xf.size == 0:
        return -float("inf")
    rms = float(np.sqrt(np.mean(xf * xf)))
    if rms <= 1e-12:
        return -float("inf")
    return 20.0 * np.log10(rms)
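As a quick sanity check of these measurement helpers (an illustrative sketch, not part of the commit; `_t` and `_tone` are throwaway names), a sine of amplitude 0.5 should read roughly -6 dBFS peak and -9 dBFS RMS:

# Illustration only, not part of this commit: expected readings for a test tone.
_t = np.linspace(0.0, 1.0, 32000, endpoint=False)
_tone = (0.5 * np.sin(2.0 * np.pi * 440.0 * _t)).astype(np.float32)
# _peak_dbfs(_tone) ≈ -6.0 dBFS (peak 0.5), _rms_dbfs(_tone) ≈ -9.0 dBFS (RMS 0.5 / sqrt(2))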
def _apply_gain_linear(xf: np.ndarray, gain_db: float) -> np.ndarray:
    gain = 10.0 ** (gain_db / 20.0)
    return xf * gain
def _limiter_peak(xf: np.ndarray, thresh_db: float = -1.0, soft: bool = True):
    """
    Simple peak limiter.

    thresh_db: threshold, default -1 dBFS.
    soft: if True, use soft limiting (tanh), which sounds smoother.
    Returns (processed waveform, whether limiting was triggered).
    """
    if xf.size == 0:
        return xf, False
    thresh_lin = 10.0 ** (thresh_db / 20.0)
    peak = float(np.max(np.abs(xf)))
    if peak <= thresh_lin or peak <= 1e-12:
        return xf, False
    if soft:
        k = 2.0
        out = np.tanh(k * xf / peak) * thresh_lin
        return out.astype(np.float32), True
    scale = thresh_lin / peak
    out = xf * scale
    return out.astype(np.float32), True
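A rough picture of the two limiter modes on a chunk that overshoots the default -1 dBFS threshold (illustration only, not part of the commit; `_hot` is a made-up test buffer): hard limiting rescales the whole chunk so the peak lands exactly on the threshold, while the tanh soft mode bends the waveform and lands slightly below it.

# Illustration only, not part of this commit.
# -1 dBFS corresponds to a linear threshold of 10 ** (-1 / 20) ≈ 0.891.
_hot = np.array([0.2, -0.8, 1.5, -1.2], dtype=np.float32)    # peak 1.5, above threshold
_hard, _ = _limiter_peak(_hot, thresh_db=-1.0, soft=False)   # scaled by 0.891 / 1.5; new peak ≈ 0.891
_soft, _ = _limiter_peak(_hot, thresh_db=-1.0, soft=True)    # tanh curve; new peak ≈ tanh(2) * 0.891 ≈ 0.86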
class DynamicGainState:
    """Chunk-wise automatic gain control with attack/release smoothing and a peak limiter."""

    def __init__(
        self,
        target_peak_db: float = -1.0,
        max_boost_db: float = 18.0,
        max_cut_db: float = 24.0,
        min_rms_gate_db: float = -45.0,
        quiet_boost_cap_db: float = 6.0,
        attack_fast: float = 0.25,
        release_slow: float = 0.08,
        limiter_thresh_db: float = -1.0,
        limiter_soft: bool = True,
    ):
        self.target_peak_db = target_peak_db
        self.max_boost_db = max_boost_db
        self.max_cut_db = max_cut_db
        self.min_rms_gate_db = min_rms_gate_db
        self.quiet_boost_cap_db = quiet_boost_cap_db
        self.attack_fast = attack_fast
        self.release_slow = release_slow
        self.limiter_thresh_db = limiter_thresh_db
        self.limiter_soft = limiter_soft
        self.prev_gain_db = 0.0  # smoothed gain carried across chunks

    def compute_chunk(self, x: np.ndarray):
        xf = _to_float32_mono(x)
        peak_db = _peak_dbfs(xf)
        rms_db = _rms_dbfs(xf)

        # Gain needed to move this chunk's peak to the target peak.
        if peak_db == -float("inf"):
            ideal_gain_db = self.max_boost_db
        else:
            ideal_gain_db = self.target_peak_db - peak_db

        # Near-silent chunks (below the RMS gate) only get a small boost, so background noise is not pumped up.
        if rms_db != -float("inf") and rms_db < self.min_rms_gate_db:
            ideal_gain_db = min(ideal_gain_db, self.quiet_boost_cap_db)

        ideal_gain_db = max(-self.max_cut_db, min(self.max_boost_db, ideal_gain_db))

        # Smooth toward the ideal gain: a faster coefficient when the gain needs to rise, a slower one when it falls.
        if ideal_gain_db > self.prev_gain_db:
            alpha = self.attack_fast
        else:
            alpha = self.release_slow
        gain_db = self.prev_gain_db + alpha * (ideal_gain_db - self.prev_gain_db)
        self.prev_gain_db = gain_db

        y = _apply_gain_linear(xf, gain_db)
        y, limited = _limiter_peak(y, self.limiter_thresh_db, soft=self.limiter_soft)

        post_peak_db = _peak_dbfs(y)
        post_rms_db = _rms_dbfs(y)
        info = {
            "peak_db": peak_db,
            "rms_db": rms_db,
            "ideal_gain_db": ideal_gain_db,
            "applied_gain_db": gain_db,
            "limited": limited,
            "post_peak_db": post_peak_db,
            "post_rms_db": post_rms_db,
        }
        return y.astype(np.float32), info
# Shared, module-level gain state reused across requests and chunks.
dyn_state = DynamicGainState(
    target_peak_db=-1.0,
    max_boost_db=18.0,
    max_cut_db=24.0,
    min_rms_gate_db=-45.0,
    quiet_boost_cap_db=6.0,
    attack_fast=0.25,
    release_slow=0.08,
    limiter_thresh_db=-1.0,
    limiter_soft=True,
)
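For reference, this is roughly how the state reacts to consecutive quiet chunks: the ideal gain is target peak minus measured peak, and the applied gain moves a fraction attack_fast (or release_slow) of the remaining gap on each chunk. A minimal sketch, not part of the commit; `_demo` and `_chunk` are made-up names.

# Illustration only, not part of this commit.
# A chunk peaking near -18 dBFS wants about +17 dB to reach the -1 dBFS target;
# with attack_fast = 0.25 the applied gain climbs 0 -> ~4.3 dB -> ~7.5 dB -> ...
_demo = DynamicGainState()
_chunk = (0.125 * 32767 * np.sin(np.linspace(0.0, 40.0 * np.pi, 8000))).astype(np.int16)
for _ in range(3):
    _out, _info = _demo.compute_chunk(_chunk)  # _info["applied_gain_db"] rises toward _info["ideal_gain_db"]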
async def tts_handle(req: dict):
    """
    Text to speech handler.
@@ -514,11 +642,20 @@ async def tts_handle(req: dict):
            def streaming_generator(tts_generator: Generator, media_type: str):
                if_frist_chunk = True
                for sr, chunk in tts_generator:
                    processed, info = dyn_state.compute_chunk(chunk)
                    print(
                        f"[loudness] orig peak {info['peak_db']:.2f} dBFS | orig RMS {info['rms_db']:.2f} dBFS | "
                        f"ideal gain {info['ideal_gain_db']:.2f} dB | applied gain {info['applied_gain_db']:.2f} dB | "
                        f"limited {info['limited']} | post peak {info['post_peak_db']:.2f} dBFS | post RMS {info['post_rms_db']:.2f} dBFS"
                    )
                    if if_frist_chunk and media_type == "wav":
                        yield wave_header_chunk(sample_rate=sr)
                        media_type = "raw"
                        if_frist_chunk = False
                    yield pack_audio(BytesIO(), processed, sr, media_type).getvalue()

            # _media_type = f"audio/{media_type}" if not (streaming_mode and media_type in ["wav", "raw"]) else f"audio/x-{media_type}"
            return StreamingResponse(
@@ -531,6 +668,13 @@ async def tts_handle(req: dict):
        else:
            sr, audio_data = next(tts_generator)

            processed, info = dyn_state.compute_chunk(audio_data)
            print(
                f"[loudness] orig peak {info['peak_db']:.2f} dBFS | orig RMS {info['rms_db']:.2f} dBFS | "
                f"ideal gain {info['ideal_gain_db']:.2f} dB | applied gain {info['applied_gain_db']:.2f} dB | "
                f"limited {info['limited']} | post peak {info['post_peak_db']:.2f} dBFS | post RMS {info['post_rms_db']:.2f} dBFS"
            )
            audio_data = pack_audio(BytesIO(), processed, sr, media_type).getvalue()
            return Response(audio_data, media_type=f"audio/{media_type}")
    except Exception as e:
