From 69863337bc5b7e95722b92ea564699901410980d Mon Sep 17 00:00:00 2001
From: fanpt <320622572@qq.com>
Date: Fri, 29 Aug 2025 09:41:28 +0800
Subject: [PATCH] =?UTF-8?q?=E6=B7=BB=E5=8A=A0=E8=87=AA=E5=8A=A8=E5=A4=84?=
 =?UTF-8?q?=E7=90=86=E7=94=9F=E6=88=90=E5=90=8E=E5=A3=B0=E9=9F=B3=E9=80=BB?=
 =?UTF-8?q?=E8=BE=91=EF=BC=9A=EF=BC=88=E9=99=90=E5=88=B6=E5=A3=B0=E9=9F=B3?=
 =?UTF-8?q?=E6=9C=80=E9=AB=98=E5=B3=B0=E5=80=BC=EF=BC=8C=E5=87=8F=E5=B0=8F?=
 =?UTF-8?q?=E6=9D=82=E9=9F=B3=EF=BC=89?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
Review notes (text between the "---" line and the diff is ignored by git am):
* The non-streaming path now packs the processed audio. It used to compute
  `processed` and then pack the untouched `audio_data`, discarding the result.
* Processed audio is converted back to the input dtype before pack_audio, so
  pack_audio receives the sample format it received before this change.
  pack_audio is not part of this patch; confirm it depends on that format.
* `dyn_state` is a single module-level object, so `prev_gain_db` carries over
  from one request to the next. Confirm that this is intended.
* Whitespace, hunk headers and the diffstat were rebuilt by hand, and the blob
  id on the `index` line is stale. Regenerate with git format-patch before use.

 api_v2.py | 173 +++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 171 insertions(+), 2 deletions(-)

diff --git a/api_v2.py b/api_v2.py
index 451a48c..6af2b6e 100644
--- a/api_v2.py
+++ b/api_v2.py
@@ -461,6 +461,155 @@ def check_params(req: dict):
     return None
 
 
+# Decibel adjustment strategy
+
+def _to_float32_mono(x: np.ndarray) -> np.ndarray:
+    """Return `x` as float32; int16 samples are scaled by 1/32768."""
+    if x.dtype == np.int16:
+        return x.astype(np.float32) / 32768.0
+    return x.astype(np.float32)
+
+def _from_float32(yf: np.ndarray, dtype) -> np.ndarray:
+    """Convert float32 samples to `dtype`, inverting `_to_float32_mono`."""
+    if dtype == np.int16:
+        # Round and clip so values at or beyond full scale cannot wrap.
+        return np.clip(np.round(yf * 32768.0), -32768, 32767).astype(np.int16)
+    return yf.astype(dtype)
+
+def _peak_dbfs(xf: np.ndarray) -> float:
+    """Return the peak level of `xf` in dBFS (-inf if empty or silent)."""
+    if xf.size == 0:
+        return -float("inf")
+    peak = float(np.max(np.abs(xf)))
+    if peak <= 1e-12:
+        return -float("inf")
+    return 20.0 * np.log10(peak)
+
+def _rms_dbfs(xf: np.ndarray) -> float:
+    """Return the RMS level of `xf` in dBFS (-inf if empty or silent)."""
+    if xf.size == 0:
+        return -float("inf")
+    rms = float(np.sqrt(np.mean(xf * xf)))
+    if rms <= 1e-12:
+        return -float("inf")
+    return 20.0 * np.log10(rms)
+
+def _apply_gain_linear(xf: np.ndarray, gain_db: float) -> np.ndarray:
+    """Scale `xf` by `gain_db` decibels (converted to a linear factor)."""
+    gain = 10.0 ** (gain_db / 20.0)
+    return xf * gain
+
+def _limiter_peak(xf: np.ndarray, thresh_db: float = -1.0, soft: bool = True):
+    """
+    Simple peak limiter.
+    thresh_db is the threshold, -1 dBFS by default.
+    When soft is True a soft (tanh) limiter is used, which sounds smoother.
+    Returns (processed waveform, whether limiting was triggered).
+    """
+    if xf.size == 0:
+        return xf, False
+    thresh_lin = 10.0 ** (thresh_db / 20.0)
+    peak = float(np.max(np.abs(xf)))
+    if peak <= thresh_lin or peak <= 1e-12:
+        return xf, False
+    if soft:
+        k = 2.0
+        out = np.tanh(k * xf / peak) * thresh_lin
+        return out.astype(np.float32), True
+    scale = thresh_lin / peak
+    out = xf * scale
+    return out.astype(np.float32), True
+
+class DynamicGainState:
+    """Chunk-by-chunk gain control that carries its gain between calls."""
+
+    def __init__(
+        self,
+        target_peak_db: float = -1.0,
+        max_boost_db: float = 18.0,
+        max_cut_db: float = 24.0,
+        min_rms_gate_db: float = -45.0,
+        quiet_boost_cap_db: float = 6.0,
+        attack_fast: float = 0.25,
+        release_slow: float = 0.08,
+        limiter_thresh_db: float = -1.0,
+        limiter_soft: bool = True,
+    ):
+        self.target_peak_db = target_peak_db
+        self.max_boost_db = max_boost_db
+        self.max_cut_db = max_cut_db
+        self.min_rms_gate_db = min_rms_gate_db
+        self.quiet_boost_cap_db = quiet_boost_cap_db
+        self.attack_fast = attack_fast
+        self.release_slow = release_slow
+        self.limiter_thresh_db = limiter_thresh_db
+        self.limiter_soft = limiter_soft
+        # Gain applied to the previous chunk; smoothing starts from it.
+        self.prev_gain_db = 0.0
+
+    def compute_chunk(self, x: np.ndarray):
+        """
+        Apply smoothed gain and the peak limiter to one chunk.
+        Returns (float32 waveform, dict of measured levels and gains).
+        """
+        xf = _to_float32_mono(x)
+
+        peak_db = _peak_dbfs(xf)
+        rms_db = _rms_dbfs(xf)
+
+        if peak_db == -float("inf"):
+            ideal_gain_db = self.max_boost_db
+        else:
+            ideal_gain_db = self.target_peak_db - peak_db
+
+        # Below the RMS gate, cap the boost at quiet_boost_cap_db.
+        if rms_db != -float("inf") and rms_db < self.min_rms_gate_db:
+            ideal_gain_db = min(ideal_gain_db, self.quiet_boost_cap_db)
+
+        ideal_gain_db = max(-self.max_cut_db, min(self.max_boost_db, ideal_gain_db))
+
+        # Move toward the ideal gain by a fraction that depends on direction.
+        if ideal_gain_db > self.prev_gain_db:
+            alpha = self.attack_fast
+        else:
+            alpha = self.release_slow
+        gain_db = self.prev_gain_db + alpha * (ideal_gain_db - self.prev_gain_db)
+        self.prev_gain_db = gain_db
+
+        y = _apply_gain_linear(xf, gain_db)
+        y, limited = _limiter_peak(y, self.limiter_thresh_db, soft=self.limiter_soft)
+
+        post_peak_db = _peak_dbfs(y)
+        post_rms_db = _rms_dbfs(y)
+
+        info = {
+            "peak_db": peak_db,
+            "rms_db": rms_db,
+            "ideal_gain_db": ideal_gain_db,
+            "applied_gain_db": gain_db,
+            "limited": limited,
+            "post_peak_db": post_peak_db,
+            "post_rms_db": post_rms_db,
+        }
+        return y.astype(np.float32), info
+
+# Module-level instance used by every call to tts_handle.
+dyn_state = DynamicGainState(
+    target_peak_db=-1.0,
+    max_boost_db=18.0,
+    max_cut_db=24.0,
+    min_rms_gate_db=-45.0,
+    quiet_boost_cap_db=6.0,
+    attack_fast=0.25,
+    release_slow=0.08,
+    limiter_thresh_db=-1.0,
+    limiter_soft=True,
+)
+
+
+
+
+
 async def tts_handle(req: dict):
     """
     Text to speech handler.
@@ -514,11 +663,22 @@ async def tts_handle(req: dict):
             def streaming_generator(tts_generator: Generator, media_type: str):
                 if_frist_chunk = True
                 for sr, chunk in tts_generator:
+                    processed, info = dyn_state.compute_chunk(chunk)
+                    # Keep the sample format pack_audio received before this change.
+                    processed = _from_float32(processed, chunk.dtype)
+
+                    print(
+                        f"[响度] 原峰值 {info['peak_db']:.2f} dBFS | 原RMS {info['rms_db']:.2f} dBFS | "
+                        f"理想增益 {info['ideal_gain_db']:.2f} dB | 实际增益 {info['applied_gain_db']:.2f} dB | "
+                        f"限幅 {info['limited']} | 处理后峰值 {info['post_peak_db']:.2f} dBFS | 处理后RMS {info['post_rms_db']:.2f} dBFS"
+                    )
+
                     if if_frist_chunk and media_type == "wav":
                         yield wave_header_chunk(sample_rate=sr)
                         media_type = "raw"
                         if_frist_chunk = False
-                    yield pack_audio(BytesIO(), chunk, sr, media_type).getvalue()
+
+                    yield pack_audio(BytesIO(), processed, sr, media_type).getvalue()
 
             # _media_type = f"audio/{media_type}" if not (streaming_mode and media_type in ["wav", "raw"]) else f"audio/x-{media_type}"
             return StreamingResponse(
@@ -531,6 +691,15 @@ async def tts_handle(req: dict):
 
         else:
             sr, audio_data = next(tts_generator)
+            processed, info = dyn_state.compute_chunk(audio_data)
+            # Keep the sample format pack_audio received before this change.
+            processed = _from_float32(processed, audio_data.dtype)
+
+            print(
+                f"[响度] 原峰值 {info['peak_db']:.2f} dBFS | 原RMS {info['rms_db']:.2f} dBFS | "
+                f"理想增益 {info['ideal_gain_db']:.2f} dB | 实际增益 {info['applied_gain_db']:.2f} dB | "
+                f"限幅 {info['limited']} | 处理后峰值 {info['post_peak_db']:.2f} dBFS | 处理后RMS {info['post_rms_db']:.2f} dBFS"
+            )
-            audio_data = pack_audio(BytesIO(), audio_data, sr, media_type).getvalue()
+            audio_data = pack_audio(BytesIO(), processed, sr, media_type).getvalue()
             return Response(audio_data, media_type=f"audio/{media_type}")
     except Exception as e: