|
|
@ -31,9 +31,11 @@ warnings.simplefilter(action="ignore", category=FutureWarning)
|
|
|
|
|
|
|
|
|
|
|
|
version = model_version = os.environ.get("version", "v2")
|
|
|
|
version = model_version = os.environ.get("version", "v2")
|
|
|
|
|
|
|
|
|
|
|
|
from config import name2sovits_path,name2gpt_path,change_choices,get_weights_names
|
|
|
|
from config import change_choices, get_weights_names, name2gpt_path, name2sovits_path
|
|
|
|
|
|
|
|
|
|
|
|
SoVITS_names, GPT_names = get_weights_names()
|
|
|
|
SoVITS_names, GPT_names = get_weights_names()
|
|
|
|
from config import pretrained_sovits_name
|
|
|
|
from config import pretrained_sovits_name
|
|
|
|
|
|
|
|
|
|
|
|
path_sovits_v3 = pretrained_sovits_name["v3"]
|
|
|
|
path_sovits_v3 = pretrained_sovits_name["v3"]
|
|
|
|
path_sovits_v4 = pretrained_sovits_name["v4"]
|
|
|
|
path_sovits_v4 = pretrained_sovits_name["v4"]
|
|
|
|
is_exist_s2gv3 = os.path.exists(path_sovits_v3)
|
|
|
|
is_exist_s2gv3 = os.path.exists(path_sovits_v3)
|
|
|
@ -108,6 +110,7 @@ from peft import LoraConfig, get_peft_model
|
|
|
|
from text import cleaned_text_to_sequence
|
|
|
|
from text import cleaned_text_to_sequence
|
|
|
|
from text.cleaner import clean_text
|
|
|
|
from text.cleaner import clean_text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from tools.assets import css, js, top_html
|
|
|
|
from tools.i18n.i18n import I18nAuto, scan_language_list
|
|
|
|
from tools.i18n.i18n import I18nAuto, scan_language_list
|
|
|
|
|
|
|
|
|
|
|
|
language = os.environ.get("language", "Auto")
|
|
|
|
language = os.environ.get("language", "Auto")
|
|
|
@ -208,8 +211,11 @@ else:
|
|
|
|
from process_ckpt import get_sovits_version_from_path_fast, load_sovits_new
|
|
|
|
from process_ckpt import get_sovits_version_from_path_fast, load_sovits_new
|
|
|
|
|
|
|
|
|
|
|
|
v3v4set = {"v3", "v4"}
|
|
|
|
v3v4set = {"v3", "v4"}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def change_sovits_weights(sovits_path, prompt_language=None, text_language=None):
|
|
|
|
def change_sovits_weights(sovits_path, prompt_language=None, text_language=None):
|
|
|
|
if "!"in sovits_path:sovits_path=name2sovits_path[sovits_path]
|
|
|
|
if "!" in sovits_path:
|
|
|
|
|
|
|
|
sovits_path = name2sovits_path[sovits_path]
|
|
|
|
global vq_model, hps, version, model_version, dict_language, if_lora_v3
|
|
|
|
global vq_model, hps, version, model_version, dict_language, if_lora_v3
|
|
|
|
version, model_version, if_lora_v3 = get_sovits_version_from_path_fast(sovits_path)
|
|
|
|
version, model_version, if_lora_v3 = get_sovits_version_from_path_fast(sovits_path)
|
|
|
|
print(sovits_path, version, model_version, if_lora_v3)
|
|
|
|
print(sovits_path, version, model_version, if_lora_v3)
|
|
|
@ -355,7 +361,8 @@ except:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def change_gpt_weights(gpt_path):
|
|
|
|
def change_gpt_weights(gpt_path):
|
|
|
|
if "!"in gpt_path:gpt_path=name2gpt_path[gpt_path]
|
|
|
|
if "!" in gpt_path:
|
|
|
|
|
|
|
|
gpt_path = name2gpt_path[gpt_path]
|
|
|
|
global hz, max_sec, t2s_model, config
|
|
|
|
global hz, max_sec, t2s_model, config
|
|
|
|
hz = 50
|
|
|
|
hz = 50
|
|
|
|
dict_s1 = torch.load(gpt_path, map_location="cpu", weights_only=False)
|
|
|
|
dict_s1 = torch.load(gpt_path, map_location="cpu", weights_only=False)
|
|
|
@ -383,6 +390,7 @@ import torch
|
|
|
|
|
|
|
|
|
|
|
|
now_dir = os.getcwd()
|
|
|
|
now_dir = os.getcwd()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def clean_hifigan_model():
|
|
|
|
def clean_hifigan_model():
|
|
|
|
global hifigan_model
|
|
|
|
global hifigan_model
|
|
|
|
if hifigan_model:
|
|
|
|
if hifigan_model:
|
|
|
@ -392,6 +400,8 @@ def clean_hifigan_model():
|
|
|
|
torch.cuda.empty_cache()
|
|
|
|
torch.cuda.empty_cache()
|
|
|
|
except:
|
|
|
|
except:
|
|
|
|
pass
|
|
|
|
pass
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def clean_bigvgan_model():
|
|
|
|
def clean_bigvgan_model():
|
|
|
|
global bigvgan_model
|
|
|
|
global bigvgan_model
|
|
|
|
if bigvgan_model:
|
|
|
|
if bigvgan_model:
|
|
|
@ -401,6 +411,8 @@ def clean_bigvgan_model():
|
|
|
|
torch.cuda.empty_cache()
|
|
|
|
torch.cuda.empty_cache()
|
|
|
|
except:
|
|
|
|
except:
|
|
|
|
pass
|
|
|
|
pass
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def clean_sv_cn_model():
|
|
|
|
def clean_sv_cn_model():
|
|
|
|
global sv_cn_model
|
|
|
|
global sv_cn_model
|
|
|
|
if sv_cn_model:
|
|
|
|
if sv_cn_model:
|
|
|
@ -411,6 +423,7 @@ def clean_sv_cn_model():
|
|
|
|
except:
|
|
|
|
except:
|
|
|
|
pass
|
|
|
|
pass
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def init_bigvgan():
|
|
|
|
def init_bigvgan():
|
|
|
|
global bigvgan_model, hifigan_model, sv_cn_model
|
|
|
|
global bigvgan_model, hifigan_model, sv_cn_model
|
|
|
|
from BigVGAN import bigvgan
|
|
|
|
from BigVGAN import bigvgan
|
|
|
@ -429,6 +442,7 @@ def init_bigvgan():
|
|
|
|
else:
|
|
|
|
else:
|
|
|
|
bigvgan_model = bigvgan_model.to(device)
|
|
|
|
bigvgan_model = bigvgan_model.to(device)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def init_hifigan():
|
|
|
|
def init_hifigan():
|
|
|
|
global hifigan_model, bigvgan_model, sv_cn_model
|
|
|
|
global hifigan_model, bigvgan_model, sv_cn_model
|
|
|
|
hifigan_model = Generator(
|
|
|
|
hifigan_model = Generator(
|
|
|
@ -445,7 +459,9 @@ def init_hifigan():
|
|
|
|
hifigan_model.eval()
|
|
|
|
hifigan_model.eval()
|
|
|
|
hifigan_model.remove_weight_norm()
|
|
|
|
hifigan_model.remove_weight_norm()
|
|
|
|
state_dict_g = torch.load(
|
|
|
|
state_dict_g = torch.load(
|
|
|
|
"%s/GPT_SoVITS/pretrained_models/gsv-v4-pretrained/vocoder.pth" % (now_dir,), map_location="cpu", weights_only=False
|
|
|
|
"%s/GPT_SoVITS/pretrained_models/gsv-v4-pretrained/vocoder.pth" % (now_dir,),
|
|
|
|
|
|
|
|
map_location="cpu",
|
|
|
|
|
|
|
|
weights_only=False,
|
|
|
|
)
|
|
|
|
)
|
|
|
|
print("loading vocoder", hifigan_model.load_state_dict(state_dict_g))
|
|
|
|
print("loading vocoder", hifigan_model.load_state_dict(state_dict_g))
|
|
|
|
clean_bigvgan_model()
|
|
|
|
clean_bigvgan_model()
|
|
|
@ -455,7 +471,10 @@ def init_hifigan():
|
|
|
|
else:
|
|
|
|
else:
|
|
|
|
hifigan_model = hifigan_model.to(device)
|
|
|
|
hifigan_model = hifigan_model.to(device)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from sv import SV
|
|
|
|
from sv import SV
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def init_sv_cn():
|
|
|
|
def init_sv_cn():
|
|
|
|
global hifigan_model, bigvgan_model, sv_cn_model
|
|
|
|
global hifigan_model, bigvgan_model, sv_cn_model
|
|
|
|
sv_cn_model = SV(device, is_half)
|
|
|
|
sv_cn_model = SV(device, is_half)
|
|
|
@ -472,15 +491,16 @@ if model_version in {"v2Pro","v2ProPlus"}:
|
|
|
|
init_sv_cn()
|
|
|
|
init_sv_cn()
|
|
|
|
|
|
|
|
|
|
|
|
resample_transform_dict = {}
|
|
|
|
resample_transform_dict = {}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def resample(audio_tensor, sr0, sr1, device):
|
|
|
|
def resample(audio_tensor, sr0, sr1, device):
|
|
|
|
global resample_transform_dict
|
|
|
|
global resample_transform_dict
|
|
|
|
key = "%s-%s-%s" % (sr0, sr1, str(device))
|
|
|
|
key = "%s-%s-%s" % (sr0, sr1, str(device))
|
|
|
|
if key not in resample_transform_dict:
|
|
|
|
if key not in resample_transform_dict:
|
|
|
|
resample_transform_dict[key] = torchaudio.transforms.Resample(
|
|
|
|
resample_transform_dict[key] = torchaudio.transforms.Resample(sr0, sr1).to(device)
|
|
|
|
sr0, sr1
|
|
|
|
|
|
|
|
).to(device)
|
|
|
|
|
|
|
|
return resample_transform_dict[key](audio_tensor)
|
|
|
|
return resample_transform_dict[key](audio_tensor)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_spepc(hps, filename, dtype, device, is_v2pro=False):
|
|
|
|
def get_spepc(hps, filename, dtype, device, is_v2pro=False):
|
|
|
|
# audio = load_audio(filename, int(hps.data.sampling_rate))
|
|
|
|
# audio = load_audio(filename, int(hps.data.sampling_rate))
|
|
|
|
|
|
|
|
|
|
|
@ -491,11 +511,13 @@ def get_spepc(hps, filename,dtype,device,is_v2pro=False):
|
|
|
|
audio, sr0 = torchaudio.load(filename)
|
|
|
|
audio, sr0 = torchaudio.load(filename)
|
|
|
|
if sr0 != sr1:
|
|
|
|
if sr0 != sr1:
|
|
|
|
audio = audio.to(device)
|
|
|
|
audio = audio.to(device)
|
|
|
|
if(audio.shape[0]==2):audio=audio.mean(0).unsqueeze(0)
|
|
|
|
if audio.shape[0] == 2:
|
|
|
|
|
|
|
|
audio = audio.mean(0).unsqueeze(0)
|
|
|
|
audio = resample(audio, sr0, sr1, device)
|
|
|
|
audio = resample(audio, sr0, sr1, device)
|
|
|
|
else:
|
|
|
|
else:
|
|
|
|
audio = audio.to(device)
|
|
|
|
audio = audio.to(device)
|
|
|
|
if(audio.shape[0]==2):audio=audio.mean(0).unsqueeze(0)
|
|
|
|
if audio.shape[0] == 2:
|
|
|
|
|
|
|
|
audio = audio.mean(0).unsqueeze(0)
|
|
|
|
|
|
|
|
|
|
|
|
maxx = audio.abs().max()
|
|
|
|
maxx = audio.abs().max()
|
|
|
|
if maxx > 1:
|
|
|
|
if maxx > 1:
|
|
|
@ -875,9 +897,13 @@ def get_tts_wav(
|
|
|
|
if is_v2pro:
|
|
|
|
if is_v2pro:
|
|
|
|
sv_emb = [sv_cn_model.compute_embedding3(audio_tensor)]
|
|
|
|
sv_emb = [sv_cn_model.compute_embedding3(audio_tensor)]
|
|
|
|
if is_v2pro:
|
|
|
|
if is_v2pro:
|
|
|
|
audio = vq_model.decode(pred_semantic, torch.LongTensor(phones2).to(device).unsqueeze(0), refers, speed=speed,sv_emb=sv_emb)[0][0]
|
|
|
|
audio = vq_model.decode(
|
|
|
|
|
|
|
|
pred_semantic, torch.LongTensor(phones2).to(device).unsqueeze(0), refers, speed=speed, sv_emb=sv_emb
|
|
|
|
|
|
|
|
)[0][0]
|
|
|
|
else:
|
|
|
|
else:
|
|
|
|
audio = vq_model.decode(pred_semantic, torch.LongTensor(phones2).to(device).unsqueeze(0), refers, speed=speed)[0][0]
|
|
|
|
audio = vq_model.decode(
|
|
|
|
|
|
|
|
pred_semantic, torch.LongTensor(phones2).to(device).unsqueeze(0), refers, speed=speed
|
|
|
|
|
|
|
|
)[0][0]
|
|
|
|
else:
|
|
|
|
else:
|
|
|
|
refer, audio_tensor = get_spepc(hps, ref_wav_path, dtype, device)
|
|
|
|
refer, audio_tensor = get_spepc(hps, ref_wav_path, dtype, device)
|
|
|
|
phoneme_ids0 = torch.LongTensor(phones1).to(device).unsqueeze(0)
|
|
|
|
phoneme_ids0 = torch.LongTensor(phones1).to(device).unsqueeze(0)
|
|
|
@ -1076,6 +1102,7 @@ def process_text(texts):
|
|
|
|
_text.append(text)
|
|
|
|
_text.append(text)
|
|
|
|
return _text
|
|
|
|
return _text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def html_center(text, label="p"):
|
|
|
|
def html_center(text, label="p"):
|
|
|
|
return f"""<div style="text-align: center; margin: 100; padding: 50;">
|
|
|
|
return f"""<div style="text-align: center; margin: 100; padding: 50;">
|
|
|
|
<{label} style="margin: 0; padding: 0;">{text}</{label}>
|
|
|
|
<{label} style="margin: 0; padding: 0;">{text}</{label}>
|
|
|
@ -1088,11 +1115,13 @@ def html_left(text, label="p"):
|
|
|
|
</div>"""
|
|
|
|
</div>"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
with gr.Blocks(title="GPT-SoVITS WebUI", analytics_enabled=False) as app:
|
|
|
|
with gr.Blocks(title="GPT-SoVITS WebUI", analytics_enabled=False, js=js, css=css) as app:
|
|
|
|
gr.Markdown(
|
|
|
|
gr.HTML(
|
|
|
|
value=i18n("本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.")
|
|
|
|
top_html.format(
|
|
|
|
+ "<br>"
|
|
|
|
i18n("本软件以MIT协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.")
|
|
|
|
+ i18n("如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE.")
|
|
|
|
+ i18n("如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE.")
|
|
|
|
|
|
|
|
),
|
|
|
|
|
|
|
|
elem_classes="markdown",
|
|
|
|
)
|
|
|
|
)
|
|
|
|
with gr.Group():
|
|
|
|
with gr.Group():
|
|
|
|
gr.Markdown(html_center(i18n("模型切换"), "h3"))
|
|
|
|
gr.Markdown(html_center(i18n("模型切换"), "h3"))
|
|
|
|