@@ -223,7 +223,7 @@ class TextEncoder(nn.Module):
         self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
 
-    def forward(self, y, y_lengths, text, text_lengths, ge, test=None):
+    def forward(self, y, y_lengths, text, text_lengths, ge, speed=1, test=None):
         y_mask = torch.unsqueeze(commons.sequence_mask(y_lengths, y.size(2)), 1).to(
             y.dtype
         )
 
@@ -240,9 +240,10 @@ class TextEncoder(nn.Module):
         text = self.text_embedding(text).transpose(1, 2)
         text = self.encoder_text(text * text_mask, text_mask)
         y = self.mrte(y, y_mask, text, text_mask, ge)
-
         y = self.encoder2(y * y_mask, y_mask)
-
+        if speed != 1:
+            y = F.interpolate(y, size=int(y.shape[-1] * speed) + 1, mode="linear")
+            y_mask = F.interpolate(y_mask, size=y.shape[-1], mode="nearest")
         stats = self.proj(y) * y_mask
         m, logs = torch.split(stats, self.out_channels, dim=1)
         return y, m, logs, y_mask
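
For reference, here is a minimal standalone sketch of the length-rescaling trick this hunk adds: the encoder output is stretched along the time axis with linear interpolation, and the frame mask is resized to match with nearest-neighbour interpolation so it stays binary. The tensor shapes and the example `speed` value below are illustrative assumptions, not part of the patch.

    import torch
    import torch.nn.functional as F

    # Hypothetical shapes mirroring the encoder output: (batch, channels, frames).
    y = torch.randn(1, 192, 100)    # encoded features
    y_mask = torch.ones(1, 1, 100)  # frame-validity mask

    speed = 1.25
    if speed != 1:
        # Per the hunk above, the target length is int(frames * speed) + 1;
        # the mask is resized with nearest-neighbour interpolation to match.
        y = F.interpolate(y, size=int(y.shape[-1] * speed) + 1, mode="linear")
        y_mask = F.interpolate(y_mask, size=y.shape[-1], mode="nearest")

    print(y.shape, y_mask.shape)  # torch.Size([1, 192, 126]) torch.Size([1, 1, 126])
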
@@ -966,7 +967,7 @@ class SynthesizerTrn(nn.Module):
         return o, y_mask, (z, z_p, m_p, logs_p)
 
     @torch.no_grad()
-    def decode(self, codes, text, refer, noise_scale=0.5):
+    def decode(self, codes, text, refer, noise_scale=0.5, speed=1):
         ge = None
         if refer is not None:
             refer_lengths = torch.LongTensor([refer.size(2)]).to(refer.device)
@@ -983,9 +984,8 @@ class SynthesizerTrn(nn.Module):
         quantized = F.interpolate(
             quantized, size=int(quantized.shape[-1] * 2), mode="nearest"
         )
-
         x, m_p, logs_p, y_mask = self.enc_p(
-            quantized, y_lengths, text, text_lengths, ge
+            quantized, y_lengths, text, text_lengths, ge, speed
         )
         z_p = m_p + torch.randn_like(m_p) * torch.exp(logs_p) * noise_scale
 
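
With the changes above, callers can pass a rate straight through `decode()`, which forwards it to `TextEncoder.forward()`. A hedged usage sketch follows; `model`, `codes`, `text`, and `refer` are placeholders for an already-constructed `SynthesizerTrn` and its usual decode inputs, and only the `speed=` keyword itself comes from this patch.

    # `model`, `codes`, `text`, `refer` stand in for an existing
    # SynthesizerTrn instance and the inputs decode() already expects;
    # the return value is whatever decode() already returns, unchanged.
    audio = model.decode(codes, text, refer, noise_scale=0.5, speed=1.2)
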