|
|
@ -579,8 +579,8 @@ class Text2SemanticDecoder(nn.Module):
|
|
|
|
|
|
|
|
|
|
|
|
####################### update next step ###################################
|
|
|
|
####################### update next step ###################################
|
|
|
|
y_emb = self.ar_audio_embedding(y[:, -1:])
|
|
|
|
y_emb = self.ar_audio_embedding(y[:, -1:])
|
|
|
|
xy_pos = y_emb * self.ar_audio_position.x_scale + self.ar_audio_position.alpha * self.ar_audio_position.pe[:, prompts.shape[1] + idx]
|
|
|
|
xy_pos = y_emb * self.ar_audio_position.x_scale + self.ar_audio_position.alpha * self.ar_audio_position.pe[:, y_len + idx].to(dtype=y_emb.dtype,device=y_emb.device)
|
|
|
|
|
|
|
|
|
|
|
|
if ref_free:
|
|
|
|
if ref_free:
|
|
|
|
return y[:, :-1], 0
|
|
|
|
return y[:, :-1], 0
|
|
|
|
return y[:, :-1], idx - 1
|
|
|
|
return y[:, :-1], idx - 1
|