From f4e53295b18d7b3cefdf1f147330b4148b555aea Mon Sep 17 00:00:00 2001 From: Edresson Casanova Date: Fri, 22 Apr 2022 12:07:37 -0300 Subject: [PATCH] Add asserts for encoder_sample_rate part --- TTS/tts/models/vits.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py index 33aecbf3..7807efc1 100644 --- a/TTS/tts/models/vits.py +++ b/TTS/tts/models/vits.py @@ -737,7 +737,7 @@ class Vits(BaseTTS): self.interpolate_factor = self.config.audio["sample_rate"] / self.args.encoder_sample_rate self.audio_resampler = torchaudio.transforms.Resample( orig_freq=self.config.audio["sample_rate"], new_freq=self.args.encoder_sample_rate - ) + ) # pylint: disable=W0201 def get_aux_input(self, aux_input: Dict): sid, g, lid = self._set_cond_input(aux_input) @@ -1393,6 +1393,8 @@ class Vits(BaseTTS): if self.args.encoder_sample_rate: # recompute spec with high sampling rate to the loss spec_mel = wav_to_spec(batch["waveform"], ac.fft_size, ac.hop_length, ac.win_length, center=False) + # remove extra stft frame + spec_mel = spec_mel[:, :, : int(batch["spec"].size(2) * self.interpolate_factor)] else: spec_mel = batch["spec"] @@ -1405,14 +1407,20 @@ class Vits(BaseTTS): fmax=ac.mel_fmax, ) - if not self.args.encoder_sample_rate: + if self.args.encoder_sample_rate: + assert batch["spec"].shape[2] == int( + batch["mel"].shape[2] / self.interpolate_factor + ), f"{batch['spec'].shape[2]}, {batch['mel'].shape[2]}" + else: assert batch["spec"].shape[2] == batch["mel"].shape[2], f"{batch['spec'].shape[2]}, {batch['mel'].shape[2]}" # compute spectrogram frame lengths batch["spec_lens"] = (batch["spec"].shape[2] * batch["waveform_rel_lens"]).int() batch["mel_lens"] = (batch["mel"].shape[2] * batch["waveform_rel_lens"]).int() - if not self.args.encoder_sample_rate: + if self.args.encoder_sample_rate: + assert (batch["spec_lens"] - (batch["mel_lens"] / self.interpolate_factor).int()).sum() == 0 + else: assert (batch["spec_lens"] - batch["mel_lens"]).sum() == 0 # zero the padding frames