diff --git a/datasets/LJSpeech.py b/datasets/LJSpeech.py
index 7b50e646..5b1fe13e 100644
--- a/datasets/LJSpeech.py
+++ b/datasets/LJSpeech.py
@@ -97,6 +97,12 @@ class LJSpeechDataset(Dataset):
             linear = [self.ap.spectrogram(w).astype('float32') for w in wav]
             mel = [self.ap.melspectrogram(w).astype('float32') for w in wav]
             mel_lengths = [m.shape[1] for m in mel]
+
+            # compute 'stop token' targets
+            stop_targets = [np.array([0.]*mel_len) for mel_len in mel_lengths]
+
+            # PAD stop targets
+            stop_targets = prepare_stop_target(stop_targets, self.outputs_per_step)
 
             # PAD sequences with largest length of the batch
             text = prepare_data(text).astype(np.int32)
@@ -106,7 +112,7 @@ class LJSpeechDataset(Dataset):
             linear = prepare_tensor(linear)
             mel = prepare_tensor(mel)
             assert mel.shape[2] == linear.shape[2]
-            timesteps = mel.shape[2]
+            timesteps = mel.shape[2]
 
             # PAD with zeros that can be divided by outputs per step
             if (timesteps + 1) % self.outputs_per_step != 0:
@@ -120,12 +126,6 @@ class LJSpeechDataset(Dataset):
 
             # update mel lengths
             mel_lengths = [l+pad_len for l in mel_lengths]
-
-            # compute 'stop token' targets
-            stop_targets = [np.array([0.]*mel_len) for mel_len in mel_lengths]
-
-            # PAD stop targets
-            stop_targets = prepare_stop_target(stop_targets, self.outputs_per_step)
 
             # B x T x D
             linear = linear.transpose(0, 2, 1)
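
The change moves the stop-token target computation ahead of the batch padding, so the targets are built from the raw (pre-padding) mel lengths and prepare_stop_target alone decides the padded length. prepare_stop_target is defined elsewhere in the repo; below is a minimal sketch of the padding behavior this collate step appears to rely on, assuming each target is a zero vector (one entry per mel frame) whose padded tail is filled with ones to mark "stop". The function body, the out_steps rounding, and the example lengths are illustrative assumptions, not the repo's actual code.

import numpy as np

def prepare_stop_target(inputs, out_steps):
    # Sketch only (assumed behavior, not the repo's implementation):
    # pad every per-frame stop vector to a shared length that is a
    # multiple of `out_steps`, filling the tail with 1. so the model
    # learns to emit "stop" once the real frames run out.
    max_len = max(x.shape[0] for x in inputs)
    remainder = max_len % out_steps
    pad_len = max_len + (out_steps - remainder) if remainder > 0 else max_len
    return np.stack([np.pad(x, (0, pad_len - x.shape[0]),
                            mode='constant', constant_values=1.)
                     for x in inputs])

# Hypothetical batch of two utterances with 5 and 8 mel frames:
mel_lengths = [5, 8]
stop_targets = [np.array([0.] * mel_len) for mel_len in mel_lengths]
padded = prepare_stop_target(stop_targets, out_steps=4)
print(padded.shape)  # (2, 8): both rows padded to a multiple of 4
print(padded[0])     # [0. 0. 0. 0. 0. 1. 1. 1.] -> ones flag frames past the end

Under this reading, computing the targets before the mel/linear padding keeps the zeros aligned with real frames only; any frames added later by the batch padding fall in the all-ones tail rather than being mislabeled as "keep going".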