diff --git a/datasets/TTSDataset.py b/datasets/TTSDataset.py index 319c2598..5b7aa4e8 100644 --- a/datasets/TTSDataset.py +++ b/datasets/TTSDataset.py @@ -193,22 +193,22 @@ class MyDataset(Dataset): mel = [self.ap.melspectrogram(w).astype('float32') for w in wav] linear = [self.ap.spectrogram(w).astype('float32') for w in wav] - mel_lengths = [m.shape[1] + 1 for m in mel] # +1 for zero-frame + mel_lengths = [m.shape[1] for m in mel] # compute 'stop token' targets stop_targets = [ - np.array([0.] * (mel_len - 1)) for mel_len in mel_lengths + np.array([0.] * (mel_len - 1) + [1.]) for mel_len in mel_lengths ] # PAD stop targets stop_targets = prepare_stop_target(stop_targets, self.outputs_per_step) - # PAD sequences with largest length of the batch + # PAD sequences with longest instance in the batch text = prepare_data(text).astype(np.int32) wav = prepare_data(wav) - # PAD features with largest length + a zero frame + # PAD features with longest instance linear = prepare_tensor(linear, self.outputs_per_step) mel = prepare_tensor(mel, self.outputs_per_step) assert mel.shape[2] == linear.shape[2] diff --git a/utils/data.py b/utils/data.py index bbb4a31a..87343ec1 100644 --- a/utils/data.py +++ b/utils/data.py @@ -24,7 +24,7 @@ def _pad_tensor(x, length): def prepare_tensor(inputs, out_steps): - max_len = max((x.shape[1] for x in inputs)) + 1 # zero-frame + max_len = max((x.shape[1] for x in inputs)) remainder = max_len % out_steps pad_len = max_len + (out_steps - remainder) if remainder > 0 else max_len return np.stack([_pad_tensor(x, pad_len) for x in inputs]) @@ -38,7 +38,8 @@ def _pad_stop_target(x, length): def prepare_stop_target(inputs, out_steps): - max_len = max((x.shape[0] for x in inputs)) + 1 # zero-frame + """ Pad row vectors with 1. """ + max_len = max((x.shape[0] for x in inputs)) remainder = max_len % out_steps pad_len = max_len + (out_steps - remainder) if remainder > 0 else max_len return np.stack([_pad_stop_target(x, pad_len) for x in inputs])