diff --git a/datasets/LJSpeech.py b/datasets/LJSpeech.py
index d213bd9c..a773c661 100644
--- a/datasets/LJSpeech.py
+++ b/datasets/LJSpeech.py
@@ -125,6 +125,7 @@ class LJSpeechDataset(Dataset):
             mel = torch.FloatTensor(mel)
             mel_lengths = torch.LongTensor(mel_lengths)
             stop_targets = torch.FloatTensor(stop_targets)
+            return text, text_lenghts, linear, mel, mel_lengths, stop_targets, item_idxs[0]
 
         raise TypeError(("batch must contain tensors, numbers, dicts or lists;\
diff --git a/layers/tacotron.py b/layers/tacotron.py
index 8433d643..9ec8bffb 100644
--- a/layers/tacotron.py
+++ b/layers/tacotron.py
@@ -256,9 +256,12 @@ class Decoder(nn.Module):
         greedy = not self.training
 
         if memory is not None:
+            print(memory.shape)
+
             # Grouping multiple frames if necessary
             if memory.size(-1) == self.memory_dim:
                 memory = memory.view(B, memory.size(1) // self.r, -1)
+            print(memory.shape)
             assert memory.size(-1) == self.memory_dim * self.r,\
                 " !! Dimension mismatch {} vs {} * {}".format(memory.size(-1),
                                                               self.memory_dim, self.r)
diff --git a/train.py b/train.py
index 626ff105..77288f77 100644
--- a/train.py
+++ b/train.py
@@ -82,7 +82,7 @@ def train(model, criterion, data_loader, optimizer, epoch):
         linear_input = data[2]
         mel_input = data[3]
         mel_lengths = data[4]
-
+
         current_step = num_iter + args.restore_step + epoch * len(data_loader) + 1
 
         # setup lr
diff --git a/utils/data.py b/utils/data.py
index b0bc9588..4ff3a4c4 100644
--- a/utils/data.py
+++ b/utils/data.py
@@ -17,13 +17,13 @@ def prepare_data(inputs):
 def _pad_tensor(x, length):
     _pad = 0
     assert x.ndim == 2
-    return np.pad(x, [[0, 0], [0, length - x.shape[1]]], mode='constant', constant_values=_pad)
-
+    x = np.pad(x, [[0, 0], [0, length - x.shape[1]]], mode='constant', constant_values=_pad)
+    return x
 
 def prepare_tensor(inputs, out_steps):
     max_len = max((x.shape[1] for x in inputs)) + 1  # zero-frame
     remainder = max_len % out_steps
-    return np.stack([_pad_tensor(x, max_len + remainder) for x in inputs])
+    return np.stack([_pad_tensor(x, max_len + (out_steps - remainder)) for x in inputs])
 
 
 def _pad_stop_target(x, length):
@@ -35,7 +35,7 @@ def prepare_stop_target(inputs, out_steps):
     max_len = max((x.shape[0] for x in inputs)) + 1  # zero-frame
     remainder = max_len % out_steps
-    return np.stack([_pad_stop_target(x, max_len + remainder) for x in inputs])
+    return np.stack([_pad_stop_target(x, max_len + (out_steps - remainder)) for x in inputs])
 
 
 def pad_per_step(inputs, pad_len):
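
Note on the Decoder change being debugged: the reshape folds r consecutive mel frames into one decoder step, turning a (B, T, memory_dim) tensor into (B, T // r, memory_dim * r). A minimal shape check of that view, using made-up sizes (B, T, memory_dim and r here are illustrative, not values from the repo):

    import torch

    B, T, memory_dim, r = 2, 30, 80, 5
    memory = torch.randn(B, T, memory_dim)            # (batch, frames, mel bins)
    grouped = memory.view(B, memory.size(1) // r, -1)
    assert grouped.size(-1) == memory_dim * r         # the assert in Decoder holds
    assert grouped.size(1) == T // r                  # 6 grouped decoder steps

The view only divides evenly when T is an exact multiple of r, which is what the utils/data.py padding fix below guarantees.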
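
Note on the utils/data.py change: the old expression max_len + remainder does not generally round up to a multiple of out_steps (e.g. 101 + 101 % 7 == 104, and 104 % 7 != 0), so padded batches could fail the r-frame grouping above. A sketch of the patched rounding (the helper name pad_to_multiple is hypothetical, introduced only for illustration):

    import numpy as np

    def pad_to_multiple(max_len, out_steps):
        # Mirrors the patched expression exactly: it always adds
        # (out_steps - remainder), so a length that is already a
        # multiple of out_steps still gains one extra full step.
        remainder = max_len % out_steps
        return max_len + (out_steps - remainder)

    assert pad_to_multiple(101, 7) == 105 and 105 % 7 == 0

    # _pad_tensor then zero-pads each (n_features, T) array to that length:
    x = np.ones((80, 101))
    padded = np.pad(x, [[0, 0], [0, pad_to_multiple(101, 7) - x.shape[1]]],
                    mode='constant', constant_values=0)
    assert padded.shape == (80, 105)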