diff --git a/config.json b/config.json
index 0bd21b4e..63f6d372 100644
--- a/config.json
+++ b/config.json
@@ -12,7 +12,7 @@
     "text_cleaner": "english_cleaners",
     "epochs": 2000,
-    "lr": 0.0003,
+    "lr": 0.001,
     "warmup_steps": 4000,
     "batch_size": 32,
     "eval_batch_size":32,
diff --git a/datasets/LJSpeech.py b/datasets/LJSpeech.py
index 5b1fe13e..d213bd9c 100644
--- a/datasets/LJSpeech.py
+++ b/datasets/LJSpeech.py
@@ -7,7 +7,7 @@ from torch.utils.data import Dataset
 from TTS.utils.text import text_to_sequence
 from TTS.utils.audio import AudioProcessor
-from TTS.utils.data import (prepare_data, pad_data, pad_per_step,
+from TTS.utils.data import (prepare_data, pad_per_step,
                             prepare_tensor, prepare_stop_target)
@@ -96,10 +96,10 @@ class LJSpeechDataset(Dataset):
             linear = [self.ap.spectrogram(w).astype('float32') for w in wav]
             mel = [self.ap.melspectrogram(w).astype('float32') for w in wav]
-            mel_lengths = [m.shape[1] for m in mel]
+            mel_lengths = [m.shape[1] + 1 for m in mel]  # +1 for zero-frame

             # compute 'stop token' targets
-            stop_targets = [np.array([0.]*mel_len) for mel_len in mel_lengths]
+            stop_targets = [np.array([0.]*(mel_len-1)) for mel_len in mel_lengths]

             # PAD stop targets
             stop_targets = prepare_stop_target(stop_targets, self.outputs_per_step)
@@ -108,25 +108,12 @@ class LJSpeechDataset(Dataset):
             text = prepare_data(text).astype(np.int32)
             wav = prepare_data(wav)

-            # PAD features with largest length of the batch
-            linear = prepare_tensor(linear)
-            mel = prepare_tensor(mel)
+            # PAD features with largest length + a zero frame
+            linear = prepare_tensor(linear, self.outputs_per_step)
+            mel = prepare_tensor(mel, self.outputs_per_step)
             assert mel.shape[2] == linear.shape[2]
             timesteps = mel.shape[2]

-            # PAD with zeros that can be divided by outputs per step
-            if (timesteps + 1) % self.outputs_per_step != 0:
-                pad_len = self.outputs_per_step - \
-                    ((timesteps + 1) % self.outputs_per_step)
-                pad_len += 1
-            else:
-                pad_len = 1
-            linear = pad_per_step(linear, pad_len)
-            mel = pad_per_step(mel, pad_len)
-
-            # update mel lengths
-            mel_lengths = [l+pad_len for l in mel_lengths]
-
             # B x T x D
             linear = linear.transpose(0, 2, 1)
             mel = mel.transpose(0, 2, 1)
diff --git a/layers/attention.py b/layers/attention.py
index 1f83c169..1626e949 100644
--- a/layers/attention.py
+++ b/layers/attention.py
@@ -48,7 +48,7 @@ class AttentionRNN(nn.Module):
     def __init__(self, out_dim, annot_dim, memory_dim,
                  score_mask_value=-float("inf")):
         super(AttentionRNN, self).__init__()
-        self.rnn_cell = nn.GRUCell(annot_dim + memory_dim, out_dim)
+        self.rnn_cell = nn.GRUCell(out_dim + memory_dim, out_dim)
         self.alignment_model = BahdanauAttention(annot_dim, out_dim, out_dim)
         self.score_mask_value = score_mask_value

diff --git a/layers/tacotron.py b/layers/tacotron.py
index 983855d4..8433d643 100644
--- a/layers/tacotron.py
+++ b/layers/tacotron.py
@@ -304,8 +304,7 @@ class Decoder(nn.Module):

             # Attention RNN
             attention_rnn_hidden, current_context_vec, alignment = self.attention_rnn(
-                processed_memory, current_context_vec, attention_rnn_hidden,
-                inputs)
+                processed_memory, current_context_vec, attention_rnn_hidden, inputs)

             # Concat RNN output and attention context vector
             decoder_input = self.project_to_decoder_in(
diff --git a/tests/layers_tests.py b/tests/layers_tests.py
index 246fce8c..570b474c 100644
--- a/tests/layers_tests.py
+++ b/tests/layers_tests.py
@@ -33,17 +33,15 @@ class CBHGTests(unittest.TestCase):

 class DecoderTests(unittest.TestCase):
     def test_in_out(self):
-        layer = Decoder(in_features=128, memory_dim=32, r=5)
-        dummy_input = T.autograd.Variable(T.rand(4, 8, 128))
-        dummy_memory = T.autograd.Variable(T.rand(4, 120, 32))
+        layer = Decoder(in_features=256, memory_dim=80, r=2)
+        dummy_input = T.autograd.Variable(T.rand(4, 8, 256))
+        dummy_memory = T.autograd.Variable(T.rand(4, 2, 80))

-        print(layer)
         output, alignment = layer(dummy_input, dummy_memory)
-        print(output.shape)

         assert output.shape[0] == 4
-        assert output.shape[1] == 120 / 5
-        assert output.shape[2] == 32 * 5
+        assert output.shape[1] == 1, "size not {}".format(output.shape[1])
+        assert output.shape[2] == 80 * 2, "size not {}".format(output.shape[2])


 class EncoderTests(unittest.TestCase):
diff --git a/tests/loader_tests.py b/tests/loader_tests.py
index 3b3d017c..769fbebe 100644
--- a/tests/loader_tests.py
+++ b/tests/loader_tests.py
@@ -72,8 +72,9 @@ class TestDataset(unittest.TestCase):
                                  c.power
                                  )

+        # Test for batch size 1
         dataloader = DataLoader(dataset, batch_size=1,
-                                shuffle=True, collate_fn=dataset.collate_fn,
+                                shuffle=False, collate_fn=dataset.collate_fn,
                                 drop_last=True, num_workers=c.num_loader_workers)

         for i, data in enumerate(dataloader):
@@ -93,11 +94,53 @@ class TestDataset(unittest.TestCase):
             assert linear_input[0, -1].sum() == 0
             assert linear_input[0, -2].sum() != 0
             assert stop_target[0, -1] == 1
+            assert stop_target[0, -2] == 0
             assert stop_target.sum() == 1
             assert len(mel_lengths.shape) == 1
-            print(mel_lengths)
-            print(mel_input)
             assert mel_lengths[0] == mel_input[0].shape[0]
+
+        # Test for batch size 2
+        dataloader = DataLoader(dataset, batch_size=2,
+                                shuffle=False, collate_fn=dataset.collate_fn,
+                                drop_last=False, num_workers=c.num_loader_workers)
+
+        for i, data in enumerate(dataloader):
+            if i == self.max_loader_iter:
+                break
+            text_input = data[0]
+            text_lengths = data[1]
+            linear_input = data[2]
+            mel_input = data[3]
+            mel_lengths = data[4]
+            stop_target = data[5]
+            item_idx = data[6]
+
+            if mel_lengths[0] > mel_lengths[1]:
+                idx = 0
+            else:
+                idx = 1
+
+            # check the first item in the batch
+            assert mel_input[idx, -1].sum() == 0
+            assert mel_input[idx, -2].sum() != 0, mel_input
+            assert linear_input[idx, -1].sum() == 0
+            assert linear_input[idx, -2].sum() != 0
+            assert stop_target[idx, -1] == 1
+            assert stop_target[idx, -2] == 0
+            assert stop_target[idx].sum() == 1
+            assert len(mel_lengths.shape) == 1
+            assert mel_lengths[idx] == mel_input[idx].shape[0]
+
+            # check the second item in the batch
+            assert mel_input[1-idx, -1].sum() == 0
+            assert linear_input[1-idx, -1].sum() == 0
+            assert stop_target[1-idx, -1] == 1
+            assert len(mel_lengths.shape) == 1
+
+            # check batch conditions
+            assert (mel_input * stop_target.unsqueeze(2)).sum() == 0
+            assert (linear_input * stop_target.unsqueeze(2)).sum() == 0
+
diff --git a/train.py b/train.py
index 97876036..626ff105 100644
--- a/train.py
+++ b/train.py
@@ -98,16 +98,6 @@ def train(model, criterion, data_loader, optimizer, epoch):
         mel_lengths_var = Variable(mel_lengths)
         linear_spec_var = Variable(linear_input, volatile=True)

-        # sort sequence by length for curriculum learning
-        # TODO: might be unnecessary
-        sorted_lengths, indices = torch.sort(
-            text_lengths.view(-1), dim=0, descending=True)
-        sorted_lengths = sorted_lengths.long().numpy()
-        text_input_var = text_input_var[indices]
-        mel_spec_var = mel_spec_var[indices]
-        mel_lengths_var = mel_lengths_var[indices]
-        linear_spec_var = linear_spec_var[indices]
-
         # dispatch data to GPU
         if use_cuda:
             text_input_var = text_input_var.cuda()
diff --git a/utils/data.py b/utils/data.py
index 022fab1e..b0bc9588 100644
--- a/utils/data.py
+++ b/utils/data.py
@@ -1,7 +1,7 @@
 import numpy as np


-def pad_data(x, length):
+def _pad_data(x, length):
     _pad = 0
     assert x.ndim == 1
     return np.pad(x, (0, length - x.shape[0]),
@@ -11,30 +11,31 @@ def prepare_data(inputs):
     max_len = max((len(x) for x in inputs))
-    return np.stack([pad_data(x, max_len) for x in inputs])
+    return np.stack([_pad_data(x, max_len) for x in inputs])


-def pad_tensor(x, length):
+def _pad_tensor(x, length):
     _pad = 0
     assert x.ndim == 2
-    return np.pad(x, [[0, 0], [0, length- x.shape[1]]], mode='constant', constant_values=_pad)
+    return np.pad(x, [[0, 0], [0, length - x.shape[1]]], mode='constant', constant_values=_pad)


-def prepare_tensor(inputs):
-    max_len = max((x.shape[1] for x in inputs))
-    return np.stack([pad_tensor(x, max_len) for x in inputs])
+def prepare_tensor(inputs, out_steps):
+    max_len = max((x.shape[1] for x in inputs)) + 1  # zero-frame
+    remainder = max_len % out_steps
+    return np.stack([_pad_tensor(x, max_len + remainder) for x in inputs])


-def pad_stop_target(x, length):
+def _pad_stop_target(x, length):
     _pad = 1.
     assert x.ndim == 1
     return np.pad(x, (0, length - x.shape[0]),
                   mode='constant', constant_values=_pad)


 def prepare_stop_target(inputs, out_steps):
-    max_len = max((x.shape[0] for x in inputs))
+    max_len = max((x.shape[0] for x in inputs)) + 1  # zero-frame
     remainder = max_len % out_steps
-    return np.stack([pad_stop_target(x, max_len + out_steps - remainder) for x in inputs])
+    return np.stack([_pad_stop_target(x, max_len + remainder) for x in inputs])


 def pad_per_step(inputs, pad_len):
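
To make the new padding contract concrete, here is a small usage sketch (not part of the patch). It assumes `outputs_per_step` (r) is 2 as set in `config.json`, uses the same import path as `datasets/LJSpeech.py`, and the array sizes (80 mel bins, 5 and 3 frames) are made up for illustration.

```python
# Minimal sketch of the reworked helpers in utils/data.py (assumes r = 2).
import numpy as np
from TTS.utils.data import prepare_tensor, prepare_stop_target

# two mel spectrograms: 80 bins, 5 and 3 frames
mels = [np.random.rand(80, 5).astype('float32'),
        np.random.rand(80, 3).astype('float32')]

# longest item (5 frames) + 1 zero frame = 6, already divisible by r = 2
batch = prepare_tensor(mels, out_steps=2)
print(batch.shape)            # (2, 80, 6)
print(batch[0, :, -1].sum())  # 0.0 -> appended frame is all zeros

# stop targets: zeros for real frames, padded positions filled with 1.
stops = prepare_stop_target([np.zeros(m.shape[1]) for m in mels], out_steps=2)
print(stops.shape)   # (2, 6)
print(stops[0])      # [0. 0. 0. 0. 0. 1.]
```

This is also why `collate_fn` in `datasets/LJSpeech.py` can drop its manual `pad_per_step` bookkeeping: the extra zero frame and the divisibility-by-r padding now happen inside `prepare_tensor` and `prepare_stop_target`.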