From a9eadd1b8a1454442282ff4b19981ac810e993de Mon Sep 17 00:00:00 2001 From: Eren Golge Date: Tue, 3 Apr 2018 03:24:57 -0700 Subject: [PATCH] pep8 check --- datasets/LJSpeech.py | 36 +++++---- debug_config.py | 48 ++++++------ layers/attention.py | 9 +-- layers/custom_layers.py | 6 +- layers/losses.py | 11 +-- layers/tacotron.py | 38 +++------- models/tacotron.py | 18 +---- module.py | 3 +- notebooks/utils.py | 26 ++++--- synthesis.py | 2 +- tests/generic_utils_text.py | 1 + tests/layers_tests.py | 17 +++-- tests/loader_tests.py | 23 +++--- train.py | 143 +++++++++++++++++++----------------- utils/audio.py | 20 +---- utils/data.py | 6 +- utils/generic_utils.py | 7 +- utils/text/__init__.py | 2 +- utils/text/cleaners.py | 2 +- utils/text/cmudict.py | 2 +- utils/text/numbers.py | 2 +- utils/text/symbols.py | 2 +- utils/visual.py | 2 +- 23 files changed, 198 insertions(+), 228 deletions(-) diff --git a/datasets/LJSpeech.py b/datasets/LJSpeech.py index a773c661..898b88bf 100644 --- a/datasets/LJSpeech.py +++ b/datasets/LJSpeech.py @@ -14,10 +14,10 @@ from TTS.utils.data import (prepare_data, pad_per_step, class LJSpeechDataset(Dataset): def __init__(self, csv_file, root_dir, outputs_per_step, sample_rate, - text_cleaner, num_mels, min_level_db, frame_shift_ms, - frame_length_ms, preemphasis, ref_level_db, num_freq, power, - min_seq_len=0): - + text_cleaner, num_mels, min_level_db, frame_shift_ms, + frame_length_ms, preemphasis, ref_level_db, num_freq, power, + min_seq_len=0): + with open(csv_file, "r") as f: self.frames = [line.split('|') for line in f] self.root_dir = root_dir @@ -41,11 +41,11 @@ class LJSpeechDataset(Dataset): def _sort_frames(self): r"""Sort sequences in ascending order""" lengths = np.array([len(ins[1]) for ins in self.frames]) - + print(" | > Max length sequence {}".format(np.max(lengths))) print(" | > Min length sequence {}".format(np.min(lengths))) print(" | > Avg length sequence {}".format(np.mean(lengths))) - + idxs = np.argsort(lengths) new_frames = [] ignored = [] @@ -55,9 +55,10 @@ class LJSpeechDataset(Dataset): ignored.append(idx) else: new_frames.append(self.frames[idx]) - print(" | > {} instances are ignored by min_seq_len ({})".format(len(ignored), self.min_seq_len)) + print(" | > {} instances are ignored by min_seq_len ({})".format( + len(ignored), self.min_seq_len)) self.frames = new_frames - + def __len__(self): return len(self.frames) @@ -65,7 +66,8 @@ class LJSpeechDataset(Dataset): wav_name = os.path.join(self.root_dir, self.frames[idx][0]) + '.wav' text = self.frames[idx][1] - text = np.asarray(text_to_sequence(text, [self.cleaners]), dtype=np.int32) + text = np.asarray(text_to_sequence( + text, [self.cleaners]), dtype=np.int32) wav = np.asarray(self.load_wav(wav_name)[0], dtype=np.float32) sample = {'text': text, 'wav': wav, 'item_idx': self.frames[idx][0]} return sample @@ -96,13 +98,15 @@ class LJSpeechDataset(Dataset): linear = [self.ap.spectrogram(w).astype('float32') for w in wav] mel = [self.ap.melspectrogram(w).astype('float32') for w in wav] - mel_lengths = [m.shape[1] + 1 for m in mel] # +1 for zero-frame - + mel_lengths = [m.shape[1] + 1 for m in mel] # +1 for zero-frame + # compute 'stop token' targets - stop_targets = [np.array([0.]*(mel_len-1)) for mel_len in mel_lengths] - + stop_targets = [np.array([0.]*(mel_len-1)) + for mel_len in mel_lengths] + # PAD stop targets - stop_targets = prepare_stop_target(stop_targets, self.outputs_per_step) + stop_targets = prepare_stop_target( + stop_targets, self.outputs_per_step) # PAD sequences with 
largest length of the batch text = prepare_data(text).astype(np.int32) @@ -112,7 +116,7 @@ class LJSpeechDataset(Dataset): linear = prepare_tensor(linear, self.outputs_per_step) mel = prepare_tensor(mel, self.outputs_per_step) assert mel.shape[2] == linear.shape[2] - timesteps = mel.shape[2] + timesteps = mel.shape[2] # B x T x D linear = linear.transpose(0, 2, 1) @@ -125,7 +129,7 @@ class LJSpeechDataset(Dataset): mel = torch.FloatTensor(mel) mel_lengths = torch.LongTensor(mel_lengths) stop_targets = torch.FloatTensor(stop_targets) - + return text, text_lenghts, linear, mel, mel_lengths, stop_targets, item_idxs[0] raise TypeError(("batch must contain tensors, numbers, dicts or lists;\ diff --git a/debug_config.py b/debug_config.py index 3484c4db..7a9a94ab 100644 --- a/debug_config.py +++ b/debug_config.py @@ -1,29 +1,29 @@ { - "num_mels": 80, - "num_freq": 1024, - "sample_rate": 20000, - "frame_length_ms": 50.0, - "frame_shift_ms": 12.5, - "preemphasis": 0.97, - "min_level_db": -100, - "ref_level_db": 20, - "hidden_size": 128, - "embedding_size": 256, - "text_cleaner": "english_cleaners", + "num_mels": 80, + "num_freq": 1024, + "sample_rate": 20000, + "frame_length_ms": 50.0, + "frame_shift_ms": 12.5, + "preemphasis": 0.97, + "min_level_db": -100, + "ref_level_db": 20, + "hidden_size": 128, + "embedding_size": 256, + "text_cleaner": "english_cleaners", - "epochs": 200, - "lr": 0.01, - "lr_patience": 2, - "lr_decay": 0.5, - "batch_size": 32, - "griffinf_lim_iters": 60, - "power": 1.5, - "r": 5, + "epochs": 200, + "lr": 0.01, + "lr_patience": 2, + "lr_decay": 0.5, + "batch_size": 32, + "griffinf_lim_iters": 60, + "power": 1.5, + "r": 5, - "num_loader_workers": 16, + "num_loader_workers": 16, - "save_step":1 , - "data_path": "/data/shared/KeithIto/LJSpeech-1.0", - "output_path": "result", - "log_dir": "/home/erogol/projects/TTS/logs/" + "save_step": 1, + "data_path": "/data/shared/KeithIto/LJSpeech-1.0", + "output_path": "result", + "log_dir": "/home/erogol/projects/TTS/logs/" } diff --git a/layers/attention.py b/layers/attention.py index 1626e949..3bab1ad6 100644 --- a/layers/attention.py +++ b/layers/attention.py @@ -25,7 +25,8 @@ class BahdanauAttention(nn.Module): processed_annots = self.annot_layer(annots) # (batch, max_time, 1) - alignment = self.v(nn.functional.tanh(processed_query + processed_annots)) + alignment = self.v(nn.functional.tanh( + processed_query + processed_annots)) # (batch, max_time) return alignment.squeeze(-1) @@ -57,11 +58,11 @@ class AttentionRNN(nn.Module): if annotations_lengths is not None and mask is None: mask = get_mask_from_lengths(annotations, annotations_lengths) - + # Concat input query and previous context context rnn_input = torch.cat((memory, context), -1) #rnn_input = rnn_input.unsqueeze(1) - + # Feed it to RNN # s_i = f(y_{i-1}, c_{i}, s_{i-1}) rnn_output = self.rnn_cell(rnn_input, rnn_state) @@ -85,5 +86,3 @@ class AttentionRNN(nn.Module): context = torch.bmm(alignment.unsqueeze(1), annotations) context = context.squeeze(1) return rnn_output, context, alignment - - diff --git a/layers/custom_layers.py b/layers/custom_layers.py index d659efb2..c7337e71 100644 --- a/layers/custom_layers.py +++ b/layers/custom_layers.py @@ -11,16 +11,16 @@ from torch import nn # in_features (int): size of the input vector # out_features (int or list): size of each output vector. aka number # of predicted frames. 
-# """ +# """ # def __init__(self, in_features, out_features): # super(StopProjection, self).__init__() # self.linear = nn.Linear(in_features, out_features) # self.dropout = nn.Dropout(0.5) # self.sigmoid = nn.Sigmoid() - + # def forward(self, inputs): # out = self.dropout(inputs) # out = self.linear(out) # out = self.sigmoid(out) -# return out \ No newline at end of file +# return out diff --git a/layers/losses.py b/layers/losses.py index 0430c762..3e8376a4 100644 --- a/layers/losses.py +++ b/layers/losses.py @@ -1,4 +1,4 @@ -import torch +import torch from torch.nn import functional from torch.autograd import Variable from torch import nn @@ -20,10 +20,10 @@ def _sequence_mask(sequence_length, max_len=None): class L1LossMasked(nn.Module): - + def __init__(self): super(L1LossMasked, self).__init__() - + def forward(self, input, target, length): """ Args: @@ -51,7 +51,8 @@ class L1LossMasked(nn.Module): # losses: (batch, max_len, dim) losses = losses_flat.view(*target.size()) # mask: (batch, max_len, 1) - mask = _sequence_mask(sequence_length=length, max_len=target.size(1)).unsqueeze(2) + mask = _sequence_mask(sequence_length=length, + max_len=target.size(1)).unsqueeze(2) losses = losses * mask.float() loss = losses.sum() / (length.float().sum() * float(target.shape[2])) - return loss \ No newline at end of file + return loss diff --git a/layers/tacotron.py b/layers/tacotron.py index 916ea677..e567dbf0 100644 --- a/layers/tacotron.py +++ b/layers/tacotron.py @@ -6,6 +6,7 @@ from torch import nn from .attention import AttentionRNN from .attention import get_mask_from_lengths + class Prenet(nn.Module): r""" Prenet as explained at https://arxiv.org/abs/1703.10135. It creates as many layers as given by 'out_features' @@ -14,7 +15,7 @@ class Prenet(nn.Module): in_features (int): size of the input vector out_features (int or list): size of each output sample. If it is a list, for each value, there is created a new layer. - """ + """ def __init__(self, in_features, out_features=[256, 128]): super(Prenet, self).__init__() @@ -60,7 +61,7 @@ class BatchNormConv1d(nn.Module): self.activation = activation def forward(self, x): - x = self.conv1d(x) + x = self.conv1d(x) if self.activation is not None: x = self.activation(x) return self.bn(x) @@ -116,7 +117,7 @@ class CBHG(nn.Module): self.max_pool1d = nn.MaxPool1d(kernel_size=2, stride=1, padding=1) out_features = [K * in_features] + projections[:-1] - activations = [self.relu] * (len(projections) - 1) + activations = [self.relu] * (len(projections) - 1) activations += [None] # setup conv1d projection layers @@ -179,7 +180,7 @@ class CBHG(nn.Module): # (B, T_in, in_features*2) # TODO: replace GRU with convolution as in Deep Voice 3 - self.gru.flatten_parameters() + self.gru.flatten_parameters() outputs, _ = self.gru(x) return outputs @@ -214,6 +215,7 @@ class Decoder(nn.Module): r (int): number of outputs per time step. eps (float): threshold for detecting the end of a sentence. """ + def __init__(self, in_features, memory_dim, r, eps=0.05, mode='train'): super(Decoder, self).__init__() self.mode = mode @@ -251,23 +253,18 @@ class Decoder(nn.Module): - memory: batch x #mels_pecs x mel_spec_dim """ B = inputs.size(0) - # Run greedy decoding if memory is None greedy = not self.training - if memory is not None: - # Grouping multiple frames if necessary if memory.size(-1) == self.memory_dim: memory = memory.view(B, memory.size(1) // self.r, -1) " !! 
Dimension mismatch {} vs {} * {}".format(memory.size(-1), - self.memory_dim, self.r) + self.memory_dim, self.r) T_decoder = memory.size(1) - # go frame - 0 frames tarting the sequence initial_memory = Variable( inputs.data.new(B, self.memory_dim * self.r).zero_()) - # Init decoder states attention_rnn_hidden = Variable( inputs.data.new(B, 256).zero_()) @@ -276,14 +273,11 @@ class Decoder(nn.Module): for _ in range(len(self.decoder_rnns))] current_context_vec = Variable( inputs.data.new(B, 256).zero_()) - # Time first (T_decoder, B, memory_dim) if memory is not None: memory = memory.transpose(0, 1) - outputs = [] alignments = [] - t = 0 memory_input = initial_memory while True: @@ -291,6 +285,7 @@ class Decoder(nn.Module): if greedy: memory_input = outputs[-1] else: + # TODO: try sampled teacher forcing # combine prev. model output and prev. real target # memory_input = torch.div(outputs[-1] + memory[t-1], 2.0) # add a random noise @@ -298,36 +293,26 @@ class Decoder(nn.Module): # memory_input.data.new(memory_input.size()).normal_(0.0, 0.5)) # memory_input = memory_input + noise memory_input = memory[t-1] - # Prenet processed_memory = self.prenet(memory_input) - # Attention RNN attention_rnn_hidden, current_context_vec, alignment = self.attention_rnn( processed_memory, current_context_vec, attention_rnn_hidden, inputs) - # Concat RNN output and attention context vector decoder_input = self.project_to_decoder_in( torch.cat((attention_rnn_hidden, current_context_vec), -1)) - # Pass through the decoder RNNs for idx in range(len(self.decoder_rnns)): decoder_rnn_hiddens[idx] = self.decoder_rnns[idx]( decoder_input, decoder_rnn_hiddens[idx]) # Residual connectinon decoder_input = decoder_rnn_hiddens[idx] + decoder_input - output = decoder_input - - # predict mel vectors from decoder vectors output = self.proj_to_mel(output) - outputs += [output] alignments += [alignment] - t += 1 - if (not greedy and self.training) or (greedy and memory is not None): if t >= T_decoder: break @@ -338,15 +323,12 @@ class Decoder(nn.Module): print(" !! Decoder stopped with 'max_decoder_steps'. 
\ Something is probably wrong.") break - assert greedy or len(outputs) == T_decoder - # Back to batch first alignments = torch.stack(alignments).transpose(0, 1) outputs = torch.stack(outputs).transpose(0, 1).contiguous() - return outputs, alignments -def is_end_of_frames(output, eps=0.2): #0.2 - return (output.data <= eps).all() \ No newline at end of file +def is_end_of_frames(output, eps=0.2): # 0.2 + return (output.data <= eps).all() diff --git a/models/tacotron.py b/models/tacotron.py index 05bb1292..f6a78e12 100644 --- a/models/tacotron.py +++ b/models/tacotron.py @@ -9,7 +9,6 @@ from TTS.layers.tacotron import Prenet, Encoder, Decoder, CBHG class Tacotron(nn.Module): def __init__(self, embedding_dim=256, linear_dim=1025, mel_dim=80, r=5, padding_idx=None): - super(Tacotron, self).__init__() self.r = r self.mel_dim = mel_dim @@ -17,34 +16,23 @@ class Tacotron(nn.Module): self.embedding = nn.Embedding(len(symbols), embedding_dim, padding_idx=padding_idx) print(" | > Embedding dim : {}".format(len(symbols))) - - # Trying smaller std self.embedding.weight.data.normal_(0, 0.3) self.encoder = Encoder(embedding_dim) self.decoder = Decoder(256, mel_dim, r) - self.postnet = CBHG(mel_dim, K=8, projections=[256, mel_dim]) self.last_linear = nn.Linear(mel_dim * 2, linear_dim) def forward(self, characters, mel_specs=None): - B = characters.size(0) - inputs = self.embedding(characters) - # (B, T', in_dim) + # batch x time x dim encoder_outputs = self.encoder(inputs) - - # (B, T', mel_dim*r) + # batch x time x dim*r mel_outputs, alignments = self.decoder( encoder_outputs, mel_specs) - - # Post net processing below - # Reshape - # (B, T, mel_dim) + # batch x time x dim mel_outputs = mel_outputs.view(B, -1, self.mel_dim) - linear_outputs = self.postnet(mel_outputs) linear_outputs = self.last_linear(linear_outputs) - return mel_outputs, linear_outputs, alignments diff --git a/module.py b/module.py index 1a19ae46..be29caeb 100644 --- a/module.py +++ b/module.py @@ -288,7 +288,8 @@ class AttentionDecoder(nn.Module): bf_out = gru2_input + gru2_hidden # Output - output = self.out(bf_out).view(-1, self.num_mels, self.outputs_per_step) + output = self.out(bf_out).view(-1, self.num_mels, + self.outputs_per_step) return output, d_t, gru1_hidden, gru2_hidden diff --git a/notebooks/utils.py b/notebooks/utils.py index 6f54fc75..5d19e204 100644 --- a/notebooks/utils.py +++ b/notebooks/utils.py @@ -7,20 +7,23 @@ from matplotlib import pylab as plt hop_length = 250 + def create_speech(m, s, CONFIG, use_cuda, ap): text_cleaner = [CONFIG.text_cleaner] seq = np.array(text_to_sequence(s, text_cleaner)) - + # mel = np.zeros([seq.shape[0], CONFIG.num_mels, 1], dtype=np.float32) - + if use_cuda: - chars_var = torch.autograd.Variable(torch.from_numpy(seq), volatile=True).unsqueeze(0).cuda() -# mel_var = torch.autograd.Variable(torch.from_numpy(mel).type(torch.cuda.FloatTensor), volatile=True).cuda() + chars_var = torch.autograd.Variable( + torch.from_numpy(seq), volatile=True).unsqueeze(0).cuda() +# mel_var = torch.autograd.Variable(torch.from_numpy(mel).type(torch.cuda.FloatTensor), volatile=True).cuda() else: - chars_var = torch.autograd.Variable(torch.from_numpy(seq), volatile=True).unsqueeze(0) + chars_var = torch.autograd.Variable( + torch.from_numpy(seq), volatile=True).unsqueeze(0) # mel_var = torch.autograd.Variable(torch.from_numpy(mel).type(torch.FloatTensor), volatile=True) - - mel_out, linear_out, alignments =m.forward(chars_var) + + mel_out, linear_out, alignments = m.forward(chars_var) linear_out = 
linear_out[0].data.cpu().numpy() alignment = alignments[0].cpu().data.numpy() spec = ap._denormalize(linear_out) @@ -33,19 +36,18 @@ def create_speech(m, s, CONFIG, use_cuda, ap): def visualize(alignment, spectrogram, CONFIG): label_fontsize = 16 - plt.figure(figsize=(16,16)) + plt.figure(figsize=(16, 16)) - plt.subplot(2,1,1) + plt.subplot(2, 1, 1) plt.imshow(alignment.T, aspect="auto", origin="lower", interpolation=None) plt.xlabel("Decoder timestamp", fontsize=label_fontsize) plt.ylabel("Encoder timestamp", fontsize=label_fontsize) plt.colorbar() - plt.subplot(2,1,2) - librosa.display.specshow(spectrogram.T, sr=CONFIG.sample_rate, + plt.subplot(2, 1, 2) + librosa.display.specshow(spectrogram.T, sr=CONFIG.sample_rate, hop_length=hop_length, x_axis="time", y_axis="linear") plt.xlabel("Time", fontsize=label_fontsize) plt.ylabel("Hz", fontsize=label_fontsize) plt.tight_layout() plt.colorbar() - diff --git a/synthesis.py b/synthesis.py index cfabb976..c1ff8290 100644 --- a/synthesis.py +++ b/synthesis.py @@ -1,4 +1,4 @@ -#-*- coding: utf-8 -*- +# -*- coding: utf-8 -*- from network import * from data import inv_spectrogram, find_endpoint, save_wav, spectrogram diff --git a/tests/generic_utils_text.py b/tests/generic_utils_text.py index 0461d263..b1eb2ddc 100644 --- a/tests/generic_utils_text.py +++ b/tests/generic_utils_text.py @@ -6,6 +6,7 @@ from TTS.layers.tacotron import Prenet, CBHG, Decoder, Encoder OUT_PATH = '/tmp/test.pth.tar' + class ModelSavingTests(unittest.TestCase): def save_checkpoint_test(self): diff --git a/tests/layers_tests.py b/tests/layers_tests.py index 74dab97d..3108caf6 100644 --- a/tests/layers_tests.py +++ b/tests/layers_tests.py @@ -20,7 +20,7 @@ class PrenetTests(unittest.TestCase): class CBHGTests(unittest.TestCase): def test_in_out(self): - layer = CBHG(128, K= 6, projections=[128, 128], num_highways=2) + layer = CBHG(128, K=6, projections=[128, 128], num_highways=2) dummy_input = T.autograd.Variable(T.rand(4, 8, 128)) print(layer) @@ -38,11 +38,11 @@ class DecoderTests(unittest.TestCase): dummy_memory = T.autograd.Variable(T.rand(4, 2, 80)) output, alignment = layer(dummy_input, dummy_memory) - + assert output.shape[0] == 4 assert output.shape[1] == 1, "size not {}".format(output.shape[1]) assert output.shape[2] == 80 * 2, "size not {}".format(output.shape[2]) - + class EncoderTests(unittest.TestCase): @@ -56,10 +56,10 @@ class EncoderTests(unittest.TestCase): assert output.shape[0] == 4 assert output.shape[1] == 8 assert output.shape[2] == 256 # 128 * 2 BiRNN - + class L1LossMaskedTests(unittest.TestCase): - + def test_in_out(self): layer = L1LossMasked() dummy_input = T.autograd.Variable(T.ones(4, 8, 128).float()) @@ -69,7 +69,7 @@ class L1LossMaskedTests(unittest.TestCase): assert output.shape[0] == 0 assert len(output.shape) == 1 assert output.data[0] == 0.0 - + dummy_input = T.autograd.Variable(T.ones(4, 8, 128).float()) dummy_target = T.autograd.Variable(T.zeros(4, 8, 128).float()) dummy_length = T.autograd.Variable((T.ones(4) * 8).long()) @@ -78,7 +78,8 @@ class L1LossMaskedTests(unittest.TestCase): dummy_input = T.autograd.Variable(T.ones(4, 8, 128).float()) dummy_target = T.autograd.Variable(T.zeros(4, 8, 128).float()) - dummy_length = T.autograd.Variable((T.arange(5,9)).long()) - mask = ((_sequence_mask(dummy_length).float() - 1.0) * 100.0).unsqueeze(2) + dummy_length = T.autograd.Variable((T.arange(5, 9)).long()) + mask = ((_sequence_mask(dummy_length).float() - 1.0) + * 100.0).unsqueeze(2) output = layer(dummy_input + mask, dummy_target, 
dummy_length) assert output.data[0] == 1.0, "1.0 vs {}".format(output.data[0]) diff --git a/tests/loader_tests.py b/tests/loader_tests.py index 769fbebe..678b243a 100644 --- a/tests/loader_tests.py +++ b/tests/loader_tests.py @@ -10,6 +10,7 @@ from TTS.datasets.LJSpeech import LJSpeechDataset file_path = os.path.dirname(os.path.realpath(__file__)) c = load_config(os.path.join(file_path, 'test_config.json')) + class TestDataset(unittest.TestCase): def __init__(self, *args, **kwargs): @@ -30,7 +31,7 @@ class TestDataset(unittest.TestCase): c.ref_level_db, c.num_freq, c.power - ) + ) dataloader = DataLoader(dataset, batch_size=2, shuffle=True, collate_fn=dataset.collate_fn, @@ -46,7 +47,7 @@ class TestDataset(unittest.TestCase): mel_lengths = data[4] stop_target = data[5] item_idx = data[6] - + neg_values = text_input[text_input < 0] check_count = len(neg_values) assert check_count == 0, \ @@ -70,7 +71,7 @@ class TestDataset(unittest.TestCase): c.ref_level_db, c.num_freq, c.power - ) + ) # Test for batch size 1 dataloader = DataLoader(dataset, batch_size=1, @@ -98,8 +99,8 @@ class TestDataset(unittest.TestCase): assert stop_target.sum() == 1 assert len(mel_lengths.shape) == 1 assert mel_lengths[0] == mel_input[0].shape[0] - - # Test for batch size 2 + + # Test for batch size 2 dataloader = DataLoader(dataset, batch_size=2, shuffle=False, collate_fn=dataset.collate_fn, drop_last=False, num_workers=c.num_loader_workers) @@ -115,11 +116,11 @@ class TestDataset(unittest.TestCase): stop_target = data[5] item_idx = data[6] - if mel_lengths[0] > mel_lengths[1]: + if mel_lengths[0] > mel_lengths[1]: idx = 0 else: idx = 1 - + # check the first item in the batch assert mel_input[idx, -1].sum() == 0 assert mel_input[idx, -2].sum() != 0, mel_input @@ -130,17 +131,13 @@ class TestDataset(unittest.TestCase): assert stop_target[idx].sum() == 1 assert len(mel_lengths.shape) == 1 assert mel_lengths[idx] == mel_input[idx].shape[0] - + # check the second itme in the batch assert mel_input[1-idx, -1].sum() == 0 assert linear_input[1-idx, -1].sum() == 0 assert stop_target[1-idx, -1] == 1 assert len(mel_lengths.shape) == 1 - + # check batch conditions assert (mel_input * stop_target.unsqueeze(2)).sum() == 0 assert (linear_input * stop_target.unsqueeze(2)).sum() == 0 - - - - diff --git a/train.py b/train.py index 87908717..27b00ca5 100644 --- a/train.py +++ b/train.py @@ -35,7 +35,7 @@ parser = argparse.ArgumentParser() parser.add_argument('--restore_path', type=str, help='Folder path to checkpoints', default=0) parser.add_argument('--config_path', type=str, - help='path to config file for training',) + help='path to config file for training',) args = parser.parse_args() # setup output paths and read configs @@ -69,7 +69,7 @@ def train(model, criterion, data_loader, optimizer, epoch): epoch_time = 0 avg_linear_loss = 0 avg_mel_loss = 0 - + print(" | > Epoch {}/{}".format(epoch, c.epochs)) progbar = Progbar(len(data_loader.dataset) / c.batch_size) n_priority_freq = int(3000 / (c.sample_rate * 0.5) * c.num_freq) @@ -82,8 +82,9 @@ def train(model, criterion, data_loader, optimizer, epoch): linear_input = data[2] mel_input = data[3] mel_lengths = data[4] - - current_step = num_iter + args.restore_step + epoch * len(data_loader) + 1 + + current_step = num_iter + args.restore_step + \ + epoch * len(data_loader) + 1 # setup lr current_lr = lr_decay(c.lr, current_step, c.warmup_steps) @@ -108,16 +109,16 @@ def train(model, criterion, data_loader, optimizer, epoch): # forward pass mel_output, linear_output, alignments =\ 
model.forward(text_input_var, mel_spec_var) - + # loss computation mel_loss = criterion(mel_output, mel_spec_var, mel_lengths_var) linear_loss = 0.5 * criterion(linear_output, linear_spec_var, mel_lengths_var) \ - + 0.5 * criterion(linear_output[:, :, :n_priority_freq], - linear_spec_var[: ,: ,:n_priority_freq], - mel_lengths_var) + + 0.5 * criterion(linear_output[:, :, :n_priority_freq], + linear_spec_var[:, :, :n_priority_freq], + mel_lengths_var) loss = mel_loss + linear_loss - # backpass and check the grad norm + # backpass and check the grad norm loss.backward() grad_norm, skip_flag = check_update(model, 0.5, 100) if skip_flag: @@ -129,9 +130,10 @@ def train(model, criterion, data_loader, optimizer, epoch): step_time = time.time() - start_time epoch_time += step_time - # update + # update progbar.update(num_iter+1, values=[('total_loss', loss.data[0]), - ('linear_loss', linear_loss.data[0]), + ('linear_loss', + linear_loss.data[0]), ('mel_loss', mel_loss.data[0]), ('grad_norm', grad_norm)]) @@ -167,7 +169,8 @@ def train(model, criterion, data_loader, optimizer, epoch): # Sample audio audio_signal = linear_output[0].data.cpu().numpy() data_loader.dataset.ap.griffin_lim_iters = 60 - audio_signal = data_loader.dataset.ap.inv_spectrogram(audio_signal.T) + audio_signal = data_loader.dataset.ap.inv_spectrogram( + audio_signal.T) try: tb.add_audio('SampleAudio', audio_signal, current_step, sample_rate=c.sample_rate) @@ -176,30 +179,30 @@ def train(model, criterion, data_loader, optimizer, epoch): # print(audio_signal.max()) # print(audio_signal.min()) pass - - + avg_linear_loss /= (num_iter + 1) avg_mel_loss /= (num_iter + 1) avg_total_loss = avg_mel_loss + avg_linear_loss - + # Plot Training Epoch Stats tb.add_scalar('TrainEpochLoss/TotalLoss', loss.data[0], current_step) - tb.add_scalar('TrainEpochLoss/LinearLoss', linear_loss.data[0], current_step) + tb.add_scalar('TrainEpochLoss/LinearLoss', + linear_loss.data[0], current_step) tb.add_scalar('TrainEpochLoss/MelLoss', mel_loss.data[0], current_step) tb.add_scalar('Time/EpochTime', epoch_time, epoch) epoch_time = 0 return avg_linear_loss, current_step - + def evaluate(model, criterion, data_loader, current_step): model = model.eval() epoch_time = 0 - + print(" | > Validation") n_priority_freq = int(3000 / (c.sample_rate * 0.5) * c.num_freq) progbar = Progbar(len(data_loader.dataset) / c.eval_batch_size) - + avg_linear_loss = 0 avg_mel_loss = 0 @@ -227,24 +230,26 @@ def evaluate(model, criterion, data_loader, current_step): linear_spec_var = linear_spec_var.cuda() # forward pass - mel_output, linear_output, alignments = model.forward(text_input_var, mel_spec_var) - + mel_output, linear_output, alignments = model.forward( + text_input_var, mel_spec_var) + # loss computation mel_loss = criterion(mel_output, mel_spec_var, mel_lengths_var) linear_loss = 0.5 * criterion(linear_output, linear_spec_var, mel_lengths_var) \ - + 0.5 * criterion(linear_output[:, :, :n_priority_freq], - linear_spec_var[: ,: ,:n_priority_freq], - mel_lengths_var) - loss = mel_loss + linear_loss + + 0.5 * criterion(linear_output[:, :, :n_priority_freq], + linear_spec_var[:, :, :n_priority_freq], + mel_lengths_var) + loss = mel_loss + linear_loss step_time = time.time() - start_time epoch_time += step_time - # update + # update progbar.update(num_iter+1, values=[('total_loss', loss.data[0]), - ('linear_loss', linear_loss.data[0]), + ('linear_loss', + linear_loss.data[0]), ('mel_loss', mel_loss.data[0])]) - + avg_linear_loss += linear_loss.data[0] avg_mel_loss += 
mel_loss.data[0] @@ -257,7 +262,7 @@ def evaluate(model, criterion, data_loader, current_step): const_spec = plot_spectrogram(const_spec, data_loader.dataset.ap) gt_spec = plot_spectrogram(gt_spec, data_loader.dataset.ap) align_img = plot_alignment(align_img) - + tb.add_image('ValVisual/Reconstruction', const_spec, current_step) tb.add_image('ValVisual/GroundTruth', gt_spec, current_step) tb.add_image('ValVisual/ValidationAlignment', align_img, current_step) @@ -274,61 +279,61 @@ def evaluate(model, criterion, data_loader, current_step): # print(audio_signal.max()) # print(audio_signal.min()) pass - + # compute average losses avg_linear_loss /= (num_iter + 1) avg_mel_loss /= (num_iter + 1) avg_total_loss = avg_mel_loss + avg_linear_loss - + # Plot Learning Stats tb.add_scalar('ValEpochLoss/TotalLoss', avg_total_loss, current_step) tb.add_scalar('ValEpochLoss/LinearLoss', avg_linear_loss, current_step) tb.add_scalar('ValEpochLoss/MelLoss', avg_mel_loss, current_step) return avg_linear_loss - - + + def main(args): # Setup the dataset train_dataset = LJSpeechDataset(os.path.join(c.data_path, 'metadata_train.csv'), - os.path.join(c.data_path, 'wavs'), - c.r, - c.sample_rate, - c.text_cleaner, - c.num_mels, - c.min_level_db, - c.frame_shift_ms, - c.frame_length_ms, - c.preemphasis, - c.ref_level_db, - c.num_freq, - c.power, - min_seq_len=c.min_seq_len - ) + os.path.join(c.data_path, 'wavs'), + c.r, + c.sample_rate, + c.text_cleaner, + c.num_mels, + c.min_level_db, + c.frame_shift_ms, + c.frame_length_ms, + c.preemphasis, + c.ref_level_db, + c.num_freq, + c.power, + min_seq_len=c.min_seq_len + ) train_loader = DataLoader(train_dataset, batch_size=c.batch_size, shuffle=False, collate_fn=train_dataset.collate_fn, drop_last=False, num_workers=c.num_loader_workers, pin_memory=True) - + val_dataset = LJSpeechDataset(os.path.join(c.data_path, 'metadata_val.csv'), - os.path.join(c.data_path, 'wavs'), - c.r, - c.sample_rate, - c.text_cleaner, - c.num_mels, - c.min_level_db, - c.frame_shift_ms, - c.frame_length_ms, - c.preemphasis, - c.ref_level_db, - c.num_freq, - c.power - ) + os.path.join(c.data_path, 'wavs'), + c.r, + c.sample_rate, + c.text_cleaner, + c.num_mels, + c.min_level_db, + c.frame_shift_ms, + c.frame_length_ms, + c.preemphasis, + c.ref_level_db, + c.num_freq, + c.power + ) val_loader = DataLoader(val_dataset, batch_size=c.eval_batch_size, shuffle=False, collate_fn=val_dataset.collate_fn, - drop_last=False, num_workers= 4, + drop_last=False, num_workers=4, pin_memory=True) model = Tacotron(c.embedding_size, @@ -337,11 +342,11 @@ def main(args): c.r) optimizer = optim.Adam(model.parameters(), lr=c.lr) - + if use_cuda: criterion = L1LossMasked().cuda() else: - criterion = L1LossMasked() + criterion = L1LossMasked() if args.restore_path: checkpoint = torch.load(args.restore_path) @@ -361,20 +366,22 @@ def main(args): num_params = count_parameters(model) print(" | > Model has {} parameters".format(num_params)) - + if not os.path.exists(CHECKPOINT_PATH): os.mkdir(CHECKPOINT_PATH) - + if 'best_loss' not in locals(): best_loss = float('inf') - + for epoch in range(0, c.epochs): - train_loss, current_step = train(model, criterion, train_loader, optimizer, epoch) + train_loss, current_step = train( + model, criterion, train_loader, optimizer, epoch) val_loss = evaluate(model, criterion, val_loader, current_step) best_loss = save_best_model(model, optimizer, val_loss, best_loss, OUT_PATH, current_step, epoch) + if __name__ == '__main__': signal.signal(signal.SIGINT, signal_handler) main(args) diff 
--git a/utils/audio.py b/utils/audio.py index 4099ecc6..4ec58612 100644 --- a/utils/audio.py +++ b/utils/audio.py @@ -10,8 +10,8 @@ _mel_basis = None class AudioProcessor(object): def __init__(self, sample_rate, num_mels, min_level_db, frame_shift_ms, - frame_length_ms, preemphasis, ref_level_db, num_freq, power, - griffin_lim_iters=None): + frame_length_ms, preemphasis, ref_level_db, num_freq, power, + griffin_lim_iters=None): self.sample_rate = sample_rate self.num_mels = num_mels self.min_level_db = min_level_db @@ -23,61 +23,49 @@ class AudioProcessor(object): self.power = power self.griffin_lim_iters = griffin_lim_iters - def save_wav(self, wav, path): wav *= 32767 / max(0.01, np.max(np.abs(wav))) librosa.output.write_wav(path, wav.astype(np.int16), self.sample_rate) - def _linear_to_mel(self, spectrogram): global _mel_basis if _mel_basis is None: _mel_basis = self._build_mel_basis() return np.dot(_mel_basis, spectrogram) - def _build_mel_basis(self, ): n_fft = (self.num_freq - 1) * 2 return librosa.filters.mel(self.sample_rate, n_fft, n_mels=self.num_mels) - def _normalize(self, S): return np.clip((S - self.min_level_db) / -self.min_level_db, 0, 1) - def _denormalize(self, S): return (np.clip(S, 0, 1) * -self.min_level_db) + self.min_level_db - def _stft_parameters(self, ): n_fft = (self.num_freq - 1) * 2 hop_length = int(self.frame_shift_ms / 1000 * self.sample_rate) win_length = int(self.frame_length_ms / 1000 * self.sample_rate) return n_fft, hop_length, win_length - def _amp_to_db(self, x): return 20 * np.log10(np.maximum(1e-5, x)) - def _db_to_amp(self, x): return np.power(10.0, x * 0.05) - def apply_preemphasis(self, x): return signal.lfilter([1, -self.preemphasis], [1], x) - def apply_inv_preemphasis(self, x): return signal.lfilter([1], [1, -self.preemphasis], x) - def spectrogram(self, y): D = self._stft(self.apply_preemphasis(y)) S = self._amp_to_db(np.abs(D)) - self.ref_level_db return self._normalize(S) - def inv_spectrogram(self, spectrogram): '''Converts spectrogram to waveform using librosa''' S = self._denormalize(spectrogram) @@ -85,7 +73,6 @@ class AudioProcessor(object): # Reconstruct phase return self.apply_inv_preemphasis(self._griffin_lim(S ** self.power)) - def _griffin_lim(self, S): '''librosa implementation of Griffin-Lim Based on https://github.com/librosa/librosa/issues/434 @@ -98,13 +85,11 @@ class AudioProcessor(object): y = self._istft(S_complex * angles) return y - def melspectrogram(self, y): D = self._stft(self.apply_preemphasis(y)) S = self._amp_to_db(self._linear_to_mel(np.abs(D))) - self.ref_level_db return self._normalize(S) - def _stft(self, y): n_fft, hop_length, win_length = self._stft_parameters() return librosa.stft(y=y, n_fft=n_fft, hop_length=hop_length, win_length=win_length) @@ -113,7 +98,6 @@ class AudioProcessor(object): _, hop_length, win_length = self._stft_parameters() return librosa.istft(y, hop_length=hop_length, win_length=win_length) - def find_endpoint(self, wav, threshold_db=-40, min_silence_sec=0.8): window_length = int(self.sample_rate * min_silence_sec) hop_length = int(window_length / 4) diff --git a/utils/data.py b/utils/data.py index 6c47d5eb..51d8acb1 100644 --- a/utils/data.py +++ b/utils/data.py @@ -17,11 +17,13 @@ def prepare_data(inputs): def _pad_tensor(x, length): _pad = 0 assert x.ndim == 2 - x = np.pad(x, [[0, 0], [0, length - x.shape[1]]], mode='constant', constant_values=_pad) + x = np.pad(x, [[0, 0], [0, length - x.shape[1]]], + mode='constant', constant_values=_pad) return x + def prepare_tensor(inputs, 
out_steps): - max_len = max((x.shape[1] for x in inputs)) + 1 # zero-frame + max_len = max((x.shape[1] for x in inputs)) + 1 # zero-frame remainder = max_len % out_steps pad_len = max_len + (out_steps - remainder) if remainder > 0 else max_len return np.stack([_pad_tensor(x, pad_len) for x in inputs]) diff --git a/utils/generic_utils.py b/utils/generic_utils.py index 4832ec44..9940f48b 100644 --- a/utils/generic_utils.py +++ b/utils/generic_utils.py @@ -19,7 +19,7 @@ class AttrDict(dict): def load_config(config_path): config = AttrDict() config.update(json.load(open(config_path, "r"))) - return config + return config def create_experiment_folder(root_path): @@ -56,7 +56,7 @@ def _trim_model_state_dict(state_dict): new_state_dict = OrderedDict() for k, v in state_dict.items(): - name = k[7:] # remove `module.` + name = k[7:] # remove `module.` new_state_dict[name] = v return new_state_dict @@ -90,7 +90,8 @@ def save_best_model(model, optimizer, model_loss, best_loss, out_path, best_loss = model_loss bestmodel_path = 'best_model.pth.tar' bestmodel_path = os.path.join(out_path, bestmodel_path) - print("\n | > Best model saving with loss {0:.2f} : {1:}".format(model_loss, bestmodel_path)) + print("\n | > Best model saving with loss {0:.2f} : {1:}".format( + model_loss, bestmodel_path)) torch.save(state, bestmodel_path) return best_loss diff --git a/utils/text/__init__.py b/utils/text/__init__.py index 78eefac4..9b812c27 100644 --- a/utils/text/__init__.py +++ b/utils/text/__init__.py @@ -1,4 +1,4 @@ -#-*- coding: utf-8 -*- +# -*- coding: utf-8 -*- import re from TTS.utils.text import cleaners diff --git a/utils/text/cleaners.py b/utils/text/cleaners.py index fe0a46a2..dc62bd75 100644 --- a/utils/text/cleaners.py +++ b/utils/text/cleaners.py @@ -1,4 +1,4 @@ -#-*- coding: utf-8 -*- +# -*- coding: utf-8 -*- ''' diff --git a/utils/text/cmudict.py b/utils/text/cmudict.py index 6673546b..59bd7a73 100644 --- a/utils/text/cmudict.py +++ b/utils/text/cmudict.py @@ -1,4 +1,4 @@ -#-*- coding: utf-8 -*- +# -*- coding: utf-8 -*- import re diff --git a/utils/text/numbers.py b/utils/text/numbers.py index 4ce2d389..93f676dc 100644 --- a/utils/text/numbers.py +++ b/utils/text/numbers.py @@ -1,4 +1,4 @@ -#-*- coding: utf-8 -*- +# -*- coding: utf-8 -*- import inflect import re diff --git a/utils/text/symbols.py b/utils/text/symbols.py index 1b4724d7..a1706b23 100644 --- a/utils/text/symbols.py +++ b/utils/text/symbols.py @@ -1,4 +1,4 @@ -#-*- coding: utf-8 -*- +# -*- coding: utf-8 -*- ''' diff --git a/utils/visual.py b/utils/visual.py index b0143fc9..0c24a251 100644 --- a/utils/visual.py +++ b/utils/visual.py @@ -5,7 +5,7 @@ import matplotlib.pyplot as plt def plot_alignment(alignment, info=None): - fig, ax = plt.subplots(figsize=(16,10)) + fig, ax = plt.subplots(figsize=(16, 10)) im = ax.imshow(alignment.T, aspect='auto', origin='lower', interpolation='none') fig.colorbar(im, ax=ax)
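
The commit message above only says "pep8 check" and does not name the tool that produced the reformatting. As a rough, hypothetical sketch (not part of the patch), changes of this kind could be reproduced and then verified with the autopep8 and pycodestyle packages, assuming both are installed; the file path below is just one entry taken from the diffstat at the top.

# Hypothetical helper, not part of the patch: re-run PEP8 autoformatting on one
# touched file and count any remaining violations. autopep8 and pycodestyle are
# assumed to be installed; the patch itself never names a formatter.
import autopep8
import pycodestyle

path = "datasets/LJSpeech.py"  # example path from the diffstat above

with open(path) as f:
    source = f.read()

# fix_code() returns the reformatted source; write it back explicitly.
fixed = autopep8.fix_code(source, options={"max_line_length": 79})
with open(path, "w") as f:
    f.write(fixed)

# pycodestyle reports how many PEP8 violations remain after reformatting.
report = pycodestyle.StyleGuide(max_line_length=79).check_files([path])
print("remaining PEP8 violations:", report.total_errors)

The same check can be run over every file listed in the diffstat. Lines that an autoformatter will not shorten on its own (long format strings, for example) still have to be wrapped by hand, which is consistent with the manually wrapped continuation lines visible in several hunks above.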