diff --git a/.compute b/.compute
index f86e0c1b..2fe1e5ea 100644
--- a/.compute
+++ b/.compute
@@ -1,3 +1,3 @@
 #!/bin/bash
 source ../tmp/venv/bin/activate
-python train.py --config_path config.json
+python train.py --config_path config.json --debug true
diff --git a/config.json b/config.json
index 13f00883..b7cc74bd 100644
--- a/config.json
+++ b/config.json
@@ -1,11 +1,13 @@
 {
-    "model_name": "best-model",
+    "model_name": "audio-update-l2-loss",
     "num_mels": 80,
     "num_freq": 1025,
     "sample_rate": 20000,
     "frame_length_ms": 50,
     "frame_shift_ms": 12.5,
     "preemphasis": 0.97,
+    "min_mel_freq": 125,
+    "max_mel_freq": 7600,
     "min_level_db": -100,
     "ref_level_db": 20,
     "embedding_size": 256,
@@ -25,7 +27,7 @@
 
     "checkpoint": true,
     "save_step": 376,
-    "print_step": 50,
+    "print_step": 10,
     "data_path": "/snakepit/shared/data/keithito/LJSpeech-1.1/",
     "min_seq_len": 0,
     "output_path": "experiments/"
diff --git a/layers/attention.py b/layers/attention.py
index 19d8924e..5c6c3f02 100644
--- a/layers/attention.py
+++ b/layers/attention.py
@@ -38,11 +38,11 @@ class LocationSensitiveAttention(nn.Module):
         super(LocationSensitiveAttention, self).__init__()
         self.kernel_size = kernel_size
         self.filters = filters
-        padding = int((kernel - 1) / 2)
-        self.loc_conv = nn.Conv1d(2, filters,
+        padding = int((kernel_size - 1) / 2)
+        self.loc_conv = nn.Conv1d(1, filters,
                                   kernel_size=kernel_size, stride=1,
                                   padding=padding, bias=False)
-        self.loc_linear = nn.Linear(loc_dim, hidden_dim)
+        self.loc_linear = nn.Linear(filters, hidden_dim)
         self.query_layer = nn.Linear(query_dim, hidden_dim, bias=True)
         self.annot_layer = nn.Linear(annot_dim, hidden_dim, bias=True)
         self.v = nn.Linear(hidden_dim, 1, bias=False)
@@ -79,7 +79,7 @@ class AttentionRNNCell(nn.Module):
             memory_dim (int): memory vector (decoder autogression) feature dimension.
             align_model (str): 'b' for Bahdanau, 'ls' Location Sensitive alignment.
         """
-        super(AttentionRNN, self).__init__()
+        super(AttentionRNNCell, self).__init__()
         self.align_model = align_model
         self.rnn_cell = nn.GRUCell(out_dim + memory_dim, out_dim)
         # pick bahdanau or location sensitive attention
diff --git a/layers/tacotron.py b/layers/tacotron.py
index 7f856b33..01fed238 100644
--- a/layers/tacotron.py
+++ b/layers/tacotron.py
@@ -275,7 +275,7 @@ class Decoder(nn.Module):
             # dim=1)
             attention_rnn_hidden, current_context_vec, attention = self.attention_rnn(
                 processed_memory, current_context_vec, attention_rnn_hidden,
-                inputs, attention, input_lens)
+                inputs, attention.unsqueeze(1), input_lens)
             # attention_cum += attention
             # Concat RNN output and attention context vector
             decoder_input = self.project_to_decoder_in(
diff --git a/train.py b/train.py
index ec339279..2c380f57 100644
--- a/train.py
+++ b/train.py
@@ -25,7 +25,7 @@ from utils.generic_utils import (Progbar, remove_experiment_folder,
 from utils.visual import plot_alignment, plot_spectrogram
 from datasets.LJSpeech import LJSpeechDataset
 from models.tacotron import Tacotron
-from layers.losses import L2LossMasked
+from layers.losses import L1LossMasked
 
 torch.manual_seed(1)
 use_cuda = torch.cuda.is_available()
@@ -338,6 +338,7 @@ def evaluate(model, criterion, criterion_st, data_loader, current_step):
 
 
 def main(args):
+    # Setup the dataset
     # Setup the dataset
     train_dataset = LJSpeechDataset(os.path.join(c.data_path, 'metadata_train.csv'),
                                     os.path.join(c.data_path, 'wavs'),
diff --git a/utils/generic_utils.py b/utils/generic_utils.py
index 9490581a..a5393617 100644
--- a/utils/generic_utils.py
+++ b/utils/generic_utils.py
@@ -37,7 +37,7 @@ def get_commit_hash():
 
 def create_experiment_folder(root_path, model_name, debug):
     """ Create a folder with the current date and time """
-    date_str = datetime.datetime.now().strftime("%B-%d-%Y_%I:%M%p")
+    date_str = datetime.datetime.now().strftime("%B-%d-%Y_%I+%M%p")
     if debug:
         commit_hash = 'debug'
     else: