mirror of https://github.com/coqui-ai/TTS.git

Bug fixes

parent 0ef3c0ac3f
commit adbe603af1

.compute
@@ -1,3 +1,3 @@
 #!/bin/bash
 source ../tmp/venv/bin/activate
-python train.py --config_path config.json
+python train.py --config_path config.json --debug true
config.json

@@ -1,11 +1,13 @@
 {
-    "model_name": "best-model",
+    "model_name": "audio-update-l2-loss",
     "num_mels": 80,
     "num_freq": 1025,
     "sample_rate": 20000,
     "frame_length_ms": 50,
     "frame_shift_ms": 12.5,
     "preemphasis": 0.97,
+    "min_mel_freq": 125,
+    "max_mel_freq": 7600,
     "min_level_db": -100,
     "ref_level_db": 20,
     "embedding_size": 256,
@@ -25,7 +27,7 @@
     "checkpoint": true,
     "save_step": 376,
-    "print_step": 50,
+    "print_step": 10,
     "data_path": "/snakepit/shared/data/keithito/LJSpeech-1.1/",
     "min_seq_len": 0,
     "output_path": "experiments/"
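For orientation, here is a minimal sketch of how a flat JSON config like this can be loaded so train.py can use c.data_path, c.num_mels, and so on. The repo keeps its own loader in utils.generic_utils; the AttrDict helper below is an illustrative assumption, not that loader.

import json

class AttrDict(dict):
    """Expose JSON keys as attributes: c.num_mels instead of c['num_mels']."""
    def __getattr__(self, key):
        try:
            return self[key]
        except KeyError as e:
            raise AttributeError(key) from e

def load_config(config_path):
    # Parse the flat JSON file shown in the diff above.
    with open(config_path, "r") as f:
        return AttrDict(json.load(f))

# Usage (assuming config.json exists in the working directory):
# c = load_config("config.json")
# print(c.model_name, c.sample_rate, c.print_step)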
layers/attention.py

@@ -38,11 +38,11 @@ class LocationSensitiveAttention(nn.Module):
         super(LocationSensitiveAttention, self).__init__()
         self.kernel_size = kernel_size
         self.filters = filters
-        padding = int((kernel - 1) / 2)
-        self.loc_conv = nn.Conv1d(2, filters,
+        padding = int((kernel_size - 1) / 2)
+        self.loc_conv = nn.Conv1d(1, filters,
                                   kernel_size=kernel_size, stride=1,
                                   padding=padding, bias=False)
-        self.loc_linear = nn.Linear(loc_dim, hidden_dim)
+        self.loc_linear = nn.Linear(filters, hidden_dim)
         self.query_layer = nn.Linear(query_dim, hidden_dim, bias=True)
         self.annot_layer = nn.Linear(annot_dim, hidden_dim, bias=True)
         self.v = nn.Linear(hidden_dim, 1, bias=False)
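A note on the padding fix, as a standalone sketch rather than the repo's module: the old code referenced an undefined name (kernel) and sized loc_linear with loc_dim instead of the convolution's output channels. With an odd kernel_size, padding = (kernel_size - 1) / 2 is "same" padding, so the location convolution preserves the time dimension; its output channel count is filters, which is exactly what nn.Linear(filters, hidden_dim) expects. Dimensions below are illustrative.

import torch
import torch.nn as nn

kernel_size, filters, T = 31, 32, 128
padding = int((kernel_size - 1) / 2)       # "same" padding for an odd kernel
loc_conv = nn.Conv1d(1, filters, kernel_size=kernel_size,
                     stride=1, padding=padding, bias=False)

x = torch.rand(4, 1, T)                    # (batch, channels=1, time)
y = loc_conv(x)
print(y.shape)                             # torch.Size([4, 32, 128]): time preserved,
                                           # channels == filters for nn.Linear(filters, ...)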
@@ -79,7 +79,7 @@ class AttentionRNNCell(nn.Module):
             memory_dim (int): memory vector (decoder autoregression) feature dimension.
             align_model (str): 'b' for Bahdanau, 'ls' for Location Sensitive alignment.
         """
-        super(AttentionRNN, self).__init__()
+        super(AttentionRNNCell, self).__init__()
         self.align_model = align_model
         self.rnn_cell = nn.GRUCell(out_dim + memory_dim, out_dim)
         # pick bahdanau or location sensitive attention
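The super() fix looks like rename cleanup: the cell was presumably renamed from AttentionRNN to AttentionRNNCell, and the stale name left inside super() fails at construction time. A minimal reproduction, under that assumption:

import torch.nn as nn

class AttentionRNNCell(nn.Module):
    def __init__(self):
        # super(AttentionRNN, self).__init__()   # NameError: name 'AttentionRNN' is not defined
        super(AttentionRNNCell, self).__init__()  # refer to the class's current name

cell = AttentionRNNCell()  # constructs cleanly after the fix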
@@ -275,7 +275,7 @@ class Decoder(nn.Module):
             # dim=1)
             attention_rnn_hidden, current_context_vec, attention = self.attention_rnn(
                 processed_memory, current_context_vec, attention_rnn_hidden,
-                inputs, attention, input_lens)
+                inputs, attention.unsqueeze(1), input_lens)
             # attention_cum += attention
             # Concat RNN output and attention context vector
             decoder_input = self.project_to_decoder_in(
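The unsqueeze(1) pairs with the Conv1d(1, filters, ...) change above: nn.Conv1d consumes (batch, channels, time), while the alignment tensor coming out of the attention step is (batch, time), so a singleton channel axis has to be added before it reaches the location convolution. Shapes below are illustrative:

import torch

attention = torch.rand(4, 128)          # (batch, time) alignment weights
as_conv_input = attention.unsqueeze(1)  # (batch, 1, time) for Conv1d(1, filters, ...)
print(as_conv_input.shape)              # torch.Size([4, 1, 128])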
train.py

@@ -25,7 +25,7 @@ from utils.generic_utils import (Progbar, remove_experiment_folder,
 from utils.visual import plot_alignment, plot_spectrogram
 from datasets.LJSpeech import LJSpeechDataset
 from models.tacotron import Tacotron
-from layers.losses import L2LossMasked
+from layers.losses import L1LossMasked

 torch.manual_seed(1)
 use_cuda = torch.cuda.is_available()
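The import swap suggests training moved from a masked L2 criterion to a masked L1 criterion from layers.losses. A rough sketch of the idea behind a masked sequence loss follows; the function name, argument layout, and padding convention are assumptions, not the repo's exact L1LossMasked.

import torch
import torch.nn.functional as F

def l1_loss_masked(pred, target, lengths):
    """Mean L1 loss over only the valid (non-padded) frames of each sequence.

    pred, target: (batch, max_time, feat); lengths: (batch,) valid frame counts.
    """
    max_time = pred.size(1)
    # mask[b, t] = 1 while t < lengths[b], else 0; padded frames contribute nothing
    mask = (torch.arange(max_time, device=pred.device)[None, :]
            < lengths[:, None]).float().unsqueeze(-1)
    loss = F.l1_loss(pred * mask, target * mask, reduction="sum")
    return loss / (mask.sum() * pred.size(-1))   # normalize by valid element count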
@@ -338,6 +338,7 @@ def evaluate(model, criterion, criterion_st, data_loader, current_step):

 def main(args):
+
     # Setup the dataset
     train_dataset = LJSpeechDataset(os.path.join(c.data_path, 'metadata_train.csv'),
                                     os.path.join(c.data_path, 'wavs'),
utils/generic_utils.py

@@ -37,7 +37,7 @@ def get_commit_hash():

 def create_experiment_folder(root_path, model_name, debug):
     """ Create a folder with the current date and time """
-    date_str = datetime.datetime.now().strftime("%B-%d-%Y_%I:%M%p")
+    date_str = datetime.datetime.now().strftime("%B-%d-%Y_%I+%M%p")
     if debug:
         commit_hash = 'debug'
     else:
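The strftime change replaces ':' with '+' in the experiment folder name; colons are invalid in Windows paths and awkward for tools such as scp and rsync that treat ':' as remote-host syntax. A quick illustration with a fixed timestamp:

import datetime

now = datetime.datetime(2018, 2, 13, 16, 32)
print(now.strftime("%B-%d-%Y_%I:%M%p"))  # February-13-2018_04:32PM  (':' can break paths)
print(now.strftime("%B-%d-%Y_%I+%M%p"))  # February-13-2018_04+32PM  (path-safe)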