diff --git a/config.json b/config.json
index 75e50849..c958ebf3 100644
--- a/config.json
+++ b/config.json
@@ -34,7 +34,6 @@
 
     "reinit_layers": [],    // give a list of layer names to restore from the given checkpoint. If not defined, it reloads all heuristically matching layers.
 
-<<<<<<< HEAD
     // TRAINING
     "batch_size": 32,       // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'.
     "eval_batch_size":16,
@@ -48,9 +47,6 @@
     "test_sentences_file": null,  // set a file to load sentences to be used for testing. If it is null then we use default english sentences.
 
     // OPTIMIZER
-=======
-    "model": "Tacotron2",   // one of the model in models/
->>>>>>> config update and bug fixes
     "grad_clip": 1,         // upper limit for gradients for clipping.
     "epochs": 1000,         // total number of epochs to train.
     "lr": 0.0001,           // Initial learning rate. If Noam decay is active, maximum learning rate.
@@ -63,12 +59,8 @@
     "prenet_type": "original",  // "original" or "bn".
     "prenet_dropout": true,     // enable/disable dropout at prenet.
 
-<<<<<<< HEAD
     // ATTENTION
     "attention_type": "original",  // 'original' or 'graves'
-=======
-    "attention_type": "graves",  // 'original' or 'graves'
->>>>>>> config update and bug fixes
     "attention_heads": 5,       // number of attention heads (only for 'graves')
     "attention_norm": "sigmoid",   // softmax or sigmoid. Suggested to use softmax for Tacotron2 and sigmoid for Tacotron.
     "windowing": false,         // Enables attention windowing. Used only in eval mode.
diff --git a/layers/common_layers.py b/layers/common_layers.py
index 716bcfd0..2155de16 100644
--- a/layers/common_layers.py
+++ b/layers/common_layers.py
@@ -119,11 +119,16 @@ class GravesAttention(nn.Module):
         self.epsilon = 1e-5
         self.J = None
         self.N_a = nn.Sequential(
-            nn.Linear(query_dim, query_dim),
+            nn.Linear(query_dim, query_dim, bias=True),
             nn.Tanh(),
-            nn.Linear(query_dim, 3*K))
+            nn.Linear(query_dim, 3*K, bias=True))
         self.attention_weights = None
         self.mu_prev = None
+        self.init_layers()
+
+    def init_layers(self):
+        torch.nn.init.constant_(self.N_a[2].bias[10:15], 0.5)
+        torch.nn.init.constant_(self.N_a[2].bias[5:10], 10)
 
     def init_states(self, inputs):
         if self.J is None or inputs.shape[1] > self.J.shape[-1]:
diff --git a/train.py b/train.py
index 7590ad19..976577fd 100644
--- a/train.py
+++ b/train.py
@@ -198,7 +198,7 @@ def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler,
 
         loss.backward()
         optimizer, current_lr = adam_weight_decay(optimizer)
-        grad_norm, _ = check_update(model, c.grad_clip)
+        grad_norm, _ = check_update(model.decoder, c.grad_clip)
         optimizer.step()
 
         # compute alignment score