diff --git a/config.json b/config.json
index 75e50849..c958ebf3 100644
--- a/config.json
+++ b/config.json
@@ -34,7 +34,6 @@
 
     "reinit_layers": [],    // give a list of layer names to restore from the given checkpoint. If not defined, it reloads all heuristically matching layers.
 
-<<<<<<< HEAD
     // TRAINING
     "batch_size": 32,       // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'.
     "eval_batch_size":16,
@@ -48,9 +47,6 @@
     "test_sentences_file": null,  // set a file to load sentences to be used for testing. If it is null then we use default english sentences.
 
     // OPTIMIZER
-=======
-    "model": "Tacotron2",   // one of the model in models/
->>>>>>> config update and bug fixes
     "grad_clip": 1,         // upper limit for gradients for clipping.
     "epochs": 1000,         // total number of epochs to train.
     "lr": 0.0001,           // Initial learning rate. If Noam decay is active, maximum learning rate.
@@ -63,12 +59,8 @@
     "prenet_type": "original",  // "original" or "bn".
     "prenet_dropout": true,     // enable/disable dropout at prenet.
 
-<<<<<<< HEAD
     // ATTENTION
     "attention_type": "original",  // 'original' or 'graves'
-=======
-    "attention_type": "graves",  // 'original' or 'graves'
->>>>>>> config update and bug fixes
     "attention_heads": 5,       // number of attention heads (only for 'graves')
     "attention_norm": "sigmoid",   // softmax or sigmoid. Suggested to use softmax for Tacotron2 and sigmoid for Tacotron.
     "windowing": false,         // Enables attention windowing. Used only in eval mode.
diff --git a/layers/common_layers.py b/layers/common_layers.py
index 716bcfd0..2155de16 100644
--- a/layers/common_layers.py
+++ b/layers/common_layers.py
@@ -119,11 +119,16 @@ class GravesAttention(nn.Module):
         self.epsilon = 1e-5
         self.J = None
         self.N_a = nn.Sequential(
-            nn.Linear(query_dim, query_dim),
+            nn.Linear(query_dim, query_dim, bias=True),
             nn.Tanh(),
-            nn.Linear(query_dim, 3*K))
+            nn.Linear(query_dim, 3*K, bias=True))
         self.attention_weights = None
         self.mu_prev = None
+        self.init_layers()
+
+    def init_layers(self):
+        torch.nn.init.constant_(self.N_a[2].bias[10:15], 0.5)
+        torch.nn.init.constant_(self.N_a[2].bias[5:10], 10)
 
     def init_states(self, inputs):
         if self.J is None or inputs.shape[1] > self.J.shape[-1]:
diff --git a/train.py b/train.py
index 7590ad19..976577fd 100644
--- a/train.py
+++ b/train.py
@@ -198,7 +198,7 @@ def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler,
 
         loss.backward()
         optimizer, current_lr = adam_weight_decay(optimizer)
-        grad_norm, _ = check_update(model, c.grad_clip)
+        grad_norm, _ = check_update(model.decoder, c.grad_clip)
         optimizer.step()
 
         # compute alignment score