diff --git a/layers/common_layers.py b/layers/common_layers.py
index f27ecf56..023c7404 100644
--- a/layers/common_layers.py
+++ b/layers/common_layers.py
@@ -132,8 +132,8 @@ class GravesAttention(nn.Module):
         self.init_layers()
 
     def init_layers(self):
-        torch.nn.init.constant_(self.N_a[2].bias[(2*self.K):(3*self.K)], 1.)
-        torch.nn.init.constant_(self.N_a[2].bias[self.K:(2*self.K)], 10)
+        torch.nn.init.constant_(self.N_a[2].bias[(2*self.K):(3*self.K)], 1.)  # third K-wide bias slice := 1.0 ("mean" bias; presumably feeds k_t -> mu_t, TODO confirm slice order)
+        torch.nn.init.constant_(self.N_a[2].bias[self.K:(2*self.K)], 10)  # second K-wide bias slice := 10 ("std" bias; presumably feeds b_t -> sig_t, TODO confirm slice order)
 
     def init_states(self, inputs):
         if self.J is None or inputs.shape[1]+1 > self.J.shape[-1]:
@@ -167,7 +167,7 @@ class GravesAttention(nn.Module):
         sig_t = torch.nn.functional.softplus(b_t) + self.eps
 
         mu_t = self.mu_prev + torch.nn.functional.softplus(k_t)
-        g_t = torch.softmax(g_t, dim=-1) / sig_t + self.eps
+        g_t = torch.softmax(g_t, dim=-1) + self.eps
 
         j = self.J[:inputs.size(1)+1]