diff --git a/layers/common_layers.py b/layers/common_layers.py
index 2155de16..bbc3554f 100644
--- a/layers/common_layers.py
+++ b/layers/common_layers.py
@@ -120,7 +120,7 @@ class GravesAttention(nn.Module):
         self.J = None
         self.N_a = nn.Sequential(
             nn.Linear(query_dim, query_dim, bias=True),
-            nn.Tanh(),
+            nn.ReLU(),
             nn.Linear(query_dim, 3*K, bias=True))
         self.attention_weights = None
         self.mu_prev = None
@@ -163,7 +163,7 @@ class GravesAttention(nn.Module):
         sig_t = torch.pow(torch.nn.functional.softplus(b_t), 2)
         mu_t = self.mu_prev + torch.nn.functional.softplus(k_t)
         # TODO try sigmoid here
-        g_t = (torch.softmax(g_t, dim=-1) / sig_t) * self.COEF
+        g_t = (torch.softmax(g_t, dim=-1) / sig_t)

         # each B x K x T_in
         g_t = g_t.unsqueeze(2).expand(g_t.size(0),
@@ -175,7 +175,7 @@ class GravesAttention(nn.Module):

         # attention weights
         phi_t = g_t * torch.exp(-0.5 * sig_t * (mu_t_ - j)**2)
-        alpha_t = torch.sum(phi_t, 1)
+        alpha_t = self.COEF * torch.sum(phi_t, 1)

         # apply masking
         if mask is not None:
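
For reference, a minimal standalone sketch (not part of the patch) checking that moving the self.COEF factor from g_t to the summed alpha_t is numerically equivalent: COEF is a scalar and the sum over the K mixture components is linear, so only floating-point rounding differs. The COEF value and the arange stand-in for self.J below are assumptions not shown in this diff; tensor names and shapes follow the patched forward().

import torch

# g_t / b_t / k_t are B x K mixture parameters, j indexes the T_in encoder
# steps, alpha_t is the B x T_in attention distribution.
B, K, T_in = 2, 5, 13
COEF = 0.3989422917366028  # assumed: 1 / sqrt(2 * pi); not defined in this diff

torch.manual_seed(0)
g_t = torch.randn(B, K)
b_t = torch.randn(B, K)
k_t = torch.randn(B, K)
mu_prev = torch.zeros(B, K)

sig_t = torch.pow(torch.nn.functional.softplus(b_t), 2)
mu_t = mu_prev + torch.nn.functional.softplus(k_t)
j = torch.arange(T_in, dtype=torch.float32)  # hypothetical stand-in for self.J

def mixture(g, sig, mu):
    # expand B x K -> B x K x T_in and evaluate the Gaussian terms, as in forward()
    g_ = g.unsqueeze(2).expand(B, K, T_in)
    sig_ = sig.unsqueeze(2).expand_as(g_)
    mu_ = mu.unsqueeze(2).expand_as(g_)
    return g_ * torch.exp(-0.5 * sig_ * (mu_ - j) ** 2)

# before: COEF folded into g_t
alpha_before = torch.sum(mixture(torch.softmax(g_t, dim=-1) / sig_t * COEF, sig_t, mu_t), 1)
# after: COEF applied once to the summed mixture
alpha_after = COEF * torch.sum(mixture(torch.softmax(g_t, dim=-1) / sig_t, sig_t, mu_t), 1)

assert torch.allclose(alpha_before, alpha_after, atol=1e-6)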