Apply dropout to the Graves attention heads to decorrelate them and prevent a single head from dominating

2020-03-10 13:53:04 +01:00 · 2020-03-10 13:53:04 +01:00 · 201f04d3b3
parent 975842f71a
commit 201f04d3b3
1 changed file with 3 additions and 0 deletions
--- a/layers/common_layers.py
+++ b/layers/common_layers.py
@@ -164,6 +164,9 @@ class GravesAttention(nn.Module):
        b_t = gbk_t[:, 1, :]
        k_t = gbk_t[:, 2, :]

+        # dropout to decorrelate attention heads
+        g_t = torch.nn.functional.dropout(g_t, p=0.5, training=self.training)
+
        # attention GMM parameters
        sig_t = torch.nn.functional.softplus(b_t) + self.eps