From f0bcc390d24730e31656ded6f6df88ae765bc8ac Mon Sep 17 00:00:00 2001
From: Edresson <edresson1@gmail.com>
Date: Fri, 31 Jul 2020 00:05:08 -0300
Subject: [PATCH] Implement Angular Prototypical loss

---
 mozilla_voice_tts/speaker_encoder/loss.py | 42 +++++++++++++++++++++++
 1 file changed, 42 insertions(+)

diff --git a/mozilla_voice_tts/speaker_encoder/loss.py b/mozilla_voice_tts/speaker_encoder/loss.py
index ab290547..6f83be63 100644
--- a/mozilla_voice_tts/speaker_encoder/loss.py
+++ b/mozilla_voice_tts/speaker_encoder/loss.py
@@ -23,6 +23,8 @@ class GE2ELoss(nn.Module):
         self.b = nn.Parameter(torch.tensor(init_b))
         self.loss_method = loss_method
 
+        print('Initialised Generalized End-to-End loss')
+
         assert self.loss_method in ["softmax", "contrast"]
 
         if self.loss_method == "softmax":
@@ -119,3 +121,43 @@ class GE2ELoss(nn.Module):
         cos_sim_matrix = self.w * cos_sim_matrix + self.b
         L = self.embed_loss(dvecs, cos_sim_matrix)
         return L.mean()
+
+# adapted from https://github.com/clovaai/voxceleb_trainer/blob/master/loss/angleproto.py
+class AngleProtoLoss(nn.Module):
+    """ 
+    Implementation of the Angular Prototypical loss defined in https://arxiv.org/abs/2003.11982
+        Accepts an input of size (N, M, D)
+            where N is the number of speakers in the batch,
+            M is the number of utterances per speaker,
+            and D is the dimensionality of the embedding vector
+        Args:
+            - init_w (float): defines the initial value of w 
+            - init_b (float): definies the initial value of b
+    """
+    def __init__(self, init_w=10.0, init_b=-5.0):
+        super(AngleProtoLoss, self).__init__()
+        # pylint: disable=E1102
+        self.w = nn.Parameter(torch.tensor(init_w))
+        # pylint: disable=E1102
+        self.b = nn.Parameter(torch.tensor(init_b))
+        self.criterion = torch.nn.CrossEntropyLoss()
+        self.use_cuda = torch.cuda.is_available()
+
+        print('Initialised Angular Prototypical loss')
+
+    def forward(self, x):
+        """
+        Calculates the AngleProto loss for an input of dimensions (num_speakers, num_utts_per_speaker, dvec_feats)
+        """
+        out_anchor = torch.mean(x[:,1:,:],1)
+        out_positive = x[:,0,:]
+        num_speakers = out_anchor.size()[0]
+
+        cos_sim_matrix  = F.cosine_similarity(out_positive.unsqueeze(-1).expand(-1,-1,num_speakers),out_anchor.unsqueeze(-1).expand(-1,-1,num_speakers).transpose(0,2))
+        torch.clamp(self.w, 1e-6)
+        cos_sim_matrix = cos_sim_matrix * self.w + self.b
+        label = torch.from_numpy(np.asarray(range(0,num_speakers)))
+        if self.use_cuda:
+            label = label.cuda()
+        L = self.criterion(cos_sim_matrix, label)
+        return L
\ No newline at end of file