differential spectral loss and loss weight settings

2020-09-22 15:15:08 +02:00 · 2020-09-22 15:15:08 +02:00 · bb9b70ee27
parent e1eab1ce4b
commit bb9b70ee27
2 changed files with 52 additions and 17 deletions
--- a/TTS/tts/configs/config.json
+++ b/TTS/tts/configs/config.json
@ -65,9 +65,14 @@
    "eval_batch_size":16,
    "r": 7,                 // Number of decoder frames to predict per iteration. Set the initial values if gradual training is enabled.
    "gradual_training": [[0, 7, 64], [1, 5, 64], [50000, 3, 32], [130000, 2, 32], [290000, 1, 32]], //set gradual training steps [first_step, r, batch_size]. If it is null, gradual training is disabled. For Tacotron, you might need to reduce the 'batch_size' as you proceeed.
-    "loss_masking": true,         // enable / disable loss masking against the sequence padding.
-    "ga_alpha": 10.0,        // weight for guided attention loss. If > 0, guided attention is enabled.
-    "apex_amp_level": null, // level of optimization with NVIDIA's apex feature for automatic mixed FP16/FP32 precision (AMP), NOTE: currently only O1 is supported, and use "O1" to activate.
+    "apex_amp_level": null,     // level of optimization with NVIDIA's apex feature for automatic mixed FP16/FP32 precision (AMP). NOTE: only "O1" is currently supported; set this to "O1" to activate it.
+
+    // LOSS SETTINGS
+    "loss_masking": true,       // enable / disable loss masking against the sequence padding.
+    "decoder_loss_alpha": 0.5,  // decoder loss weight. If > 0, it is enabled
+    "postnet_loss_alpha": 0.25, // postnet loss weight. If > 0, it is enabled
+    "ga_alpha": 5.0,           // weight for guided attention loss. If > 0, guided attention is enabled.
+    "diff_spec_alpha": 0.25,     // differential spectral loss weight. If > 0, it is enabled

    // VALIDATION
    "run_eval": true,
--- a/TTS/tts/layers/losses.py
+++ b/TTS/tts/layers/losses.py
@ -131,6 +131,20 @@ class BCELossMasked(nn.Module):
        return loss


+# NOTE(review): the class name is misspelled ("Differentail"); it is kept
+# because TacotronLoss instantiates the class under this exact name.
+class DifferentailSpectralLoss(nn.Module):
+    """Differential Spectral Loss.
+
+    Applies a wrapped loss function to the first-order differences, taken
+    along dimension 1, of the prediction and of the target instead of to the
+    raw values.
+    https://arxiv.org/ftp/arxiv/papers/1909/1909.10302.pdf
+
+    Args:
+        loss_func: callable used to compare the differenced tensors. It is
+            called as ``loss_func(x_diff, target_diff)`` when no lengths are
+            given and as ``loss_func(x_diff, target_diff, length - 1)``
+            otherwise.
+    """
+
+    def __init__(self, loss_func):
+        super().__init__()
+        self.loss_func = loss_func
+
+    def forward(self, x, target, length=None):
+        """Return ``loss_func`` evaluated on the differenced inputs.
+
+        Args:
+            x: predicted tensor, differenced along dimension 1
+                (assumed to be the time axis -- TODO confirm with callers).
+            target: reference tensor, differenced the same way as ``x``.
+            length: optional per-sample lengths for a masking loss. Leave it
+                as ``None`` for losses such as ``nn.L1Loss`` / ``nn.MSELoss``
+                whose forward accepts only ``(input, target)``.
+
+        Returns:
+            Whatever ``loss_func`` returns for the differenced tensors.
+        """
+        x_diff = x[:, 1:] - x[:, :-1]
+        target_diff = target[:, 1:] - target[:, :-1]
+        if length is None:
+            # Unmasked criteria take no length argument; passing one would
+            # raise a TypeError.
+            return self.loss_func(x_diff, target_diff)
+        # Differencing removes one step along dimension 1, so every valid
+        # length shrinks by one as well.
+        return self.loss_func(x_diff, target_diff, length - 1)
+
+
 class GuidedAttentionLoss(torch.nn.Module):
    def __init__(self, sigma=0.4):
        super(GuidedAttentionLoss, self).__init__()
@ -172,8 +186,12 @@ class TacotronLoss(torch.nn.Module):
        super(TacotronLoss, self).__init__()
        self.stopnet_pos_weight = stopnet_pos_weight
        self.ga_alpha = c.ga_alpha
+        self.diff_spec_alpha = c.diff_spec_alpha
+        self.decoder_alpha = c.decoder_loss_alpha
+        self.postnet_alpha = c.postnet_loss_alpha
        self.config = c
-        # postnet decoder loss
+
+        # postnet and decoder loss
        if c.loss_masking:
            self.criterion = L1LossMasked(c.seq_len_norm) if c.model in [
                "Tacotron"
@ -181,6 +199,9 @@ class TacotronLoss(torch.nn.Module):
        else:
            self.criterion = nn.L1Loss() if c.model in ["Tacotron"
                                                        ] else nn.MSELoss()
+        # differential spectral loss
+        if c.diff_spec_loss_alpha > 0:
+            self.criterion_diff_spec = DifferentailSpectralLoss(loss_func=self.criterion)
        # guided attention loss
        if c.ga_alpha > 0:
            self.criterion_ga = GuidedAttentionLoss(sigma=ga_sigma)
@ -196,21 +217,25 @@ class TacotronLoss(torch.nn.Module):
        return_dict = {}
        # decoder and postnet losses
        if self.config.loss_masking:
-            decoder_loss = self.criterion(decoder_output, mel_input,
-                                          output_lens)
-            if self.config.model in ["Tacotron", "TacotronGST"]:
-                postnet_loss = self.criterion(postnet_output, linear_input,
-                                              output_lens)
-            else:
-                postnet_loss = self.criterion(postnet_output, mel_input,
+            if self.decoder_alpha > 0:
+                decoder_loss = self.criterion(decoder_output, mel_input,
                                              output_lens)
+            if postnet_alpha > 0:
+                if self.config.model in ["Tacotron", "TacotronGST"]:
+                    postnet_loss = self.criterion(postnet_output, linear_input,
+                                                output_lens)
+                else:
+                    postnet_loss = self.criterion(postnet_output, mel_input,
+                                                output_lens)
        else:
-            decoder_loss = self.criterion(decoder_output, mel_input)
-            if self.config.model in ["Tacotron", "TacotronGST"]:
-                postnet_loss = self.criterion(postnet_output, linear_input)
-            else:
-                postnet_loss = self.criterion(postnet_output, mel_input)
-        loss = decoder_loss + postnet_loss
+            if self.decoder_alpha > 0:
+                decoder_loss = self.criterion(decoder_output, mel_input)
+            if self.postnet_alpha > 0:
+                if self.config.model in ["Tacotron", "TacotronGST"]:
+                    postnet_loss = self.criterion(postnet_output, linear_input)
+                else:
+                    postnet_loss = self.criterion(postnet_output, mel_input)
+        loss = self.decoder_alpha * decoder_loss + self.postnet_alpha * postnet_loss
        return_dict['decoder_loss'] = decoder_loss
        return_dict['postnet_loss'] = postnet_loss

@ -254,6 +279,11 @@ class TacotronLoss(torch.nn.Module):
            loss += ga_loss * self.ga_alpha
            return_dict['ga_loss'] = ga_loss * self.ga_alpha

+        # differential spectral loss
+        if self.config.diff_spec_loss_alpha > 0:
+            diff_spec_loss = self.criterion_diff_spec(postnet_output, mel_input, output_lens)
+            loss += diff_spec_loss * self.diff_spec_alpha
+            return_dict['diff_spec_loss'] = diff_spec_loss
        return_dict['loss'] = loss
        return return_dict