diff --git a/TTS/tts/configs/config.json b/TTS/tts/configs/config.json
index 2a61ba03..fb6ae136 100644
--- a/TTS/tts/configs/config.json
+++ b/TTS/tts/configs/config.json
@@ -65,9 +65,14 @@
     "eval_batch_size":16,
     "r": 7,                 // Number of decoder frames to predict per iteration. Set the initial values if gradual training is enabled.
     "gradual_training": [[0, 7, 64], [1, 5, 64], [50000, 3, 32], [130000, 2, 32], [290000, 1, 32]], //set gradual training steps [first_step, r, batch_size]. If it is null, gradual training is disabled. For Tacotron, you might need to reduce the 'batch_size' as you proceed.
-    "loss_masking": true,   // enable / disable loss masking against the sequence padding.
-    "ga_alpha": 10.0,       // weight for guided attention loss. If > 0, guided attention is enabled.
-    "apex_amp_level": null, // level of optimization with NVIDIA's apex feature for automatic mixed FP16/FP32 precision (AMP), NOTE: currently only O1 is supported, and use "O1" to activate.
+    "apex_amp_level": null, // level of optimization with NVIDIA's apex feature for automatic mixed FP16/FP32 precision (AMP), NOTE: currently only O1 is supported, and use "O1" to activate.
+
+    // LOSS SETTINGS
+    "loss_masking": true,       // enable / disable loss masking against the sequence padding.
+    "decoder_loss_alpha": 0.5,  // decoder loss weight. If > 0, it is enabled.
+    "postnet_loss_alpha": 0.25, // postnet loss weight. If > 0, it is enabled.
+    "ga_alpha": 5.0,            // weight for guided attention loss. If > 0, guided attention is enabled.
+    "diff_spec_alpha": 0.25,    // differential spectral loss weight. If > 0, it is enabled.
 
     // VALIDATION
     "run_eval": true,
diff --git a/TTS/tts/layers/losses.py b/TTS/tts/layers/losses.py
index 15b0a2ee..d7d45c1b 100644
--- a/TTS/tts/layers/losses.py
+++ b/TTS/tts/layers/losses.py
@@ -131,6 +131,20 @@ class BCELossMasked(nn.Module):
         return loss
 
 
+class DifferentialSpectralLoss(nn.Module):
+    """Differential Spectral Loss
+    https://arxiv.org/ftp/arxiv/papers/1909/1909.10302.pdf"""
+
+    def __init__(self, loss_func):
+        super().__init__()
+        self.loss_func = loss_func
+
+    def forward(self, x, target, length):
+        x_diff = x[:, 1:] - x[:, :-1]
+        target_diff = target[:, 1:] - target[:, :-1]
+        return self.loss_func(x_diff, target_diff, length - 1)
+
+
 class GuidedAttentionLoss(torch.nn.Module):
     def __init__(self, sigma=0.4):
         super(GuidedAttentionLoss, self).__init__()
@@ -172,8 +186,12 @@ class TacotronLoss(torch.nn.Module):
         super(TacotronLoss, self).__init__()
         self.stopnet_pos_weight = stopnet_pos_weight
         self.ga_alpha = c.ga_alpha
+        self.diff_spec_alpha = c.diff_spec_alpha
+        self.decoder_alpha = c.decoder_loss_alpha
+        self.postnet_alpha = c.postnet_loss_alpha
         self.config = c
-        # postnet decoder loss
+
+        # postnet and decoder loss
         if c.loss_masking:
             self.criterion = L1LossMasked(c.seq_len_norm) if c.model in [
                 "Tacotron"
@@ -181,6 +199,9 @@ class TacotronLoss(torch.nn.Module):
         else:
             self.criterion = nn.L1Loss() if c.model in ["Tacotron"
                                                         ] else nn.MSELoss()
+        # differential spectral loss
+        if c.diff_spec_alpha > 0:
+            self.criterion_diff_spec = DifferentialSpectralLoss(loss_func=self.criterion)
         # guided attention loss
         if c.ga_alpha > 0:
             self.criterion_ga = GuidedAttentionLoss(sigma=ga_sigma)
@@ -196,21 +217,25 @@ class TacotronLoss(torch.nn.Module):
         return_dict = {}
         # decoder and postnet losses
         if self.config.loss_masking:
-            decoder_loss = self.criterion(decoder_output, mel_input,
-                                          output_lens)
-            if self.config.model in ["Tacotron", "TacotronGST"]:
-                postnet_loss = self.criterion(postnet_output, linear_input,
-                                              output_lens)
-            else:
-                postnet_loss = self.criterion(postnet_output, mel_input,
+            if self.decoder_alpha > 0:
+                decoder_loss = self.criterion(decoder_output, mel_input,
                                               output_lens)
+            if self.postnet_alpha > 0:
+                if self.config.model in ["Tacotron", "TacotronGST"]:
+                    postnet_loss = self.criterion(postnet_output, linear_input,
+                                                  output_lens)
+                else:
+                    postnet_loss = self.criterion(postnet_output, mel_input,
+                                                  output_lens)
         else:
-            decoder_loss = self.criterion(decoder_output, mel_input)
-            if self.config.model in ["Tacotron", "TacotronGST"]:
-                postnet_loss = self.criterion(postnet_output, linear_input)
-            else:
-                postnet_loss = self.criterion(postnet_output, mel_input)
-        loss = decoder_loss + postnet_loss
+            if self.decoder_alpha > 0:
+                decoder_loss = self.criterion(decoder_output, mel_input)
+            if self.postnet_alpha > 0:
+                if self.config.model in ["Tacotron", "TacotronGST"]:
+                    postnet_loss = self.criterion(postnet_output, linear_input)
+                else:
+                    postnet_loss = self.criterion(postnet_output, mel_input)
+        loss = self.decoder_alpha * decoder_loss + self.postnet_alpha * postnet_loss
         return_dict['decoder_loss'] = decoder_loss
         return_dict['postnet_loss'] = postnet_loss
@@ -254,6 +279,11 @@ class TacotronLoss(torch.nn.Module):
             loss += ga_loss * self.ga_alpha
             return_dict['ga_loss'] = ga_loss * self.ga_alpha
 
+        # differential spectral loss
+        if self.config.diff_spec_alpha > 0:
+            diff_spec_loss = self.criterion_diff_spec(postnet_output, mel_input, output_lens)
+            loss += diff_spec_loss * self.diff_spec_alpha
+            return_dict['diff_spec_loss'] = diff_spec_loss
 
         return_dict['loss'] = loss
         return return_dict
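For reference, below is a minimal, self-contained sketch of how the new DifferentialSpectralLoss composes with a masked criterion. The `masked_l1` helper and the dummy tensor shapes are assumptions for illustration, not the repo's actual `L1LossMasked` implementation; shapes follow the usual (batch, time, channels) layout of the Tacotron models.

import torch
import torch.nn as nn


def sequence_mask(lengths, max_len):
    """Boolean mask (batch, max_len): True for valid frames, False for padding."""
    return torch.arange(max_len, device=lengths.device)[None, :] < lengths[:, None]


def masked_l1(x, target, lengths):
    """Simplified stand-in for the repo's L1LossMasked: L1 averaged over valid frames only."""
    mask = sequence_mask(lengths, x.shape[1]).unsqueeze(-1).float()
    return (torch.abs(x - target) * mask).sum() / (mask.sum() * x.shape[-1])


class DifferentialSpectralLoss(nn.Module):
    """Match frame-to-frame spectrogram dynamics (arXiv:1909.10302), as in the diff above."""

    def __init__(self, loss_func):
        super().__init__()
        self.loss_func = loss_func

    def forward(self, x, target, length):
        # first-order finite difference along the time axis
        x_diff = x[:, 1:] - x[:, :-1]
        target_diff = target[:, 1:] - target[:, :-1]
        # a sequence of N frames yields N - 1 differences
        return self.loss_func(x_diff, target_diff, length - 1)


if __name__ == "__main__":
    batch, max_len, n_mel = 4, 100, 80
    postnet_output = torch.randn(batch, max_len, n_mel)  # model prediction
    mel_input = torch.randn(batch, max_len, n_mel)       # ground-truth mel
    output_lens = torch.tensor([100, 90, 75, 60])        # unpadded lengths

    criterion_diff_spec = DifferentialSpectralLoss(loss_func=masked_l1)
    diff_spec_loss = criterion_diff_spec(postnet_output, mel_input, output_lens)

    # weighted into the total loss the same way TacotronLoss.forward does
    diff_spec_alpha = 0.25
    print(diff_spec_loss.item(), (diff_spec_alpha * diff_spec_loss).item())

Because the loss is taken on first-order time differences, it pushes the predicted spectrogram's frame-to-frame dynamics toward the target's, complementing the plain L1/MSE terms that match frames independently. Note that `forward` always passes `length - 1`, so the wrapped criterion must accept an `(x, target, length)` signature, which matches the `loss_masking: true` path; the unmasked `nn.L1Loss`/`nn.MSELoss` take only two arguments.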