mirror of https://github.com/coqui-ai/TTS.git

differential spectral loss and loss weight settings

parent e1eab1ce4b
commit bb9b70ee27
@@ -65,9 +65,14 @@
     "eval_batch_size":16,
     "r": 7,                 // Number of decoder frames to predict per iteration. Set the initial values if gradual training is enabled.
     "gradual_training": [[0, 7, 64], [1, 5, 64], [50000, 3, 32], [130000, 2, 32], [290000, 1, 32]], // set gradual training steps [first_step, r, batch_size]. If it is null, gradual training is disabled. For Tacotron, you might need to reduce the 'batch_size' as you proceed.
-    "loss_masking": true,       // enable / disable loss masking against the sequence padding.
-    "ga_alpha": 10.0,           // weight for guided attention loss. If > 0, guided attention is enabled.
     "apex_amp_level": null, // level of optimization with NVIDIA's apex feature for automatic mixed FP16/FP32 precision (AMP), NOTE: currently only O1 is supported, and use "O1" to activate.
 
+    // LOSS SETTINGS
+    "loss_masking": true,       // enable / disable loss masking against the sequence padding.
+    "decoder_loss_alpha": 0.5,  // decoder loss weight. If > 0, it is enabled
+    "postnet_loss_alpha": 0.25, // postnet loss weight. If > 0, it is enabled
+    "ga_alpha": 5.0,            // weight for guided attention loss. If > 0, guided attention is enabled.
+    "diff_spec_alpha": 0.25,    // differential spectral loss weight. If > 0, it is enabled
+
     // VALIDATION
     "run_eval": true,
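For orientation, the new alpha settings scale each loss term before they are summed into the total training loss. A quick arithmetic sketch with made-up raw loss values (illustrative only, not part of the commit):

# Weights matching the new config defaults above.
decoder_loss_alpha = 0.5
postnet_loss_alpha = 0.25
ga_alpha = 5.0
diff_spec_alpha = 0.25

# Hypothetical raw per-term loss values.
decoder_loss, postnet_loss = 0.8, 0.6
ga_loss, diff_spec_loss = 0.01, 0.3

# Weighted sum, mirroring how TacotronLoss combines the terms below.
loss = (decoder_loss_alpha * decoder_loss
        + postnet_loss_alpha * postnet_loss
        + ga_alpha * ga_loss
        + diff_spec_alpha * diff_spec_loss)
print(loss)  # 0.4 + 0.15 + 0.05 + 0.075 = 0.675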
@@ -131,6 +131,20 @@ class BCELossMasked(nn.Module):
         return loss
 
 
+class DifferentialSpectralLoss(nn.Module):
+    """Differential Spectral Loss
+    https://arxiv.org/ftp/arxiv/papers/1909/1909.10302.pdf"""
+
+    def __init__(self, loss_func):
+        super().__init__()
+        self.loss_func = loss_func
+
+    def forward(self, x, target, length):
+        x_diff = x[:, 1:] - x[:, :-1]
+        target_diff = target[:, 1:] - target[:, :-1]
+        return self.loss_func(x_diff, target_diff, length - 1)
+
+
 class GuidedAttentionLoss(torch.nn.Module):
     def __init__(self, sigma=0.4):
         super(GuidedAttentionLoss, self).__init__()
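A minimal usage sketch for the new class, assuming it is defined in scope as above. The wrapper that ignores the lengths argument is hypothetical, since the plain nn.L1Loss/nn.MSELoss branch of TacotronLoss does not accept a third argument:

import torch
import torch.nn.functional as F

# Hypothetical length-ignoring stand-in for the masked criterion.
def l1_no_mask(x, target, length):
    return F.l1_loss(x, target)

criterion = DifferentialSpectralLoss(loss_func=l1_no_mask)

pred = torch.randn(4, 100, 80)           # (batch, frames, mel channels)
target = torch.randn(4, 100, 80)
lengths = torch.full((4,), 100)          # per-sample frame counts

loss = criterion(pred, target, lengths)  # differences are one frame shorter
print(loss.item())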
@@ -172,8 +186,12 @@ class TacotronLoss(torch.nn.Module):
         super(TacotronLoss, self).__init__()
         self.stopnet_pos_weight = stopnet_pos_weight
         self.ga_alpha = c.ga_alpha
+        self.diff_spec_alpha = c.diff_spec_alpha
+        self.decoder_alpha = c.decoder_loss_alpha
+        self.postnet_alpha = c.postnet_loss_alpha
         self.config = c
-        # postnet decoder loss
+
+        # postnet and decoder loss
         if c.loss_masking:
             self.criterion = L1LossMasked(c.seq_len_norm) if c.model in [
                 "Tacotron"
@@ -181,6 +199,9 @@ class TacotronLoss(torch.nn.Module):
         else:
             self.criterion = nn.L1Loss() if c.model in ["Tacotron"
                                                         ] else nn.MSELoss()
+        # differential spectral loss
+        if c.diff_spec_alpha > 0:
+            self.criterion_diff_spec = DifferentialSpectralLoss(loss_func=self.criterion)
         # guided attention loss
         if c.ga_alpha > 0:
             self.criterion_ga = GuidedAttentionLoss(sigma=ga_sigma)
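The constructor now reads four weight fields from the config. A hypothetical minimal stand-in showing just the fields this diff touches (real training uses the JSON config above, which carries many more keys):

from types import SimpleNamespace

# Illustrative config stub; field values mirror the new JSON defaults.
c = SimpleNamespace(
    model="Tacotron2",
    loss_masking=False,        # skip the masked criteria for this sketch
    seq_len_norm=False,
    ga_alpha=5.0,
    diff_spec_alpha=0.25,
    decoder_loss_alpha=0.5,
    postnet_loss_alpha=0.25,
)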
@@ -196,21 +217,25 @@ class TacotronLoss(torch.nn.Module):
         return_dict = {}
         # decoder and postnet losses
         if self.config.loss_masking:
-            decoder_loss = self.criterion(decoder_output, mel_input,
-                                          output_lens)
-            if self.config.model in ["Tacotron", "TacotronGST"]:
-                postnet_loss = self.criterion(postnet_output, linear_input,
-                                              output_lens)
-            else:
-                postnet_loss = self.criterion(postnet_output, mel_input,
-                                              output_lens)
+            if self.decoder_alpha > 0:
+                decoder_loss = self.criterion(decoder_output, mel_input,
+                                              output_lens)
+            if self.postnet_alpha > 0:
+                if self.config.model in ["Tacotron", "TacotronGST"]:
+                    postnet_loss = self.criterion(postnet_output, linear_input,
+                                                  output_lens)
+                else:
+                    postnet_loss = self.criterion(postnet_output, mel_input,
+                                                  output_lens)
         else:
-            decoder_loss = self.criterion(decoder_output, mel_input)
-            if self.config.model in ["Tacotron", "TacotronGST"]:
-                postnet_loss = self.criterion(postnet_output, linear_input)
-            else:
-                postnet_loss = self.criterion(postnet_output, mel_input)
-        loss = decoder_loss + postnet_loss
+            if self.decoder_alpha > 0:
+                decoder_loss = self.criterion(decoder_output, mel_input)
+            if self.postnet_alpha > 0:
+                if self.config.model in ["Tacotron", "TacotronGST"]:
+                    postnet_loss = self.criterion(postnet_output, linear_input)
+                else:
+                    postnet_loss = self.criterion(postnet_output, mel_input)
+        loss = self.decoder_alpha * decoder_loss + self.postnet_alpha * postnet_loss
         return_dict['decoder_loss'] = decoder_loss
         return_dict['postnet_loss'] = postnet_loss
 
@@ -254,6 +279,11 @@ class TacotronLoss(torch.nn.Module):
             loss += ga_loss * self.ga_alpha
             return_dict['ga_loss'] = ga_loss * self.ga_alpha
 
+        # differential spectral loss
+        if self.config.diff_spec_alpha > 0:
+            diff_spec_loss = self.criterion_diff_spec(postnet_output, mel_input, output_lens)
+            loss += diff_spec_loss * self.diff_spec_alpha
+            return_dict['diff_spec_loss'] = diff_spec_loss
         return_dict['loss'] = loss
         return return_dict
 
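To see what the differential term adds: frame-to-frame differencing cancels any constant bias, so the loss rewards matching spectral dynamics rather than absolute levels. A self-contained check (not part of the commit):

import torch
import torch.nn.functional as F

target = torch.randn(2, 50, 80)   # (batch, frames, mel)
pred = target + 1.0               # target shifted by a constant offset

t_diff = target[:, 1:] - target[:, :-1]
p_diff = pred[:, 1:] - pred[:, :-1]

print(F.l1_loss(pred, target).item())    # 1.0: plain L1 sees the offset
print(F.l1_loss(p_diff, t_diff).item())  # 0.0: differential term cancels it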