diff --git a/TTS/tts/layers/glow_tts/glow.py b/TTS/tts/layers/glow_tts/glow.py index acfc55e5..b06dd8a5 100644 --- a/TTS/tts/layers/glow_tts/glow.py +++ b/TTS/tts/layers/glow_tts/glow.py @@ -69,7 +69,7 @@ class WN(torch.nn.Module): num_layers, c_in_channels=0, dropout_p=0): - super(WN, self).__init__() + super().__init__() assert kernel_size % 2 == 1 assert hidden_channels % 2 == 0 self.in_channels = in_channels @@ -148,70 +148,6 @@ class WN(torch.nn.Module): for l in self.res_skip_layers: torch.nn.utils.remove_weight_norm(l) - -class ActNorm(nn.Module): - """Activation Normalization bijector as an alternative to Batch Norm. It computes - mean and std from a sample data in advance and it uses these values - for normalization at training. - - Args: - channels (int): input channels. - ddi (False): data depended initialization flag. - - Shapes: - - inputs: (B, C, T) - - outputs: (B, C, T) - """ - - def __init__(self, channels, ddi=False, **kwargs): # pylint: disable=unused-argument - super().__init__() - self.channels = channels - self.initialized = not ddi - - self.logs = nn.Parameter(torch.zeros(1, channels, 1)) - self.bias = nn.Parameter(torch.zeros(1, channels, 1)) - - def forward(self, x, x_mask=None, reverse=False, **kwargs): # pylint: disable=unused-argument - if x_mask is None: - x_mask = torch.ones(x.size(0), 1, x.size(2)).to(device=x.device, - dtype=x.dtype) - x_len = torch.sum(x_mask, [1, 2]) - if not self.initialized: - self.initialize(x, x_mask) - self.initialized = True - - if reverse: - z = (x - self.bias) * torch.exp(-self.logs) * x_mask - logdet = None - else: - z = (self.bias + torch.exp(self.logs) * x) * x_mask - logdet = torch.sum(self.logs) * x_len # [b] - - return z, logdet - - def store_inverse(self): - pass - - def set_ddi(self, ddi): - self.initialized = not ddi - - def initialize(self, x, x_mask): - with torch.no_grad(): - denom = torch.sum(x_mask, [0, 2]) - m = torch.sum(x * x_mask, [0, 2]) / denom - m_sq = torch.sum(x * x * x_mask, [0, 2]) / denom - v = m_sq - (m**2) - logs = 0.5 * torch.log(torch.clamp_min(v, 1e-6)) - - bias_init = (-m * torch.exp(-logs)).view(*self.bias.shape).to( - dtype=self.bias.dtype) - logs_init = (-logs).view(*self.logs.shape).to( - dtype=self.logs.dtype) - - self.bias.data.copy_(bias_init) - self.logs.data.copy_(logs_init) - - class InvConvNear(nn.Module): def __init__(self, channels, num_splits=4, no_jacobian=False, **kwargs): # pylint: disable=unused-argument super().__init__() diff --git a/TTS/tts/layers/glow_tts/normalization.py b/TTS/tts/layers/glow_tts/normalization.py index 0930f48c..5ccdeb47 100644 --- a/TTS/tts/layers/glow_tts/normalization.py +++ b/TTS/tts/layers/glow_tts/normalization.py @@ -36,11 +36,10 @@ class TemporalBatchNorm1d(nn.BatchNorm1d): affine=True, track_running_stats=True, momentum=0.1): - super(TemporalBatchNorm1d, - self).__init__(channels, - affine=affine, - track_running_stats=track_running_stats, - momentum=momentum) + super().__init__(channels, + affine=affine, + track_running_stats=track_running_stats, + momentum=momentum) def forward(self, x): return super().forward(x.transpose(2, 1)).transpose(2, 1) diff --git a/TTS/tts/layers/glow_tts/time_depth_sep_conv.py b/TTS/tts/layers/glow_tts/time_depth_sep_conv.py index 732e7d96..c9a117c8 100644 --- a/TTS/tts/layers/glow_tts/time_depth_sep_conv.py +++ b/TTS/tts/layers/glow_tts/time_depth_sep_conv.py @@ -11,7 +11,7 @@ class TimeDepthSeparableConv(nn.Module): out_channels, kernel_size, bias=True): - super(TimeDepthSeparableConv, self).__init__() + super().__init__() self.in_channels = in_channels self.out_channels = out_channels @@ -69,7 +69,7 @@ class TimeDepthSeparableConvBlock(nn.Module): num_layers, kernel_size, bias=True): - super(TimeDepthSeparableConvBlock, self).__init__() + super().__init__() assert (kernel_size - 1) % 2 == 0 assert num_layers > 1 diff --git a/TTS/tts/layers/losses.py b/TTS/tts/layers/losses.py index 4bc31d90..bf03671c 100644 --- a/TTS/tts/layers/losses.py +++ b/TTS/tts/layers/losses.py @@ -7,9 +7,8 @@ from TTS.tts.utils.generic_utils import sequence_mask class L1LossMasked(nn.Module): - def __init__(self, seq_len_norm): - super(L1LossMasked, self).__init__() + super().__init__() self.seq_len_norm = seq_len_norm def forward(self, x, target, length): @@ -28,25 +27,24 @@ class L1LossMasked(nn.Module): """ # mask: (batch, max_len, 1) target.requires_grad = False - mask = sequence_mask( - sequence_length=length, max_len=target.size(1)).unsqueeze(2).float() + mask = sequence_mask(sequence_length=length, + max_len=target.size(1)).unsqueeze(2).float() if self.seq_len_norm: norm_w = mask / mask.sum(dim=1, keepdim=True) out_weights = norm_w.div(target.shape[0] * target.shape[2]) mask = mask.expand_as(x) - loss = functional.l1_loss( - x * mask, target * mask, reduction='none') + loss = functional.l1_loss(x * mask, + target * mask, + reduction='none') loss = loss.mul(out_weights.to(loss.device)).sum() else: mask = mask.expand_as(x) - loss = functional.l1_loss( - x * mask, target * mask, reduction='sum') + loss = functional.l1_loss(x * mask, target * mask, reduction='sum') loss = loss / mask.sum() return loss class MSELossMasked(nn.Module): - def __init__(self, seq_len_norm): super(MSELossMasked, self).__init__() self.seq_len_norm = seq_len_norm @@ -67,19 +65,21 @@ class MSELossMasked(nn.Module): """ # mask: (batch, max_len, 1) target.requires_grad = False - mask = sequence_mask( - sequence_length=length, max_len=target.size(1)).unsqueeze(2).float() + mask = sequence_mask(sequence_length=length, + max_len=target.size(1)).unsqueeze(2).float() if self.seq_len_norm: norm_w = mask / mask.sum(dim=1, keepdim=True) out_weights = norm_w.div(target.shape[0] * target.shape[2]) mask = mask.expand_as(x) - loss = functional.mse_loss( - x * mask, target * mask, reduction='none') + loss = functional.mse_loss(x * mask, + target * mask, + reduction='none') loss = loss.mul(out_weights.to(loss.device)).sum() else: mask = mask.expand_as(x) - loss = functional.mse_loss( - x * mask, target * mask, reduction='sum') + loss = functional.mse_loss(x * mask, + target * mask, + reduction='sum') loss = loss / mask.sum() return loss @@ -100,7 +100,6 @@ class AttentionEntropyLoss(nn.Module): class BCELossMasked(nn.Module): - def __init__(self, pos_weight): super(BCELossMasked, self).__init__() self.pos_weight = pos_weight @@ -121,9 +120,13 @@ class BCELossMasked(nn.Module): """ # mask: (batch, max_len, 1) target.requires_grad = False - mask = sequence_mask(sequence_length=length, max_len=target.size(1)).float() + mask = sequence_mask(sequence_length=length, + max_len=target.size(1)).float() loss = functional.binary_cross_entropy_with_logits( - x * mask, target * mask, pos_weight=self.pos_weight, reduction='sum') + x * mask, + target * mask, + pos_weight=self.pos_weight, + reduction='sum') loss = loss / mask.sum() return loss @@ -139,7 +142,8 @@ class GuidedAttentionLoss(torch.nn.Module): max_olen = max(olens) ga_masks = torch.zeros((B, max_olen, max_ilen)) for idx, (ilen, olen) in enumerate(zip(ilens, olens)): - ga_masks[idx, :olen, :ilen] = self._make_ga_mask(ilen, olen, self.sigma) + ga_masks[idx, :olen, :ilen] = self._make_ga_mask( + ilen, olen, self.sigma) return ga_masks def forward(self, att_ws, ilens, olens): @@ -153,7 +157,8 @@ class GuidedAttentionLoss(torch.nn.Module): def _make_ga_mask(ilen, olen, sigma): grid_x, grid_y = torch.meshgrid(torch.arange(olen), torch.arange(ilen)) grid_x, grid_y = grid_x.float(), grid_y.float() - return 1.0 - torch.exp(-(grid_y / ilen - grid_x / olen) ** 2 / (2 * (sigma ** 2))) + return 1.0 - torch.exp(-(grid_y / ilen - grid_x / olen)**2 / + (2 * (sigma**2))) @staticmethod def _make_masks(ilens, olens): @@ -181,7 +186,8 @@ class TacotronLoss(torch.nn.Module): self.criterion_ga = GuidedAttentionLoss(sigma=ga_sigma) # stopnet loss # pylint: disable=not-callable - self.criterion_st = BCELossMasked(pos_weight=torch.tensor(stopnet_pos_weight)) if c.stopnet else None + self.criterion_st = BCELossMasked( + pos_weight=torch.tensor(stopnet_pos_weight)) if c.stopnet else None def forward(self, postnet_output, decoder_output, mel_input, linear_input, stopnet_output, stopnet_target, output_lens, decoder_b_output, @@ -219,19 +225,25 @@ class TacotronLoss(torch.nn.Module): # backward decoder loss (if enabled) if self.config.bidirectional_decoder: if self.config.loss_masking: - decoder_b_loss = self.criterion(torch.flip(decoder_b_output, dims=(1, )), mel_input, output_lens) + decoder_b_loss = self.criterion( + torch.flip(decoder_b_output, dims=(1, )), mel_input, + output_lens) else: - decoder_b_loss = self.criterion(torch.flip(decoder_b_output, dims=(1, )), mel_input) - decoder_c_loss = torch.nn.functional.l1_loss(torch.flip(decoder_b_output, dims=(1, )), decoder_output) + decoder_b_loss = self.criterion( + torch.flip(decoder_b_output, dims=(1, )), mel_input) + decoder_c_loss = torch.nn.functional.l1_loss( + torch.flip(decoder_b_output, dims=(1, )), decoder_output) loss += decoder_b_loss + decoder_c_loss return_dict['decoder_b_loss'] = decoder_b_loss return_dict['decoder_c_loss'] = decoder_c_loss # double decoder consistency loss (if enabled) if self.config.double_decoder_consistency: - decoder_b_loss = self.criterion(decoder_b_output, mel_input, output_lens) + decoder_b_loss = self.criterion(decoder_b_output, mel_input, + output_lens) # decoder_c_loss = torch.nn.functional.l1_loss(decoder_b_output, decoder_output) - attention_c_loss = torch.nn.functional.l1_loss(alignments, alignments_backwards) + attention_c_loss = torch.nn.functional.l1_loss( + alignments, alignments_backwards) loss += decoder_b_loss + attention_c_loss return_dict['decoder_coarse_loss'] = decoder_b_loss return_dict['decoder_ddc_loss'] = attention_c_loss @@ -248,7 +260,7 @@ class TacotronLoss(torch.nn.Module): class GlowTTSLoss(torch.nn.Module): def __init__(self): - super(GlowTTSLoss, self).__init__() + super().__init__() self.constant_factor = 0.5 * math.log(2 * math.pi) def forward(self, z, means, scales, log_det, y_lengths, o_dur_log, diff --git a/TTS/vocoder/models/fullband_melgan_generator.py b/TTS/vocoder/models/fullband_melgan_generator.py index 9f90ee17..52dcc75e 100644 --- a/TTS/vocoder/models/fullband_melgan_generator.py +++ b/TTS/vocoder/models/fullband_melgan_generator.py @@ -12,14 +12,13 @@ class FullbandMelganGenerator(MelganGenerator): upsample_factors=(2, 8, 2, 2), res_kernel=3, num_res_blocks=4): - super(FullbandMelganGenerator, - self).__init__(in_channels=in_channels, - out_channels=out_channels, - proj_kernel=proj_kernel, - base_channels=base_channels, - upsample_factors=upsample_factors, - res_kernel=res_kernel, - num_res_blocks=num_res_blocks) + super().__init__(in_channels=in_channels, + out_channels=out_channels, + proj_kernel=proj_kernel, + base_channels=base_channels, + upsample_factors=upsample_factors, + res_kernel=res_kernel, + num_res_blocks=num_res_blocks) @torch.no_grad() def inference(self, cond_features):