diff --git a/TTS/tts/layers/generic/classifier.py b/TTS/tts/layers/generic/classifier.py
index b09ccdc6..1cd60006 100644
--- a/TTS/tts/layers/generic/classifier.py
+++ b/TTS/tts/layers/generic/classifier.py
@@ -60,4 +60,3 @@ class ReversalClassifier(nn.Module):
         target = labels.repeat(input_mask.size(-1), 1).transpose(0, 1).int().long()
         target[~input_mask] = ignore_index
         return nn.functional.cross_entropy(predictions.transpose(1, 2), target, ignore_index=ignore_index)
-
diff --git a/TTS/tts/layers/losses.py b/TTS/tts/layers/losses.py
index 118286a7..89a11139 100644
--- a/TTS/tts/layers/losses.py
+++ b/TTS/tts/layers/losses.py
@@ -761,20 +761,14 @@ class VitsGeneratorLoss(nn.Module):
 
             loss += kl_vae_loss
             return_dict["loss_kl_vae"] = kl_vae_loss
-    
+
         if end2end_info is not None:
-            # do not compute feature loss because for it we need waves segments with the same length
-            '''loss_feat_end2end = (
-                self.feature_loss(feats_real=end2end_info["feats_disc_real"], feats_generated=end2end_info["feats_disc_fake"]) * self.feat_loss_alpha
-            )
-            return_dict["loss_feat_end2end"] = loss_feat_end2end
-            loss += loss_feat_end2end'''
 
             # gen loss
             loss_gen_end2end = self.generator_loss(scores_fake=end2end_info["scores_disc_fake"])[0] * self.gen_loss_alpha
             return_dict["loss_gen_end2end"] = loss_gen_end2end
             loss += loss_gen_end2end
-    
+
             # if do not uses soft dtw
             if end2end_info["z_predicted"] is not None:
                 # loss KL using GT durations
@@ -793,7 +787,7 @@ class VitsGeneratorLoss(nn.Module):
             else:
                 pass
                 # ToDo: implement soft dtw
-    
+
         # pass losses to the dict
         return_dict["loss_gen"] = loss_gen
         return_dict["loss_kl"] = loss_kl
@@ -854,7 +848,7 @@ class VitsDiscriminatorLoss(nn.Module):
 
             loss_disc_end2end, loss_disc_real_end2end, _ = self.discriminator_loss(
                 scores_real=end2end_info["scores_disc_real"], scores_fake=end2end_info["scores_disc_fake"],
             )
-            return_dict["loss_disc_end2end"] = loss_disc_end2end * self.disc_loss_alpha 
+            return_dict["loss_disc_end2end"] = loss_disc_end2end * self.disc_loss_alpha
             return_dict["loss"] += return_dict["loss_disc_end2end"]
             for i, ldr in enumerate(loss_disc_real_end2end):
diff --git a/TTS/tts/layers/vits/discriminator.py b/TTS/tts/layers/vits/discriminator.py
index 6efcc069..8ec67d1e 100644
--- a/TTS/tts/layers/vits/discriminator.py
+++ b/TTS/tts/layers/vits/discriminator.py
@@ -94,9 +94,9 @@ class VitsDiscriminator(nn.Module):
         mp_scores, zp_scores, mp_feats, zp_feats = None, None, None, None
         if self.disc_latent is not None:
             if m_p is not None:
-                mp_scores, mp_feats = self.disc_latent(m_p.unsqueeze(1))
+                mp_scores, mp_feats = self.disc_latent(m_p.unsqueeze(-1))
             if z_p is not None:
-                zp_scores, zp_feats = self.disc_latent(z_p.unsqueeze(1))
+                zp_scores, zp_feats = self.disc_latent(z_p.unsqueeze(-1))
 
         return x_scores, x_feats, x_hat_scores, x_hat_feats, mp_scores, mp_feats, zp_scores, zp_feats
 
@@ -107,7 +107,6 @@ class LatentDiscriminator(nn.Module):
     def __init__(self, use_spectral_norm=False, hidden_channels=None):
         super().__init__()
         norm_f = nn.utils.spectral_norm if use_spectral_norm else nn.utils.weight_norm
-        self.hidden_channels = hidden_channels
         self.discriminators = nn.ModuleList(
             [
                 norm_f(nn.Conv2d(1 if hidden_channels is None else hidden_channels, 32, kernel_size=(3, 9), padding=(1, 4))),
@@ -122,8 +121,6 @@ class LatentDiscriminator(nn.Module):
 
     def forward(self, y):
         fmap = []
-        if self.hidden_channels is not None:
-            y = y.squeeze(1).unsqueeze(-1)
         for _, d in enumerate(self.discriminators):
             y = d(y)
             y = torch.nn.functional.leaky_relu(y, 0.1)
diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py
index bc96d0c3..ca5d607e 100644
--- a/TTS/tts/models/vits.py
+++ b/TTS/tts/models/vits.py
@@ -20,7 +20,7 @@ from TTS.tts.datasets.dataset import TTSDataset, _parse_sample
 from TTS.tts.layers.generic.classifier import ReversalClassifier
 from TTS.tts.layers.glow_tts.duration_predictor import DurationPredictor
 from TTS.tts.layers.vits.prosody_encoder import VitsGST, VitsVAE
-from TTS.tts.layers.vits.discriminator import VitsDiscriminator, LatentDiscriminator
+from TTS.tts.layers.vits.discriminator import VitsDiscriminator
 from TTS.tts.layers.vits.networks import PosteriorEncoder, ResidualCouplingBlocks, TextEncoder
 from TTS.tts.layers.glow_tts.transformer import RelativePositionTransformer
 from TTS.tts.layers.vits.stochastic_duration_predictor import StochasticDurationPredictor
@@ -556,7 +556,7 @@ class VitsArgs(Coqpit):
     use_prosody_encoder_z_p_input: bool = False
     use_prosody_enc_spk_reversal_classifier: bool = False
     use_prosody_enc_emo_classifier: bool = False
-    
+
     use_noise_scale_predictor: bool = False
 
     use_prosody_conditional_flow_module: bool = False
@@ -567,7 +567,6 @@ class VitsArgs(Coqpit):
     use_soft_dtw: bool = False
 
     use_latent_discriminator: bool = False
-    provide_hidden_dim_on_the_latent_discriminator: bool = False
 
     detach_dp_input: bool = True
     use_language_embedding: bool = False
@@ -686,10 +685,10 @@ class Vits(BaseTTS):
 
         dp_extra_inp_dim = 0
         if (self.args.use_emotion_embedding or self.args.use_external_emotions_embeddings) and not self.args.use_prosody_conditional_flow_module and not self.args.use_noise_scale_predictor:
-            dp_extra_inp_dim += self.args.emotion_embedding_dim 
+            dp_extra_inp_dim += self.args.emotion_embedding_dim
 
         if self.args.use_prosody_encoder and not self.args.use_prosody_conditional_flow_module and not self.args.use_noise_scale_predictor:
-            dp_extra_inp_dim += self.args.prosody_embedding_dim 
+            dp_extra_inp_dim += self.args.prosody_embedding_dim
 
         if self.args.use_sdp:
             self.duration_predictor = StochasticDurationPredictor(
@@ -724,7 +723,7 @@ class Vits(BaseTTS):
                     num_mel=self.args.hidden_channels,
                     capacitron_VAE_embedding_dim=self.args.prosody_embedding_dim,
                 )
-            else: 
+            else:
                 raise RuntimeError(
                     f" [!] The Prosody encoder type {self.args.prosody_encoder_type} is not supported !!"
                 )
@@ -734,7 +733,7 @@ class Vits(BaseTTS):
                 out_channels=self.num_speakers,
                 hidden_channels=256,
             )
-        if self.args.use_prosody_enc_emo_classifier: 
+        if self.args.use_prosody_enc_emo_classifier:
             self.pros_enc_emotion_classifier = ReversalClassifier(
                 in_channels=self.args.prosody_embedding_dim,
                 out_channels=self.num_emotions,
@@ -817,7 +816,7 @@ class Vits(BaseTTS):
                 periods=self.args.periods_multi_period_discriminator,
                 use_spectral_norm=self.args.use_spectral_norm_disriminator,
                 use_latent_disc=self.args.use_latent_discriminator,
-                hidden_channels=self.args.hidden_channels if self.args.provide_hidden_dim_on_the_latent_discriminator else None,
+                hidden_channels=self.args.hidden_channels,
             )
 
     def init_multispeaker(self, config: Coqpit):
@@ -952,7 +951,7 @@ class Vits(BaseTTS):
             if value == before_dict[key]:
                 raise RuntimeError(" [!] The weights of Text Encoder was not reinit check it !")
         print(" > Text Encoder was reinit.")
-    
+
     def init_emotion(self, emotion_manager: EmotionManager):  # pylint: disable=attribute-defined-outside-init
         """Initialize emotion modules of a model.
         A model can be trained either with a emotion embedding layer
@@ -1345,7 +1344,7 @@ class Vits(BaseTTS):
 
             z_p_end2end = self.prosody_conditional_module(z_p_end2end, y_mask_end2end, g=eg if (self.args.use_emotion_embedding or self.args.use_external_emotions_embeddings) else pros_emb, reverse=True)
         z_end2end = self.flow(z_p_end2end, y_mask_end2end, g=g, reverse=True)
-    
+
         # interpolate z if needed
         z_end2end, _, _, y_mask_end2end = self.upsampling_z(z, y_lengths=y_lengths_end2end, y_mask=y_mask_end2end)
         # z_slice_end2end, spec_segment_size, slice_ids_end2end, _ = self.upsampling_z(z_slice_end2end, slice_ids=slice_ids_end2end)
@@ -1505,7 +1504,7 @@ class Vits(BaseTTS):
 
         m_p = torch.matmul(attn.transpose(1, 2), m_p.transpose(1, 2)).transpose(1, 2)
         logs_p = torch.matmul(attn.transpose(1, 2), logs_p.transpose(1, 2)).transpose(1, 2)
-    
+
         if self.args.use_noise_scale_predictor:
             nsp_input = torch.transpose(m_p, 1, -1)
             if self.args.use_prosody_encoder and pros_emb is not None:
@@ -1850,7 +1849,7 @@ class Vits(BaseTTS):
 
         if style_wav and style_speaker_name is None:
             raise RuntimeError(
-                f" [!] You must to provide the style_speaker_name for the style_wav !!"
+                " [!] You must to provide the style_speaker_name for the style_wav !!"
            )
 
         # get speaker id/d_vector
diff --git a/TTS/utils/synthesizer.py b/TTS/utils/synthesizer.py
index 3e328506..4283750d 100644
--- a/TTS/utils/synthesizer.py
+++ b/TTS/utils/synthesizer.py
@@ -423,7 +423,7 @@ class Synthesizer(object):
         source_emotion_feature, target_emotion_feature = None, None
         if source_emotion is not None and target_emotion is not None and not getattr(self.tts_model, "prosody_encoder", False) and (self.tts_emotions_file or (
             getattr(self.tts_model, "emotion_manager", None) and getattr(self.tts_model.emotion_manager, "ids", None)
-        )):
+        )):  # pylint: disable=R0916
             if source_emotion and isinstance(source_emotion, str):
                 if getattr(self.tts_config, "use_external_emotions_embeddings", False) or (
                     getattr(self.tts_config, "model_args", None)
diff --git a/tests/tts_tests/test_vits_speaker_emb_with_prosody_encoder.py b/tests/tts_tests/test_vits_speaker_emb_with_prosody_encoder.py
index 4a160fa5..d167d211 100644
--- a/tests/tts_tests/test_vits_speaker_emb_with_prosody_encoder.py
+++ b/tests/tts_tests/test_vits_speaker_emb_with_prosody_encoder.py
@@ -53,7 +53,6 @@ config.model_args.prosody_encoder_type = "gst"
 config.model_args.detach_prosody_enc_input = True
 
 config.model_args.use_latent_discriminator = True
-config.model_args.provide_hidden_dim_on_the_latent_discriminator = True
 config.model_args.use_noise_scale_predictor = False
 
 # enable end2end loss