diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py
index 526f8e7b..d76b0e22 100644
--- a/TTS/tts/models/vits.py
+++ b/TTS/tts/models/vits.py
@@ -558,10 +558,6 @@ class VitsArgs(Coqpit):
     use_prosody_enc_emo_classifier: bool = False
     use_noise_scale_predictor: bool = False
-
-    use_prosody_conditional_flow_module: bool = False
-    prosody_conditional_flow_module_on_decoder: bool = False
-
     use_latent_discriminator: bool = False
 
     detach_dp_input: bool = True
 
@@ -648,8 +644,8 @@ class Vits(BaseTTS):
             self.args.kernel_size_text_encoder,
             self.args.dropout_p_text_encoder,
             language_emb_dim=self.embedded_language_dim,
-            emotion_emb_dim=self.args.emotion_embedding_dim if not self.args.use_prosody_conditional_flow_module and not self.args.use_noise_scale_predictor else 0,
-            prosody_emb_dim=self.args.prosody_embedding_dim if not self.args.use_prosody_conditional_flow_module and not self.args.use_noise_scale_predictor else 0,
+            emotion_emb_dim=self.args.emotion_embedding_dim if not self.args.use_noise_scale_predictor else 0,
+            prosody_emb_dim=self.args.prosody_embedding_dim if not self.args.use_noise_scale_predictor else 0,
         )
 
         self.posterior_encoder = PosteriorEncoder(
@@ -680,10 +676,10 @@ class Vits(BaseTTS):
             dp_cond_embedding_dim += self.args.prosody_embedding_dim
 
         dp_extra_inp_dim = 0
-        if (self.args.use_emotion_embedding or self.args.use_external_emotions_embeddings) and not self.args.use_prosody_conditional_flow_module and not self.args.use_noise_scale_predictor:
+        if (self.args.use_emotion_embedding or self.args.use_external_emotions_embeddings) and not self.args.use_noise_scale_predictor:
             dp_extra_inp_dim += self.args.emotion_embedding_dim
 
-        if self.args.use_prosody_encoder and not self.args.use_prosody_conditional_flow_module and not self.args.use_noise_scale_predictor:
+        if self.args.use_prosody_encoder and not self.args.use_noise_scale_predictor:
             dp_extra_inp_dim += self.args.prosody_embedding_dim
 
         if self.args.use_sdp:
@@ -737,23 +733,6 @@ class Vits(BaseTTS):
                 reversal=False
             )
 
-        if self.args.use_prosody_conditional_flow_module:
-            cond_embedding_dim = 0
-            if self.args.use_emotion_embedding or self.args.use_external_emotions_embeddings:
-                cond_embedding_dim += self.args.emotion_embedding_dim
-
-            if self.args.use_prosody_encoder:
-                cond_embedding_dim += self.args.prosody_embedding_dim
-
-            self.prosody_conditional_module = ResidualCouplingBlocks(
-                self.args.hidden_channels,
-                self.args.hidden_channels,
-                kernel_size=self.args.kernel_size_flow,
-                dilation_rate=self.args.dilation_rate_flow,
-                num_layers=2,
-                cond_channels=cond_embedding_dim,
-            )
-
         if self.args.use_noise_scale_predictor:
             noise_scale_predictor_input_dim = self.args.hidden_channels
             if self.args.use_emotion_embedding or self.args.use_external_emotions_embeddings:
@@ -1212,8 +1191,8 @@ class Vits(BaseTTS):
             x,
             x_lengths,
             lang_emb=lang_emb,
-            emo_emb=eg if not self.args.use_prosody_conditional_flow_module and not self.args.use_noise_scale_predictor else None,
-            pros_emb=pros_emb if not self.args.use_prosody_conditional_flow_module and not self.args.use_noise_scale_predictor else None
+            emo_emb=eg if not self.args.use_noise_scale_predictor else None,
+            pros_emb=pros_emb if not self.args.use_noise_scale_predictor else None
         )
 
         # reversal speaker loss to force the encoder to be speaker identity free
@@ -1221,23 +1200,10 @@ class Vits(BaseTTS):
         if self.args.use_text_enc_spk_reversal_classifier:
             _, l_text_speaker = self.speaker_text_enc_reversal_classifier(x.transpose(1, 2), sid, x_mask=None)
 
-        # reversal speaker loss to force the encoder to be speaker identity free
         l_text_emotion = None
-        if self.args.use_text_enc_emo_classifier:
-            if self.args.prosody_conditional_flow_module_on_decoder:
-                _, l_text_emotion = self.emo_text_enc_classifier(z_p.transpose(1, 2), eid, x_mask=y_mask)
-
-        # conditional module
-        if self.args.use_prosody_conditional_flow_module:
-            if not self.args.prosody_conditional_flow_module_on_decoder:
-                m_p = self.prosody_conditional_module(m_p, x_mask, g=eg if (self.args.use_emotion_embedding or self.args.use_external_emotions_embeddings) else pros_emb)
-            else:
-                z_p = self.prosody_conditional_module(z_p, y_mask, g=eg if (self.args.use_emotion_embedding or self.args.use_external_emotions_embeddings) else pros_emb)
-
         # reversal speaker loss to force the encoder to be speaker identity free
         if self.args.use_text_enc_emo_classifier:
-            if not self.args.prosody_conditional_flow_module_on_decoder:
-                _, l_text_emotion = self.emo_text_enc_classifier(m_p.transpose(1, 2), eid, x_mask=x_mask)
+            _, l_text_emotion = self.emo_text_enc_classifier(m_p.transpose(1, 2), eid, x_mask=x_mask)
 
         # duration predictor
         g_dp = g if self.args.condition_dp_on_speaker else None
@@ -1406,15 +1372,10 @@ class Vits(BaseTTS):
             x,
             x_lengths,
             lang_emb=lang_emb,
-            emo_emb=eg if not self.args.use_prosody_conditional_flow_module and not self.args.use_noise_scale_predictor else None,
-            pros_emb=pros_emb if not self.args.use_prosody_conditional_flow_module and not self.args.use_noise_scale_predictor else None
+            emo_emb=eg if not self.args.use_noise_scale_predictor else None,
+            pros_emb=pros_emb if not self.args.use_noise_scale_predictor else None
         )
 
-        # conditional module
-        if self.args.use_prosody_conditional_flow_module:
-            if not self.args.prosody_conditional_flow_module_on_decoder:
-                m_p = self.prosody_conditional_module(m_p, x_mask, g=eg if (self.args.use_emotion_embedding or self.args.use_external_emotions_embeddings) else pros_emb)
-
         # duration predictor
         g_dp = g if self.args.condition_dp_on_speaker else None
         if eg is not None and (self.args.use_emotion_embedding or self.args.use_external_emotions_embeddings):
@@ -1466,11 +1427,6 @@ class Vits(BaseTTS):
         else:
             z_p = m_p + torch.randn_like(m_p) * torch.exp(logs_p) * self.inference_noise_scale
 
-        # conditional module
-        if self.args.use_prosody_conditional_flow_module:
-            if self.args.prosody_conditional_flow_module_on_decoder:
-                z_p = self.prosody_conditional_module(z_p, y_mask, g=eg if (self.args.use_emotion_embedding or self.args.use_external_emotions_embeddings) else pros_emb, reverse=True)
-
         z = self.flow(z_p, y_mask, g=g, reverse=True)
 
         # upsampling if needed
@@ -1507,7 +1463,7 @@ class Vits(BaseTTS):
 
     @torch.no_grad()
     def inference_voice_conversion(
-        self, reference_wav, speaker_id=None, d_vector=None, reference_speaker_id=None, reference_d_vector=None, ref_emotion=None, target_emotion=None
+        self, reference_wav, speaker_id=None, d_vector=None, reference_speaker_id=None, reference_d_vector=None
     ):
         """Inference for voice conversion
 
@@ -1530,11 +1486,11 @@ class Vits(BaseTTS):
         speaker_cond_src = reference_speaker_id if reference_speaker_id is not None else reference_d_vector
         speaker_cond_tgt = speaker_id if speaker_id is not None else d_vector
         # print(y.shape, y_lengths.shape)
-        wav, _, _ = self.voice_conversion(y, y_lengths, speaker_cond_src, speaker_cond_tgt, ref_emotion, target_emotion)
+        wav, _, _ = self.voice_conversion(y, y_lengths, speaker_cond_src, speaker_cond_tgt)
         return wav
 
 
-    def voice_conversion(self, y, y_lengths, speaker_cond_src, speaker_cond_tgt, ref_emotion=None, target_emotion=None):
+    def voice_conversion(self, y, y_lengths, speaker_cond_src, speaker_cond_tgt):
         """Forward pass for voice conversion
 
         TODO: create an end-point for voice conversion
@@ -1555,27 +1511,9 @@ class Vits(BaseTTS):
             g_tgt = F.normalize(speaker_cond_tgt).unsqueeze(-1)
         else:
             raise RuntimeError(" [!] Voice conversion is only supported on multi-speaker models.")
 
-        # emotion embedding
-        ge_src, ge_tgt = None, None
-        if self.args.use_emotion_embedding and ref_emotion is not None and target_emotion is not None:
-            ge_src = self.emb_g(ref_emotion).unsqueeze(-1)
-            ge_tgt = self.emb_g(target_emotion).unsqueeze(-1)
-        elif self.args.use_external_emotions_embeddings and ref_emotion is not None and target_emotion is not None:
-            ge_src = F.normalize(ref_emotion).unsqueeze(-1)
-            ge_tgt = F.normalize(target_emotion).unsqueeze(-1)
         z, _, _, y_mask = self.posterior_encoder(y, y_lengths, g=g_src)
         z_p = self.flow(z, y_mask, g=g_src)
-
-        # change the emotion
-        if ge_tgt is not None and ge_tgt is not None and self.args.use_prosody_conditional_flow_module:
-            if not self.args.prosody_conditional_flow_module_on_decoder:
-                ze = self.prosody_conditional_module(z_p, y_mask, g=ge_src, reverse=True)
-                z_p = self.prosody_conditional_module(ze, y_mask, g=ge_tgt)
-            else:
-                ze = self.prosody_conditional_module(z_p, y_mask, g=ge_src)
-                z_p = self.prosody_conditional_module(ze, y_mask, g=ge_tgt, reverse=True)
-
         z_hat = self.flow(z_p, y_mask, g=g_tgt, reverse=True)
         o_hat = self.waveform_decoder(z_hat * y_mask, g=g_tgt)
         return o_hat, y_mask, (z, z_p, z_hat)
diff --git a/TTS/tts/utils/synthesis.py b/TTS/tts/utils/synthesis.py
index 82ce79c6..b625b39e 100644
--- a/TTS/tts/utils/synthesis.py
+++ b/TTS/tts/utils/synthesis.py
@@ -281,8 +281,6 @@ def transfer_voice(
     reference_d_vector=None,
     do_trim_silence=False,
     use_griffin_lim=False,
-    source_emotion_feature=None,
-    target_emotion_feature=None,
 ):
     """Synthesize voice for the given text using Griffin-Lim vocoder or just compute output features to be passed to
     the vocoder model.
@@ -328,12 +326,6 @@
     if reference_d_vector is not None:
         reference_d_vector = embedding_to_torch(reference_d_vector, cuda=use_cuda)
 
-    if source_emotion_feature is not None:
-        source_emotion_feature = embedding_to_torch(source_emotion_feature, cuda=use_cuda)
-
-    if target_emotion_feature is not None:
-        target_emotion_feature = embedding_to_torch(target_emotion_feature, cuda=use_cuda)
-
     # load reference_wav audio
     reference_wav = embedding_to_torch(model.ap.load_wav(reference_wav, sr=model.ap.sample_rate), cuda=use_cuda)
 
@@ -341,7 +333,7 @@
         _func = model.module.inference_voice_conversion
     else:
         _func = model.inference_voice_conversion
-    model_outputs = _func(reference_wav, speaker_id, d_vector, reference_speaker_id, reference_d_vector, ref_emotion=source_emotion_feature, target_emotion=target_emotion_feature)
+    model_outputs = _func(reference_wav, speaker_id, d_vector, reference_speaker_id, reference_d_vector)
 
     # convert outputs to numpy
     # plot results
diff --git a/TTS/utils/synthesizer.py b/TTS/utils/synthesizer.py
index 4283750d..5ff7e41f 100644
--- a/TTS/utils/synthesizer.py
+++ b/TTS/utils/synthesizer.py
@@ -214,8 +214,6 @@ class Synthesizer(object):
         reference_wav=None,
         reference_speaker_name=None,
         emotion_name=None,
-        source_emotion=None,
-        target_emotion=None,
         style_speaker_name=None,
     ) -> List[int]:
         """🐸 TTS magic. Run all the models and generate speech.
@@ -418,32 +416,6 @@
                 reference_speaker_embedding = self.tts_model.speaker_manager.compute_embedding_from_clip(
                     reference_wav
                 )
 
-            # get the emotions embeddings
-            # handle emotion
-            source_emotion_feature, target_emotion_feature = None, None
-            if source_emotion is not None and target_emotion is not None and not getattr(self.tts_model, "prosody_encoder", False) and (self.tts_emotions_file or (
-                getattr(self.tts_model, "emotion_manager", None) and getattr(self.tts_model.emotion_manager, "ids", None)
-            )):  # pylint: disable=R0916
-                if source_emotion and isinstance(source_emotion, str):
-                    if getattr(self.tts_config, "use_external_emotions_embeddings", False) or (
-                        getattr(self.tts_config, "model_args", None)
-                        and getattr(self.tts_config.model_args, "use_external_emotions_embeddings", False)
-                    ):
-                        # get the average emotion embedding from the saved embeddings.
-                        source_emotion_feature = self.tts_model.emotion_manager.get_mean_embedding(
-                            source_emotion, num_samples=None, randomize=False
-                        )
-                        source_emotion_feature = np.array(source_emotion_feature)[None, :]  # [1 x embedding_dim]
-                        # target
-                        target_emotion_feature = self.tts_model.emotion_manager.get_mean_embedding(
-                            target_emotion, num_samples=None, randomize=False
-                        )
-                        target_emotion_feature = np.array(target_emotion_feature)[None, :]  # [1 x embedding_dim]
-                    else:
-                        # get emotion idx
-                        source_emotion_feature = self.tts_model.emotion_manager.ids[source_emotion]
-                        target_emotion_feature = self.tts_model.emotion_manager.ids[target_emotion]
-
             outputs = transfer_voice(
                 model=self.tts_model,
@@ -454,9 +426,7 @@
                 d_vector=speaker_embedding,
                 use_griffin_lim=use_gl,
                 reference_speaker_id=reference_speaker_id,
-                reference_d_vector=reference_speaker_embedding,
-                source_emotion_feature=source_emotion_feature,
-                target_emotion_feature=target_emotion_feature,
+                reference_d_vector=reference_speaker_embedding
             )
             waveform = outputs
             if not use_gl: