From bf45319f643a5b7ed4f65f5cfdb9a92305658239 Mon Sep 17 00:00:00 2001
From: Edresson Casanova
Date: Tue, 7 Jun 2022 09:27:08 -0300
Subject: [PATCH] Add speaker and emotion squeezer layers

---
 TTS/tts/models/vits.py                        | 34 ++++++++++++++++++-
 ...est_vits_speaker_emb_with_emotion_train.py | 12 ++++---
 2 files changed, 40 insertions(+), 6 deletions(-)

diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py
index d76b0e22..ee8ee7bb 100644
--- a/TTS/tts/models/vits.py
+++ b/TTS/tts/models/vits.py
@@ -546,6 +546,14 @@ class VitsArgs(Coqpit):
     use_text_enc_spk_reversal_classifier: bool = False
     use_text_enc_emo_classifier: bool = False
 
+    # emotion and speaker embedding squeezer
+    use_emotion_embedding_squeezer: bool = False
+    emotion_embedding_squeezer_input_dim: int = 0
+    use_speaker_embedding_squeezer: bool = False
+    speaker_embedding_squeezer_input_dim: int = 0
+
+    use_speaker_embedding_as_emotion: bool = False
+
     # prosody encoder
     use_prosody_encoder: bool = False
     prosody_encoder_type: str = "gst"
@@ -676,7 +684,7 @@ class Vits(BaseTTS):
             dp_cond_embedding_dim += self.args.prosody_embedding_dim
 
         dp_extra_inp_dim = 0
-        if (self.args.use_emotion_embedding or self.args.use_external_emotions_embeddings) and not self.args.use_noise_scale_predictor:
+        if (self.args.use_emotion_embedding or self.args.use_external_emotions_embeddings or self.args.use_speaker_embedding_as_emotion) and not self.args.use_noise_scale_predictor:
             dp_extra_inp_dim += self.args.emotion_embedding_dim
 
         if self.args.use_prosody_encoder and not self.args.use_noise_scale_predictor:
@@ -754,6 +762,12 @@ class Vits(BaseTTS):
                 rel_attn_window_size=4,
             )
 
+        if self.args.use_emotion_embedding_squeezer:
+            self.emotion_embedding_squeezer = nn.Linear(in_features=self.args.emotion_embedding_squeezer_input_dim, out_features=self.args.emotion_embedding_dim)
+
+        if self.args.use_speaker_embedding_squeezer:
+            self.speaker_embedding_squeezer = nn.Linear(in_features=self.args.speaker_embedding_squeezer_input_dim, out_features=self.cond_embedding_dim)
+
         if self.args.use_text_enc_spk_reversal_classifier:
             self.speaker_text_enc_reversal_classifier = ReversalClassifier(
                 in_channels=self.args.hidden_channels
@@ -1162,6 +1176,15 @@ class Vits(BaseTTS):
         if self.args.use_language_embedding and lid is not None:
             lang_emb = self.emb_l(lid).unsqueeze(-1)
 
+        # squeezers
+        if self.args.use_emotion_embedding_squeezer:
+            if self.args.use_emotion_embedding or self.args.use_external_emotions_embeddings:
+                eg = F.normalize(self.emotion_embedding_squeezer(eg.squeeze(-1))).unsqueeze(-1)
+
+        if self.args.use_speaker_embedding_squeezer:
+            if self.args.use_speaker_embedding or self.args.use_d_vector_file:
+                g = F.normalize(self.speaker_embedding_squeezer(g.squeeze(-1))).unsqueeze(-1)
+
         # posterior encoder
         z, m_q, logs_q, y_mask = self.posterior_encoder(y, y_lengths, g=g)
 
@@ -1350,6 +1373,15 @@ class Vits(BaseTTS):
         if self.args.use_language_embedding and lid is not None:
             lang_emb = self.emb_l(lid).unsqueeze(-1)
 
+        # squeezers
+        if self.args.use_emotion_embedding_squeezer:
+            if self.args.use_emotion_embedding or self.args.use_external_emotions_embeddings:
+                eg = F.normalize(self.emotion_embedding_squeezer(eg.squeeze(-1))).unsqueeze(-1)
+
+        if self.args.use_speaker_embedding_squeezer:
+            if self.args.use_speaker_embedding or self.args.use_d_vector_file:
+                g = F.normalize(self.speaker_embedding_squeezer(g.squeeze(-1))).unsqueeze(-1)
+
         # prosody embedding
         pros_emb = None
         if self.args.use_prosody_encoder:
diff --git a/tests/tts_tests/test_vits_speaker_emb_with_emotion_train.py b/tests/tts_tests/test_vits_speaker_emb_with_emotion_train.py
index e89e538b..28b8f203 100644
--- a/tests/tts_tests/test_vits_speaker_emb_with_emotion_train.py
+++ b/tests/tts_tests/test_vits_speaker_emb_with_emotion_train.py
@@ -38,19 +38,21 @@ config.model_args.use_speaker_embedding = False
 config.model_args.use_d_vector_file = True
 config.model_args.d_vector_file = "tests/data/ljspeech/speakers.json"
 config.model_args.speaker_embedding_channels = 128
-config.model_args.d_vector_dim = 256
+config.model_args.d_vector_dim = 100
 
 # emotion
 config.model_args.use_external_emotions_embeddings = True
 config.model_args.use_emotion_embedding = False
-config.model_args.emotion_embedding_dim = 256
+config.model_args.emotion_embedding_dim = 64
 config.model_args.external_emotions_embs_file = "tests/data/ljspeech/speakers.json"
 config.model_args.use_text_enc_spk_reversal_classifier = False
 
-config.model_args.use_prosody_conditional_flow_module = True
-config.model_args.prosody_conditional_flow_module_on_decoder = True
-config.model_args.use_text_enc_emo_classifier = True
+config.model_args.use_emotion_embedding_squeezer = True
+config.model_args.emotion_embedding_squeezer_input_dim = 256
+
+config.model_args.use_speaker_embedding_squeezer = True
+config.model_args.speaker_embedding_squeezer_input_dim = 256
 
 # consistency loss
 # config.model_args.use_emotion_encoder_as_loss = True