mirror of https://github.com/coqui-ai/TTS.git
Add speaker and emotion squeezer layers
parent a309edacb4
commit e069985f17
@@ -546,6 +546,14 @@ class VitsArgs(Coqpit):
     use_text_enc_spk_reversal_classifier: bool = False
     use_text_enc_emo_classifier: bool = False
 
+    # emotion and speaker embedding squeezer
+    use_emotion_embedding_squeezer: bool = False
+    emotion_embedding_squeezer_input_dim: int = 0
+    use_speaker_embedding_squeezer: bool = False
+    speaker_embedding_squeezer_input_dim: int = 0
+
+    use_speaker_embedding_as_emotion: bool = False
+
     # prosody encoder
     use_prosody_encoder: bool = False
     prosody_encoder_type: str = "gst"
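The new VitsArgs fields gate the squeezer layers and declare the width of the raw embeddings they consume; use_speaker_embedding_as_emotion additionally lets the speaker embedding stand in for an emotion embedding. A minimal usage sketch, not part of the commit, with illustrative dimensions:

    # Hypothetical config enabling both squeezers for 512-dim stored embeddings.
    from TTS.tts.models.vits import VitsArgs

    args = VitsArgs(
        use_emotion_embedding_squeezer=True,
        emotion_embedding_squeezer_input_dim=512,  # width of the raw emotion embeddings
        use_speaker_embedding_squeezer=True,
        speaker_embedding_squeezer_input_dim=512,  # width of the raw speaker d-vectors
    )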
@@ -676,7 +684,7 @@ class Vits(BaseTTS):
             dp_cond_embedding_dim += self.args.prosody_embedding_dim
 
         dp_extra_inp_dim = 0
-        if (self.args.use_emotion_embedding or self.args.use_external_emotions_embeddings) and not self.args.use_noise_scale_predictor:
+        if (self.args.use_emotion_embedding or self.args.use_external_emotions_embeddings or self.args.use_speaker_embedding_as_emotion) and not self.args.use_noise_scale_predictor:
             dp_extra_inp_dim += self.args.emotion_embedding_dim
 
         if self.args.use_prosody_encoder and not self.args.use_noise_scale_predictor:
@@ -754,6 +762,12 @@ class Vits(BaseTTS):
                 rel_attn_window_size=4,
             )
 
+        if self.args.use_emotion_embedding_squeezer:
+            self.emotion_embedding_squeezer = nn.Linear(in_features=self.args.emotion_embedding_squeezer_input_dim, out_features=self.args.emotion_embedding_dim)
+
+        if self.args.use_speaker_embedding_squeezer:
+            self.speaker_embedding_squeezer = nn.Linear(in_features=self.args.speaker_embedding_squeezer_input_dim, out_features=self.cond_embedding_dim)
+
         if self.args.use_text_enc_spk_reversal_classifier:
             self.speaker_text_enc_reversal_classifier = ReversalClassifier(
                 in_channels=self.args.hidden_channels
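A "squeezer" is a plain linear projection from the raw embedding width down to the width the rest of the model expects. A self-contained sketch, with illustrative dimensions, of the same projection-plus-normalization the forward pass applies:

    import torch
    import torch.nn as nn
    import torch.nn.functional as F

    # Squeeze a 512-dim embedding down to a 64-dim conditioning vector.
    emotion_embedding_squeezer = nn.Linear(in_features=512, out_features=64)

    eg = torch.randn(8, 512, 1)  # [batch, raw_dim, 1], the layout the model carries
    eg = F.normalize(emotion_embedding_squeezer(eg.squeeze(-1))).unsqueeze(-1)
    print(eg.shape)  # torch.Size([8, 64, 1])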
@@ -1162,6 +1176,15 @@ class Vits(BaseTTS):
         if self.args.use_language_embedding and lid is not None:
             lang_emb = self.emb_l(lid).unsqueeze(-1)
 
+        # squeezers
+        if self.args.use_emotion_embedding_squeezer:
+            if self.args.use_emotion_embedding or self.args.use_external_emotions_embeddings:
+                eg = F.normalize(self.emotion_embedding_squeezer(eg.squeeze(-1))).unsqueeze(-1)
+
+        if self.args.use_speaker_embedding_squeezer:
+            if self.args.use_speaker_embedding or self.args.use_d_vector_file:
+                g = F.normalize(self.speaker_embedding_squeezer(g.squeeze(-1))).unsqueeze(-1)
+
         # posterior encoder
         z, m_q, logs_q, y_mask = self.posterior_encoder(y, y_lengths, g=g)
 
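Both squeezed embeddings go through F.normalize, whose defaults (p=2, dim=1) give every batch row unit L2 norm, so the posterior encoder and the rest of the conditioning path always see unit-length vectors regardless of the scale of the stored embeddings. A quick check, shapes illustrative:

    import torch
    import torch.nn.functional as F

    g = torch.randn(4, 256)
    print(F.normalize(g).norm(dim=1))  # ~tensor([1., 1., 1., 1.])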
@@ -1350,6 +1373,15 @@ class Vits(BaseTTS):
         if self.args.use_language_embedding and lid is not None:
             lang_emb = self.emb_l(lid).unsqueeze(-1)
 
+        # squeezers
+        if self.args.use_emotion_embedding_squeezer:
+            if self.args.use_emotion_embedding or self.args.use_external_emotions_embeddings:
+                eg = F.normalize(self.emotion_embedding_squeezer(eg.squeeze(-1))).unsqueeze(-1)
+
+        if self.args.use_speaker_embedding_squeezer:
+            if self.args.use_speaker_embedding or self.args.use_d_vector_file:
+                g = F.normalize(self.speaker_embedding_squeezer(g.squeeze(-1))).unsqueeze(-1)
+
         # prosody embedding
         pros_emb = None
         if self.args.use_prosody_encoder:
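This inference-path hunk repeats the squeezer block from the training forward pass verbatim. Should the duplication ever need factoring out, the shared logic is small; a sketch with a hypothetical helper name:

    import torch.nn.functional as F

    def apply_squeezer(squeezer, emb):
        # [batch, in_dim, 1] -> [batch, out_dim, 1], L2-normalized; no-op on None.
        if emb is None:
            return None
        return F.normalize(squeezer(emb.squeeze(-1))).unsqueeze(-1)

The final hunk below switches one of the test configs from the prosody conditional flow module over to the new squeezers.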
@@ -38,19 +38,21 @@ config.model_args.use_speaker_embedding = False
 config.model_args.use_d_vector_file = True
 config.model_args.d_vector_file = "tests/data/ljspeech/speakers.json"
 config.model_args.speaker_embedding_channels = 128
-config.model_args.d_vector_dim = 256
+config.model_args.d_vector_dim = 100
 
 # emotion
 config.model_args.use_external_emotions_embeddings = True
 config.model_args.use_emotion_embedding = False
-config.model_args.emotion_embedding_dim = 256
+config.model_args.emotion_embedding_dim = 64
 config.model_args.external_emotions_embs_file = "tests/data/ljspeech/speakers.json"
 config.model_args.use_text_enc_spk_reversal_classifier = False
 
 
-config.model_args.use_prosody_conditional_flow_module = True
-config.model_args.prosody_conditional_flow_module_on_decoder = True
-config.model_args.use_text_enc_emo_classifier = True
+config.model_args.use_emotion_embedding_squeezer = True
+config.model_args.emotion_embedding_squeezer_input_dim = 256
+
+config.model_args.use_speaker_embedding_squeezer = True
+config.model_args.speaker_embedding_squeezer_input_dim = 256
 
 # consistency loss
 # config.model_args.use_emotion_encoder_as_loss = True
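The test deliberately mismatches the dimensions: speakers.json keeps its 256-dim vectors (both squeezer input dims are 256), while d_vector_dim drops to 100 and emotion_embedding_dim to 64, so both squeezers actually have to project. Assuming cond_embedding_dim resolves to d_vector_dim when d-vectors are used, the layers built in the earlier hunk come out as:

    import torch.nn as nn

    # Layer shapes implied by the test config above (sketch, not from the commit).
    emotion_embedding_squeezer = nn.Linear(in_features=256, out_features=64)
    speaker_embedding_squeezer = nn.Linear(in_features=256, out_features=100)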