diff --git a/mozilla_voice_tts/bin/train_tts.py b/mozilla_voice_tts/bin/train_tts.py
index 1b9bc032..f7bb0b60 100644
--- a/mozilla_voice_tts/bin/train_tts.py
+++ b/mozilla_voice_tts/bin/train_tts.py
@@ -523,6 +523,7 @@ def main(args): # pylint: disable=redefined-outer-name
                 "a previously trained model."
         elif c.use_external_speaker_embedding_file and c.external_speaker_embedding_file: # if start new train using External Embedding file
             speaker_mapping = load_speaker_mapping(c.external_speaker_embedding_file)
+            print(speaker_mapping)
             speaker_embedding_dim = len(speaker_mapping[list(speaker_mapping.keys())[0]]['embedding'])
         elif c.use_external_speaker_embedding_file and not c.external_speaker_embedding_file: # if start new train using External Embedding file and don't pass external embedding file
             raise "use_external_speaker_embedding_file is True, so you need pass a external speaker embedding file, run GE2E-Speaker_Encoder-ExtractSpeakerEmbeddings-by-sample.ipynb or AngularPrototypical-Speaker_Encoder-ExtractSpeakerEmbeddings-by-sample.ipynb notebook in notebooks/ folder"
@@ -558,6 +559,8 @@ def main(args): # pylint: disable=redefined-outer-name

     # setup criterion
     criterion = TacotronLoss(c, stopnet_pos_weight=10.0, ga_sigma=0.4)
+    for name, _ in model.named_parameters():
+        print(name)

     if args.restore_path:
         checkpoint = torch.load(args.restore_path, map_location='cpu')
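
For reference, a minimal sketch of the lookup behind the speaker_embedding_dim line above. The file layout ({"<clip_id>": {"name": ..., "embedding": [...]}, ...}) is an assumption based on the extraction notebooks named in the error message; only the 'embedding' key is actually relied on, and load_speaker_mapping is replaced by a plain json.load for illustration.

# Illustration only, not part of the patch: derive the embedding size the same
# way train_tts.py does, from the first entry of the external speaker mapping.
import json

def get_speaker_embedding_dim(embedding_file_path):
    with open(embedding_file_path, 'r') as f:
        speaker_mapping = json.load(f)  # assumed: {"<clip_id>": {"embedding": [...], ...}, ...}
    first_key = list(speaker_mapping.keys())[0]
    return len(speaker_mapping[first_key]['embedding'])
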
diff --git a/mozilla_voice_tts/tts/configs/config.json b/mozilla_voice_tts/tts/configs/config.json
index 70529fea..2a61ba03 100644
--- a/mozilla_voice_tts/tts/configs/config.json
+++ b/mozilla_voice_tts/tts/configs/config.json
@@ -1,5 +1,5 @@
 {
-    "model": "Tacotron",
+    "model": "Tacotron2",
     "run_name": "ljspeech-ddc-bn",
     "run_description": "tacotron2 with ddc and batch-normalization",

@@ -114,7 +114,7 @@
     "tb_model_param_stats": false,     // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging.

     // DATA LOADING
-    "text_cleaner": "portuguese_cleaners",
+    "text_cleaner": "phoneme_cleaners",
     "enable_eos_bos_chars": false, // enable/disable beginning of sentence and end of sentence chars.
     "num_loader_workers": 4,        // number of training data loader processes. Don't set it too big. 4-8 are good values.
     "num_val_loader_workers": 4,    // number of evaluation data loader processes.
@@ -131,23 +131,19 @@
     "phoneme_language": "en-us",     // depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages

     // MULTI-SPEAKER and GST
-<<<<<<< HEAD:mozilla_voice_tts/tts/configs/config.json
     "use_speaker_embedding": true,      // use speaker embedding to enable multi-speaker learning.
     "use_external_speaker_embedding_file": false, // if true, forces the model to use external embedding per sample instead of nn.embeddings, that is, it supports external embeddings such as those used at: https://arxiv.org/abs/1806.04558
     "external_speaker_embedding_file": "../../speakers-vctk-en.json", // if not null and use_external_speaker_embedding_file is true, it is used to load a specific embedding file and thus uses these embeddings instead of nn.embeddings, that is, it supports external embeddings such as those used at: https://arxiv.org/abs/1806.04558
-=======
-    "use_speaker_embedding": true,      // use speaker embedding to enable multi-speaker learning.
->>>>>>> bugfix in DDC now DDC work on Tacotron1:TTS/tts/configs/config.json
     "use_gst": true,                    // use global style tokens
     "gst": {                            // gst parameter if gst is enabled
-        "gst_style_input": null,        // Condition the style input either on a
-                                        // -> wave file [path to wave] or
-                                        // -> dictionary using the style tokens {'token1': 'value', 'token2': 'value'} example {"0": 0.15, "1": 0.15, "5": -0.15}
+        "gst_style_input": null,        // Condition the style input either on a
+                                        // -> wave file [path to wave] or
+                                        // -> dictionary using the style tokens {'token1': 'value', 'token2': 'value'} example {"0": 0.15, "1": 0.15, "5": -0.15}
                                         // with the dictionary being len(dict) <= len(gst_style_tokens).
-        "gst_embedding_dim": 512,
+        "gst_embedding_dim": 512,
        "gst_num_heads": 4,
        "gst_style_tokens": 10
-    },
+    },

    // DATASETS
    "datasets":   // List of datasets. They all merged and they get different speaker_ids.
diff --git a/mozilla_voice_tts/tts/models/tacotron.py b/mozilla_voice_tts/tts/models/tacotron.py
index 1395de97..bcc4a2a6 100644
--- a/mozilla_voice_tts/tts/models/tacotron.py
+++ b/mozilla_voice_tts/tts/models/tacotron.py
@@ -42,13 +42,19 @@ class Tacotron(TacotronAbstract):
                          bidirectional_decoder, double_decoder_consistency,
                          ddc_r, gst)

-        # init layer dims
         decoder_in_features = 256
         encoder_in_features = 256
-        speaker_embedding_dim = 256
-        proj_speaker_dim = 80 if num_speakers > 1 else 0
+        if speaker_embedding_dim is None:
+            # if speaker_embedding_dim is None we need use the nn.Embedding, with default speaker_embedding_dim
+            self.embeddings_per_sample = False
+            speaker_embedding_dim = 256
+        else:
+            # if speaker_embedding_dim is not None we need use speaker embedding per sample
+            self.embeddings_per_sample = True
+
+        # speaker and gst embeddings is concat in decoder input
         if num_speakers > 1:
             decoder_in_features = decoder_in_features + speaker_embedding_dim # add speaker embedding dim
         if self.gst:
@@ -103,14 +109,10 @@ class Tacotron(TacotronAbstract):
         input_mask, output_mask = self.compute_masks(text_lengths, mel_lengths)
         # B x T_in x embed_dim
         inputs = self.embedding(characters)
-        # B x speaker_embed_dim
-        if speaker_ids is not None:
-            self.compute_speaker_embedding(speaker_ids)
         # B x T_in x encoder_in_features
         encoder_outputs = self.encoder(inputs)
         # sequence masking
         encoder_outputs = encoder_outputs * input_mask.unsqueeze(2).expand_as(encoder_outputs)
-
         # global style token
         if self.gst:
             # B x gst_dim
@@ -152,9 +154,6 @@ class Tacotron(TacotronAbstract):
     @torch.no_grad()
     def inference(self, characters, speaker_ids=None, style_mel=None, speaker_embeddings=None):
         inputs = self.embedding(characters)
-        self._init_states()
-        if speaker_ids is not None:
-            self.compute_speaker_embedding(speaker_ids)
         encoder_outputs = self.encoder(inputs)
         if self.gst:
             # B x gst_dim
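
The per-sample speaker vector ends up concatenated onto every encoder timestep through TacotronAbstract._concat_speaker_embedding (defined outside this diff), which is why decoder_in_features grows by speaker_embedding_dim (and gst_embedding_dim) in both constructors. A hedged sketch of what that helper is assumed to do, with shapes inferred from the "B x 1 x speaker_embed_dim" comments in the patch:

# Assumed behaviour of the concat helper referenced by the Tacotron2 changes below.
import torch

def _concat_speaker_embedding(outputs, speaker_embeddings):
    # outputs: B x T_in x encoder_dim, speaker_embeddings: B x 1 x speaker_embed_dim
    speaker_embeddings_ = speaker_embeddings.expand(outputs.size(0), outputs.size(1), -1)
    # B x T_in x (encoder_dim + speaker_embed_dim)
    return torch.cat([outputs, speaker_embeddings_], dim=-1)
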
diff --git a/mozilla_voice_tts/tts/models/tacotron2.py b/mozilla_voice_tts/tts/models/tacotron2.py
index 47057c56..c2fc8a32 100644
--- a/mozilla_voice_tts/tts/models/tacotron2.py
+++ b/mozilla_voice_tts/tts/models/tacotron2.py
@@ -33,6 +33,7 @@ class Tacotron2(TacotronAbstract):
                  bidirectional_decoder=False,
                  double_decoder_consistency=False,
                  ddc_r=None,
+                 speaker_embedding_dim=None,
                  gst=False,
                  gst_embedding_dim=512,
                  gst_num_heads=4,
@@ -47,20 +48,33 @@ class Tacotron2(TacotronAbstract):
                          ddc_r, gst)

         # init layer dims
-        speaker_embedding_dim = 512 if num_speakers > 1 else 0
-        gst_embedding_dim = gst_embedding_dim if self.gst else 0
-        decoder_in_features = 512+speaker_embedding_dim+gst_embedding_dim
-        encoder_in_features = 512 if num_speakers > 1 else 512
-        proj_speaker_dim = 80 if num_speakers > 1 else 0
+        decoder_in_features = 512
+        encoder_in_features = 512
+
+        if speaker_embedding_dim is None:
+            # if speaker_embedding_dim is None we need use the nn.Embedding, with default speaker_embedding_dim
+            self.embeddings_per_sample = False
+            speaker_embedding_dim = 512
+        else:
+            # if speaker_embedding_dim is not None we need use speaker embedding per sample
+            self.embeddings_per_sample = True
+
+        # speaker and gst embeddings is concat in decoder input
+        if num_speakers > 1:
+            decoder_in_features = decoder_in_features + speaker_embedding_dim # add speaker embedding dim
+        if self.gst:
+            decoder_in_features = decoder_in_features + gst_embedding_dim # add gst embedding dim

         # embedding layer
         self.embedding = nn.Embedding(num_chars, 512, padding_idx=0)

         # speaker embedding layer
         if num_speakers > 1:
-            self.speaker_embedding = nn.Embedding(num_speakers, speaker_embedding_dim)
-            self.speaker_embedding.weight.data.normal_(0, 0.3)
-
+            if not self.embeddings_per_sample:
+                self.speaker_embedding = nn.Embedding(num_speakers, speaker_embedding_dim)
+                self.speaker_embedding.weight.data.normal_(0, 0.3)
+
+        # base model layers
         self.encoder = Encoder(encoder_in_features)
         self.decoder = Decoder(decoder_in_features, self.decoder_output_dim, r, attn_type, attn_win, attn_norm, prenet_type, prenet_dropout,
@@ -91,7 +105,7 @@ class Tacotron2(TacotronAbstract):
         mel_outputs_postnet = mel_outputs_postnet.transpose(1, 2)
         return mel_outputs, mel_outputs_postnet, alignments

-    def forward(self, text, text_lengths, mel_specs=None, mel_lengths=None, speaker_ids=None):
+    def forward(self, text, text_lengths, mel_specs=None, mel_lengths=None, speaker_ids=None, speaker_embeddings=None):
         # compute mask for padding
         # B x T_in_max (boolean)
         input_mask, output_mask = self.compute_masks(text_lengths, mel_lengths)
@@ -105,8 +119,13 @@ class Tacotron2(TacotronAbstract):
             encoder_outputs = self.compute_gst(encoder_outputs, mel_specs)

         if self.num_speakers > 1:
-            embedded_speakers = self.speaker_embedding(speaker_ids)[:, None]
-            encoder_outputs = self._concat_speaker_embedding(encoder_outputs, embedded_speakers)
+            if not self.embeddings_per_sample:
+                # B x 1 x speaker_embed_dim
+                speaker_embeddings = self.speaker_embedding(speaker_ids)[:, None]
+            else:
+                # B x 1 x speaker_embed_dim
+                speaker_embeddings = torch.unsqueeze(speaker_embeddings, 1)
+            encoder_outputs = self._concat_speaker_embedding(encoder_outputs, speaker_embeddings)

         encoder_outputs = encoder_outputs * input_mask.unsqueeze(2).expand_as(encoder_outputs)
@@ -134,23 +153,18 @@ class Tacotron2(TacotronAbstract):
         return decoder_outputs, postnet_outputs, alignments, stop_tokens

     @torch.no_grad()
-    def inference(self, text, speaker_ids=None, style_mel=None):
+    def inference(self, text, speaker_ids=None, style_mel=None, speaker_embeddings=None):
         embedded_inputs = self.embedding(text).transpose(1, 2)
         encoder_outputs = self.encoder.inference(embedded_inputs)
+        if self.gst:
+            # B x gst_dim
+            encoder_outputs = self.compute_gst(encoder_outputs, style_mel)
+
         if self.num_speakers > 1:
-            embedded_speakers = self.speaker_embedding(speaker_ids)[:, None]
-            embedded_speakers = embedded_speakers.repeat(1, encoder_outputs.size(1), 1)
-            if self.gst:
-                # B x gst_dim
-                encoder_outputs = self.compute_gst(encoder_outputs, style_mel)
-                encoder_outputs = torch.cat([encoder_outputs, embedded_speakers], dim=-1)
-            else:
-                encoder_outputs = torch.cat([encoder_outputs, embedded_speakers], dim=-1)
-        else:
-            if self.gst:
-                # B x gst_dim
-                encoder_outputs = self.compute_gst(encoder_outputs, style_mel)
+            if not self.embeddings_per_sample:
+                speaker_embeddings = self.speaker_embedding(speaker_ids)[:, None]
+            encoder_outputs = self._concat_speaker_embedding(encoder_outputs, speaker_embeddings)

         decoder_outputs, alignments, stop_tokens = self.decoder.inference(
             encoder_outputs)
@@ -160,25 +174,21 @@ class Tacotron2(TacotronAbstract):
             decoder_outputs, postnet_outputs, alignments)
         return decoder_outputs, postnet_outputs, alignments, stop_tokens

-    def inference_truncated(self, text, speaker_ids=None, style_mel=None):
+    def inference_truncated(self, text, speaker_ids=None, style_mel=None, speaker_embeddings=None):
         """
         Preserve model states for continuous inference
         """
         embedded_inputs = self.embedding(text).transpose(1, 2)
         encoder_outputs = self.encoder.inference_truncated(embedded_inputs)
+        if self.gst:
+            # B x gst_dim
+            encoder_outputs = self.compute_gst(encoder_outputs, style_mel)
+
         if self.num_speakers > 1:
-            embedded_speakers = self.speaker_embedding(speaker_ids)[:, None]
-            embedded_speakers = embedded_speakers.repeat(1, encoder_outputs.size(1), 1)
-            if self.gst:
-                # B x gst_dim
-                encoder_outputs = self.compute_gst(encoder_outputs, style_mel)
-            else:
-                encoder_outputs = torch.cat([encoder_outputs, embedded_speakers], dim=-1)
-        else:
-            if self.gst:
-                # B x gst_dim
-                encoder_outputs = self.compute_gst(encoder_outputs, style_mel)
+            if not self.embeddings_per_sample:
+                speaker_embeddings = self.speaker_embedding(speaker_ids)[:, None]
+            encoder_outputs = self._concat_speaker_embedding(encoder_outputs, speaker_embeddings)

         mel_outputs, alignments, stop_tokens = self.decoder.inference_truncated(
             encoder_outputs)
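
As a usage illustration of the new per-sample path (not taken from the repository): when the model is built with speaker_embedding_dim set, embeddings_per_sample becomes True and a d-vector can be fed straight into inference via the new speaker_embeddings argument. A rough sketch, with the constructor sizes, the embedding file path and the tensor shapes being assumptions:

# Hypothetical end-to-end use of the per-sample speaker embedding path.
# num_chars=130, num_speakers=10 and r=2 are placeholder sizes; the d-vector
# dimension is read from the embedding file named in config.json above.
import json
import torch
from mozilla_voice_tts.tts.models.tacotron2 import Tacotron2

with open("../../speakers-vctk-en.json") as f:
    speaker_mapping = json.load(f)
clip_id = list(speaker_mapping.keys())[0]
d_vector = torch.FloatTensor(speaker_mapping[clip_id]['embedding'])[None, :]  # 1 x speaker_embed_dim

# speaker_embedding_dim != None switches embeddings_per_sample on,
# so no nn.Embedding table is created for the speakers.
model = Tacotron2(num_chars=130, num_speakers=10, r=2,
                  speaker_embedding_dim=d_vector.shape[1]).eval()

text = torch.randint(1, 130, (1, 50))  # 1 x T_in dummy character ids
decoder_out, postnet_out, alignments, stop_tokens = model.inference(
    text, speaker_embeddings=d_vector)
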