From be77e24a39dcc65c842096d9c289b4091811ddcf Mon Sep 17 00:00:00 2001
From: Edresson
Date: Tue, 28 Jul 2020 17:11:32 -0300
Subject: [PATCH] Bugfix in DDC: DDC now works on Tacotron1

---
 mozilla_voice_tts/tts/configs/config.json    |  4 +-
 mozilla_voice_tts/tts/models/tacotron.py     | 18 +++----
 mozilla_voice_tts/tts/models/tacotron2.py    | 49 +++++-----------
 .../tts/models/tacotron_abstract.py          | 19 +++++--
 mozilla_voice_tts/tts/utils/generic_utils.py |  3 +++
 5 files changed, 40 insertions(+), 53 deletions(-)

diff --git a/mozilla_voice_tts/tts/configs/config.json b/mozilla_voice_tts/tts/configs/config.json
index 090540ab..70529fea 100644
--- a/mozilla_voice_tts/tts/configs/config.json
+++ b/mozilla_voice_tts/tts/configs/config.json
@@ -1,5 +1,5 @@
 {
-    "model": "Tacotron2",
+    "model": "Tacotron",
     "run_name": "ljspeech-ddc-bn",
     "run_description": "tacotron2 with ddc and batch-normalization",
 
@@ -114,7 +114,7 @@
     "tb_model_param_stats": false,     // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging.
 
     // DATA LOADING
-    "text_cleaner": "phoneme_cleaners",
+    "text_cleaner": "portuguese_cleaners",
     "enable_eos_bos_chars": false, // enable/disable beginning of sentence and end of sentence chars.
     "num_loader_workers": 4,        // number of training data loader processes. Don't set it too big. 4-8 are good values.
     "num_val_loader_workers": 4,    // number of evaluation data loader processes.
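The config now selects the Tacotron (v1) model, which is what the rest of this patch targets. As a quick sanity check before training, the config can be loaded and validated; the sketch below assumes `load_config` lives at `mozilla_voice_tts.utils.io` (mirroring the import layout used elsewhere in this tree) and that the full file carries the DDC keys `double_decoder_consistency` and `ddc_r`, since only excerpts of the config appear in this diff:

```python
# A minimal smoke test for the edited config; the load_config import path and
# the DDC key names are assumptions based on the rest of this tree.
from mozilla_voice_tts.utils.io import load_config
from mozilla_voice_tts.tts.utils.generic_utils import check_config

c = load_config("mozilla_voice_tts/tts/configs/config.json")
check_config(c)                     # raises on missing/mistyped keys, incl. the "gst" block
print(c["model"])                   # "Tacotron" -> DDC is exercised on the v1 model
print(c["double_decoder_consistency"], c["ddc_r"])  # DDC toggle and coarse-decoder reduction factor
```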
diff --git a/mozilla_voice_tts/tts/models/tacotron.py b/mozilla_voice_tts/tts/models/tacotron.py
index 682d2b59..1395de97 100644
--- a/mozilla_voice_tts/tts/models/tacotron.py
+++ b/mozilla_voice_tts/tts/models/tacotron.py
@@ -42,19 +42,13 @@ class Tacotron(TacotronAbstract):
                          bidirectional_decoder, double_decoder_consistency,
                          ddc_r, gst)
 
+        # init layer dims
         decoder_in_features = 256
         encoder_in_features = 256
+        speaker_embedding_dim = 256
+        proj_speaker_dim = 80 if num_speakers > 1 else 0
 
-        if speaker_embedding_dim is None:
-            # if speaker_embedding_dim is None we need use the nn.Embedding, with default speaker_embedding_dim
-            self.embeddings_per_sample = False
-            speaker_embedding_dim = 256
-        else:
-            # if speaker_embedding_dim is not None we need use speaker embedding per sample
-            self.embeddings_per_sample = True
-
-        # speaker and gst embeddings is concat in decoder input
         if num_speakers > 1:
             decoder_in_features = decoder_in_features + speaker_embedding_dim  # add speaker embedding dim
         if self.gst:
@@ -109,6 +103,9 @@ class Tacotron(TacotronAbstract):
         input_mask, output_mask = self.compute_masks(text_lengths, mel_lengths)
         # B x T_in x embed_dim
         inputs = self.embedding(characters)
+        # B x speaker_embed_dim
+        if speaker_ids is not None:
+            self.compute_speaker_embedding(speaker_ids)
         # B x T_in x encoder_in_features
         encoder_outputs = self.encoder(inputs)
         # sequence masking
@@ -155,6 +152,9 @@ class Tacotron(TacotronAbstract):
     @torch.no_grad()
     def inference(self, characters, speaker_ids=None, style_mel=None, speaker_embeddings=None):
         inputs = self.embedding(characters)
+        self._init_states()
+        if speaker_ids is not None:
+            self.compute_speaker_embedding(speaker_ids)
         encoder_outputs = self.encoder(inputs)
         if self.gst:
             # B x gst_dim
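For context, the `compute_speaker_embedding(speaker_ids)` calls added above are what let DDC run multi-speaker on Tacotron1. A rough usage sketch, assuming the constructor signature `Tacotron(num_chars, num_speakers, r, ...)` and the four-tuple return of `inference` match the rest of this file (neither is shown in the diff):

```python
import torch
from mozilla_voice_tts.tts.models.tacotron import Tacotron

# Hypothetical sizes; constructor defaults are assumed for arguments not shown here.
model = Tacotron(num_chars=128, num_speakers=2, r=7)
model.eval()

characters = torch.randint(0, 128, (1, 50))   # B x T_in character ids
speaker_ids = torch.tensor([1])               # one speaker id per sample
decoder_outputs, postnet_outputs, alignments, stop_tokens = model.inference(
    characters, speaker_ids=speaker_ids)      # speaker embedding is now computed internally
```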
diff --git a/mozilla_voice_tts/tts/models/tacotron2.py b/mozilla_voice_tts/tts/models/tacotron2.py
index 0aa237ff..47057c56 100644
--- a/mozilla_voice_tts/tts/models/tacotron2.py
+++ b/mozilla_voice_tts/tts/models/tacotron2.py
@@ -85,24 +85,6 @@ class Tacotron2(TacotronAbstract):
         mel_outputs_postnet = mel_outputs_postnet.transpose(1, 2)
         return mel_outputs, mel_outputs_postnet, alignments
 
-    def compute_gst(self, inputs, style_input):
-        """ Compute global style token """
-        device = inputs.device
-        if isinstance(style_input, dict):
-            query = torch.zeros(1, 1, self.gst_embedding_dim//2).to(device)
-            _GST = torch.tanh(self.gst_layer.style_token_layer.style_tokens)
-            gst_outputs = torch.zeros(1, 1, self.gst_embedding_dim).to(device)
-            for k_token, v_amplifier in style_input.items():
-                key = _GST[int(k_token)].unsqueeze(0).expand(1, -1, -1)
-                gst_outputs_att = self.gst_layer.style_token_layer.attention(query, key)
-                gst_outputs = gst_outputs + gst_outputs_att * v_amplifier
-        elif style_input is None:
-            gst_outputs = torch.zeros(1, 1, self.gst_embedding_dim).to(device)
-        else:
-            gst_outputs = self.gst_layer(style_input)  # pylint: disable=not-callable
-        embedded_gst = gst_outputs.repeat(1, inputs.size(1), 1)
-        return inputs, embedded_gst
-
     def forward(self, text, text_lengths, mel_specs=None, mel_lengths=None, speaker_ids=None):
         # compute mask for padding
         # B x T_in_max (boolean)
@@ -112,20 +94,13 @@ class Tacotron2(TacotronAbstract):
         # B x T_in_max x D_en
         encoder_outputs = self.encoder(embedded_inputs, text_lengths)
 
+        if self.gst:
+            # B x gst_dim
+            encoder_outputs = self.compute_gst(encoder_outputs, mel_specs)
+
         if self.num_speakers > 1:
             embedded_speakers = self.speaker_embedding(speaker_ids)[:, None]
-            embedded_speakers = embedded_speakers.repeat(1, encoder_outputs.size(1), 1)
-            if self.gst:
-                # B x gst_dim
-                encoder_outputs, embedded_gst = self.compute_gst(encoder_outputs, mel_specs)
-                encoder_outputs = torch.cat([encoder_outputs, embedded_gst, embedded_speakers], dim=-1)
-            else:
-                encoder_outputs = torch.cat([encoder_outputs, embedded_speakers], dim=-1)
-        else:
-            if self.gst:
-                # B x gst_dim
-                encoder_outputs, embedded_gst = self.compute_gst(encoder_outputs, mel_specs)
-                encoder_outputs = torch.cat([encoder_outputs, embedded_gst], dim=-1)
+            encoder_outputs = self._concat_speaker_embedding(encoder_outputs, embedded_speakers)
 
         encoder_outputs = encoder_outputs * input_mask.unsqueeze(2).expand_as(encoder_outputs)
@@ -162,15 +137,14 @@ class Tacotron2(TacotronAbstract):
             embedded_speakers = embedded_speakers.repeat(1, encoder_outputs.size(1), 1)
             if self.gst:
                 # B x gst_dim
-                encoder_outputs, embedded_gst = self.compute_gst(encoder_outputs, style_mel)
-                encoder_outputs = torch.cat([encoder_outputs, embedded_gst, embedded_speakers], dim=-1)
+                encoder_outputs = self.compute_gst(encoder_outputs, style_mel)
+                encoder_outputs = torch.cat([encoder_outputs, embedded_speakers], dim=-1)
             else:
                 encoder_outputs = torch.cat([encoder_outputs, embedded_speakers], dim=-1)
         else:
             if self.gst:
                 # B x gst_dim
-                encoder_outputs, embedded_gst = self.compute_gst(encoder_outputs, style_mel)
-                encoder_outputs = torch.cat([encoder_outputs, embedded_gst], dim=-1)
+                encoder_outputs = self.compute_gst(encoder_outputs, style_mel)
 
         decoder_outputs, alignments, stop_tokens = self.decoder.inference(
             encoder_outputs)
@@ -192,15 +166,14 @@ class Tacotron2(TacotronAbstract):
             embedded_speakers = embedded_speakers.repeat(1, encoder_outputs.size(1), 1)
             if self.gst:
                 # B x gst_dim
-                encoder_outputs, embedded_gst = self.compute_gst(encoder_outputs, style_mel)
-                encoder_outputs = torch.cat([encoder_outputs, embedded_gst, embedded_speakers], dim=-1)
+                encoder_outputs = self.compute_gst(encoder_outputs, style_mel)
+                encoder_outputs = torch.cat([encoder_outputs, embedded_speakers], dim=-1)
             else:
                 encoder_outputs = torch.cat([encoder_outputs, embedded_speakers], dim=-1)
         else:
             if self.gst:
                 # B x gst_dim
-                encoder_outputs, embedded_gst = self.compute_gst(encoder_outputs, style_mel)
-                encoder_outputs = torch.cat([encoder_outputs, embedded_gst], dim=-1)
+                encoder_outputs = self.compute_gst(encoder_outputs, style_mel)
 
         mel_outputs, alignments, stop_tokens = self.decoder.inference_truncated(
             encoder_outputs)
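`forward` and the inference paths above now delegate the tile-and-concat of the speaker vector to `_concat_speaker_embedding`, a helper on `TacotronAbstract` whose body is outside this diff. A minimal sketch of what the helper is presumed to do, reconstructed from its call sites rather than copied from the committed implementation:

```python
import torch

def _concat_speaker_embedding(outputs, embedded_speakers):
    # B x 1 x D speaker vector tiled to B x T x D, then concatenated onto the
    # B x T x C encoder features along the channel axis.
    embedded_speakers_ = embedded_speakers.expand(outputs.size(0), outputs.size(1), -1)
    return torch.cat([outputs, embedded_speakers_], dim=-1)

# e.g. encoder features (1, 50, 512) + speaker vector (1, 1, 256) -> (1, 50, 768)
out = _concat_speaker_embedding(torch.randn(1, 50, 512), torch.randn(1, 1, 256))
print(out.shape)  # torch.Size([1, 50, 768])
```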
diff --git a/mozilla_voice_tts/tts/models/tacotron_abstract.py b/mozilla_voice_tts/tts/models/tacotron_abstract.py
index a4b8c227..6f3d32ad 100644
--- a/mozilla_voice_tts/tts/models/tacotron_abstract.py
+++ b/mozilla_voice_tts/tts/models/tacotron_abstract.py
@@ -164,11 +164,22 @@ class TacotronAbstract(ABC, nn.Module):
             self.speaker_embeddings_projected = self.speaker_project_mel(
                 self.speaker_embeddings).squeeze(1)
 
-    def compute_gst(self, inputs, mel_specs):
+    def compute_gst(self, inputs, style_input):
         """ Compute global style token """
-        # pylint: disable=not-callable
-        gst_outputs = self.gst_layer(mel_specs)
-        inputs = self._add_speaker_embedding(inputs, gst_outputs)
+        device = inputs.device
+        if isinstance(style_input, dict):
+            query = torch.zeros(1, 1, self.gst_embedding_dim//2).to(device)
+            _GST = torch.tanh(self.gst_layer.style_token_layer.style_tokens)
+            gst_outputs = torch.zeros(1, 1, self.gst_embedding_dim).to(device)
+            for k_token, v_amplifier in style_input.items():
+                key = _GST[int(k_token)].unsqueeze(0).expand(1, -1, -1)
+                gst_outputs_att = self.gst_layer.style_token_layer.attention(query, key)
+                gst_outputs = gst_outputs + gst_outputs_att * v_amplifier
+        elif style_input is None:
+            gst_outputs = torch.zeros(1, 1, self.gst_embedding_dim).to(device)
+        else:
+            gst_outputs = self.gst_layer(style_input)  # pylint: disable=not-callable
+        inputs = self._concat_speaker_embedding(inputs, gst_outputs)
         return inputs
 
     @staticmethod
diff --git a/mozilla_voice_tts/tts/utils/generic_utils.py b/mozilla_voice_tts/tts/utils/generic_utils.py
index 2d5044ef..fc35840d 100644
--- a/mozilla_voice_tts/tts/utils/generic_utils.py
+++ b/mozilla_voice_tts/tts/utils/generic_utils.py
@@ -265,6 +265,9 @@ def check_config(c):
     check_argument('gst_num_heads', c['gst'], restricted=True, val_type=int, min_val=2, max_val=10)
     check_argument('gst_style_tokens', c['gst'], restricted=True, val_type=int, min_val=1, max_val=1000)
+    check_argument('gst', c, restricted=True, val_type=dict)
+    check_argument('gst_style_input', c['gst'], restricted=True, val_type=[str, dict])
+    check_argument('gst_embedding_dim', c['gst'], restricted=True, val_type=int, min_val=0, max_val=1000)
 
     # datasets - checking only the first entry
     check_argument('datasets', c, restricted=True, val_type=list)
     for dataset_entry in c['datasets']:
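Since `compute_gst` is now shared through `TacotronAbstract`, the dict form of `style_input` works on both models: each entry maps a style-token index to an amplifier weight, and the resulting style vector is tiled over time and concatenated onto the encoder features. A self-contained rendering of that arithmetic with a stand-in token bank (in the model the per-token vector comes from the style-token attention layer, not a plain table, and `gst_embedding_dim` is whatever the config sets):

```python
import torch

gst_embedding_dim = 512                          # illustrative value, not from this diff
token_bank = torch.randn(10, gst_embedding_dim)  # stand-in for the learned style tokens
style_input = {"1": 0.3, "5": -0.1}              # token index -> amplifier, as in compute_gst

# Weighted sum over the requested tokens (the dict branch of compute_gst).
gst_outputs = torch.zeros(1, 1, gst_embedding_dim)
for k_token, v_amplifier in style_input.items():
    gst_outputs = gst_outputs + token_bank[int(k_token)].view(1, 1, -1) * v_amplifier

# Tile over time and concatenate onto the encoder features, as
# _concat_speaker_embedding does inside the models.
encoder_outputs = torch.randn(1, 50, 256)        # B x T_in x encoder dim
gst_outputs_ = gst_outputs.expand(encoder_outputs.size(0), encoder_outputs.size(1), -1)
conditioned = torch.cat([encoder_outputs, gst_outputs_], dim=-1)
print(conditioned.shape)                         # torch.Size([1, 50, 768])
```

Passing `style_input=None` takes the zero-vector branch instead, so unconditioned synthesis still produces features of the GST-augmented width the decoder expects.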