mirror of https://github.com/coqui-ai/TTS.git

commit 89d338358e  (parent be77e24a39)

    add External Embedding per sample instead of nn.Embedding
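In short: the training entry point can now load a per-sample speaker embedding file instead of relying on an internal `nn.Embedding` lookup, `Tacotron` and `Tacotron2` gain a `speaker_embedding_dim` constructor argument and a `speaker_embeddings` input on `forward` / `inference` / `inference_truncated`, GST conditioning is applied independently of the speaker branch, and a leftover merge conflict in `config.json` is resolved.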
@@ -523,6 +523,7 @@ def main(args):  # pylint: disable=redefined-outer-name
                          "a previously trained model."
     elif c.use_external_speaker_embedding_file and c.external_speaker_embedding_file:  # starting a new training run with an external embedding file
         speaker_mapping = load_speaker_mapping(c.external_speaker_embedding_file)
+        print(speaker_mapping)
         speaker_embedding_dim = len(speaker_mapping[list(speaker_mapping.keys())[0]]['embedding'])
     elif c.use_external_speaker_embedding_file and not c.external_speaker_embedding_file:  # external embeddings requested but no embedding file given
         raise Exception("use_external_speaker_embedding_file is True, so you must pass an external speaker embedding file; run the GE2E-Speaker_Encoder-ExtractSpeakerEmbeddings-by-sample.ipynb or AngularPrototypical-Speaker_Encoder-ExtractSpeakerEmbeddings-by-sample.ipynb notebook in the notebooks/ folder to generate one")
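The embedding file loaded above maps every audio sample to its speaker and to one fixed-size d-vector, and the script infers `speaker_embedding_dim` from the first entry. A minimal sketch of that layout (the sample names, the `name` key and the vector values are made-up placeholders; only the `'embedding'` key is actually read here):

```python
# Illustrative mapping only -- the real file is written by the speaker-encoder
# extraction notebooks named in the error message above.
speaker_mapping = {
    "p225_001.wav": {"name": "p225", "embedding": [0.01, -0.13, 0.07, 0.42]},
    "p226_004.wav": {"name": "p226", "embedding": [0.22, 0.05, -0.31, 0.18]},
}

# Same derivation as in the hunk above: length of the first entry's vector
# (real d-vectors are much longer than these 4-dim dummies, e.g. 256-d).
speaker_embedding_dim = len(speaker_mapping[list(speaker_mapping.keys())[0]]["embedding"])
print(speaker_embedding_dim)  # 4
```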
@@ -558,6 +559,8 @@ def main(args):  # pylint: disable=redefined-outer-name

     # setup criterion
     criterion = TacotronLoss(c, stopnet_pos_weight=10.0, ga_sigma=0.4)
+    for name, _ in model.named_parameters():
+        print(name)

     if args.restore_path:
         checkpoint = torch.load(args.restore_path, map_location='cpu')
@@ -1,5 +1,5 @@
 {
-    "model": "Tacotron",
+    "model": "Tacotron2",
     "run_name": "ljspeech-ddc-bn",
     "run_description": "tacotron2 with ddc and batch-normalization",

@@ -114,7 +114,7 @@
     "tb_model_param_stats": false,  // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging.

     // DATA LOADING
-    "text_cleaner": "portuguese_cleaners",
+    "text_cleaner": "phoneme_cleaners",
     "enable_eos_bos_chars": false,  // enable/disable beginning of sentence and end of sentence chars.
     "num_loader_workers": 4,        // number of training data loader processes. Don't set it too big. 4-8 are good values.
     "num_val_loader_workers": 4,    // number of evaluation data loader processes.
@@ -131,13 +131,9 @@
     "phoneme_language": "en-us",  // depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages

     // MULTI-SPEAKER and GST
-<<<<<<< HEAD:mozilla_voice_tts/tts/configs/config.json
     "use_speaker_embedding": true,  // use speaker embedding to enable multi-speaker learning.
     "use_external_speaker_embedding_file": false,  // if true, forces the model to use external embedding per sample instead of nn.embeddings, that is, it supports external embeddings such as those used at: https://arxiv.org/abs/1806.04558
     "external_speaker_embedding_file": "../../speakers-vctk-en.json",  // if not null and use_external_speaker_embedding_file is true, it is used to load a specific embedding file and thus uses these embeddings instead of nn.embeddings, that is, it supports external embeddings such as those used at: https://arxiv.org/abs/1806.04558
-=======
-    "use_speaker_embedding": true,  // use speaker embedding to enable multi-speaker learning.
->>>>>>> bugfix in DDC now DDC work on Tacotron1:TTS/tts/configs/config.json
     "use_gst": true,  // use global style tokens
     "gst": {  // gst parameter if gst is enabled
         "gst_style_input": null,  // Condition the style input either on a
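The `speakers-vctk-en.json` referenced above is the per-sample embedding file that `external_speaker_embedding_file` points at; the error message in the first hunk names the GE2E / Angular Prototypical notebooks as the way to build it. Purely as an illustration of the idea, a file with that shape could be assembled as below; `compute_embedding` is a stand-in for the real speaker encoder from those notebooks, and the `"name"` key is assumed:

```python
import json
import numpy as np

def compute_embedding(wav_path):
    """Placeholder for the per-sample speaker encoder used by the extraction
    notebooks; it should return one fixed-size d-vector per audio clip."""
    rng = np.random.default_rng(abs(hash(wav_path)) % (2 ** 32))
    return rng.standard_normal(256)  # dummy 256-d vector

speaker_mapping = {}
for wav_file, speaker_name in [("p225_001.wav", "p225"), ("p226_004.wav", "p226")]:
    speaker_mapping[wav_file] = {
        "name": speaker_name,                               # assumed key
        "embedding": compute_embedding(wav_file).tolist(),  # key read by the training script
    }

with open("speakers-vctk-en.json", "w") as f:
    json.dump(speaker_mapping, f, indent=2)
```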
@@ -42,13 +42,19 @@ class Tacotron(TacotronAbstract):
                          bidirectional_decoder, double_decoder_consistency,
                          ddc_r, gst)

         # init layer dims
         decoder_in_features = 256
         encoder_in_features = 256
-        speaker_embedding_dim = 256
-        proj_speaker_dim = 80 if num_speakers > 1 else 0
+
+        if speaker_embedding_dim is None:
+            # speaker_embedding_dim is None: use nn.Embedding with the default embedding size
+            self.embeddings_per_sample = False
+            speaker_embedding_dim = 256
+        else:
+            # speaker_embedding_dim given: use one external speaker embedding per sample
+            self.embeddings_per_sample = True
+
+        # speaker and gst embeddings are concatenated into the decoder input
         if num_speakers > 1:
             decoder_in_features = decoder_in_features + speaker_embedding_dim  # add speaker embedding dim
         if self.gst:
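To make the dimension bookkeeping concrete, the same arithmetic pulled out into a standalone sketch (the function is illustrative only; the values match this hunk, and the GST branch, truncated above, is omitted):

```python
# Standalone restatement of the Tacotron dimension logic above (illustrative only).
def tacotron_dims(num_speakers, speaker_embedding_dim=None):
    decoder_in_features = 256
    encoder_in_features = 256

    if speaker_embedding_dim is None:
        # internal nn.Embedding lookup with the default size
        embeddings_per_sample = False
        speaker_embedding_dim = 256
    else:
        # one externally computed embedding per sample
        embeddings_per_sample = True

    if num_speakers > 1:
        decoder_in_features += speaker_embedding_dim  # speaker vector is concatenated
    return encoder_in_features, decoder_in_features, embeddings_per_sample

print(tacotron_dims(num_speakers=1))                             # (256, 256, False)
print(tacotron_dims(num_speakers=10))                            # (256, 512, False)
print(tacotron_dims(num_speakers=10, speaker_embedding_dim=64))  # (256, 320, True)
```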
@@ -103,14 +109,10 @@ class Tacotron(TacotronAbstract):
         input_mask, output_mask = self.compute_masks(text_lengths, mel_lengths)
         # B x T_in x embed_dim
         inputs = self.embedding(characters)
-        # B x speaker_embed_dim
-        if speaker_ids is not None:
-            self.compute_speaker_embedding(speaker_ids)
         # B x T_in x encoder_in_features
         encoder_outputs = self.encoder(inputs)
         # sequence masking
         encoder_outputs = encoder_outputs * input_mask.unsqueeze(2).expand_as(encoder_outputs)

         # global style token
         if self.gst:
             # B x gst_dim
@@ -152,9 +154,6 @@ class Tacotron(TacotronAbstract):
     @torch.no_grad()
     def inference(self, characters, speaker_ids=None, style_mel=None, speaker_embeddings=None):
         inputs = self.embedding(characters)
-        self._init_states()
-        if speaker_ids is not None:
-            self.compute_speaker_embedding(speaker_ids)
         encoder_outputs = self.encoder(inputs)
         if self.gst:
             # B x gst_dim
@@ -33,6 +33,7 @@ class Tacotron2(TacotronAbstract):
                  bidirectional_decoder=False,
                  double_decoder_consistency=False,
                  ddc_r=None,
+                 speaker_embedding_dim=None,
                  gst=False,
                  gst_embedding_dim=512,
                  gst_num_heads=4,
@@ -47,20 +48,33 @@ class Tacotron2(TacotronAbstract):
                          ddc_r, gst)

         # init layer dims
-        speaker_embedding_dim = 512 if num_speakers > 1 else 0
-        gst_embedding_dim = gst_embedding_dim if self.gst else 0
-        decoder_in_features = 512+speaker_embedding_dim+gst_embedding_dim
-        encoder_in_features = 512 if num_speakers > 1 else 512
-        proj_speaker_dim = 80 if num_speakers > 1 else 0
+        decoder_in_features = 512
+        encoder_in_features = 512
+
+        if speaker_embedding_dim is None:
+            # speaker_embedding_dim is None: use nn.Embedding with the default embedding size
+            self.embeddings_per_sample = False
+            speaker_embedding_dim = 512
+        else:
+            # speaker_embedding_dim given: use one external speaker embedding per sample
+            self.embeddings_per_sample = True
+
+        # speaker and gst embeddings are concatenated into the decoder input
+        if num_speakers > 1:
+            decoder_in_features = decoder_in_features + speaker_embedding_dim  # add speaker embedding dim
+        if self.gst:
+            decoder_in_features = decoder_in_features + gst_embedding_dim  # add gst embedding dim
+
         # embedding layer
         self.embedding = nn.Embedding(num_chars, 512, padding_idx=0)

         # speaker embedding layer
         if num_speakers > 1:
-            self.speaker_embedding = nn.Embedding(num_speakers, speaker_embedding_dim)
-            self.speaker_embedding.weight.data.normal_(0, 0.3)
+            if not self.embeddings_per_sample:
+                self.speaker_embedding = nn.Embedding(num_speakers, speaker_embedding_dim)
+                self.speaker_embedding.weight.data.normal_(0, 0.3)

         # base model layers
         self.encoder = Encoder(encoder_in_features)
         self.decoder = Decoder(decoder_in_features, self.decoder_output_dim, r, attn_type, attn_win,
                                attn_norm, prenet_type, prenet_dropout,
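The `embeddings_per_sample` flag set above decides where the speaker vector comes from at run time: a row of the internal `nn.Embedding` selected by `speaker_ids`, or the precomputed per-sample tensor passed in as `speaker_embeddings`. A small shape check of the two paths as `forward` uses them below (dummy tensors, illustrative only):

```python
import torch
import torch.nn as nn

B, num_speakers, speaker_embedding_dim = 4, 10, 512

# internal path: embeddings_per_sample == False
speaker_embedding = nn.Embedding(num_speakers, speaker_embedding_dim)
speaker_ids = torch.randint(0, num_speakers, (B,))
internal = speaker_embedding(speaker_ids)[:, None]   # B x 1 x speaker_embed_dim

# external path: embeddings_per_sample == True
external = torch.randn(B, speaker_embedding_dim)      # one d-vector per sample
external = torch.unsqueeze(external, 1)               # B x 1 x speaker_embed_dim

print(internal.shape, external.shape)  # both torch.Size([4, 1, 512])
```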
@@ -91,7 +105,7 @@ class Tacotron2(TacotronAbstract):
         mel_outputs_postnet = mel_outputs_postnet.transpose(1, 2)
         return mel_outputs, mel_outputs_postnet, alignments

-    def forward(self, text, text_lengths, mel_specs=None, mel_lengths=None, speaker_ids=None):
+    def forward(self, text, text_lengths, mel_specs=None, mel_lengths=None, speaker_ids=None, speaker_embeddings=None):
         # compute mask for padding
         # B x T_in_max (boolean)
         input_mask, output_mask = self.compute_masks(text_lengths, mel_lengths)
@@ -105,8 +119,13 @@ class Tacotron2(TacotronAbstract):
             encoder_outputs = self.compute_gst(encoder_outputs, mel_specs)

         if self.num_speakers > 1:
-            embedded_speakers = self.speaker_embedding(speaker_ids)[:, None]
-            encoder_outputs = self._concat_speaker_embedding(encoder_outputs, embedded_speakers)
+            if not self.embeddings_per_sample:
+                # B x 1 x speaker_embed_dim
+                speaker_embeddings = self.speaker_embedding(speaker_ids)[:, None]
+            else:
+                # B x 1 x speaker_embed_dim
+                speaker_embeddings = torch.unsqueeze(speaker_embeddings, 1)
+            encoder_outputs = self._concat_speaker_embedding(encoder_outputs, speaker_embeddings)

         encoder_outputs = encoder_outputs * input_mask.unsqueeze(2).expand_as(encoder_outputs)

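With the extended signature, a caller picks exactly one of the two conditioning inputs. A rough sketch of the two call patterns; `model` and the batch tensors are assumed to exist and are not constructed here:

```python
# Sketch only: `model`, `text`, `text_lengths`, `mel_specs`, `mel_lengths`,
# `speaker_ids` and `per_sample_embeddings` are assumed to already exist.

# (a) classic multi-speaker training: integer ids resolved via nn.Embedding
outputs = model(text, text_lengths, mel_specs, mel_lengths,
                speaker_ids=speaker_ids)

# (b) external per-sample embeddings (embeddings_per_sample == True):
#     a B x speaker_embedding_dim tensor, e.g. d-vectors loaded from the
#     external_speaker_embedding_file; forward() unsqueezes it to B x 1 x dim.
outputs = model(text, text_lengths, mel_specs, mel_lengths,
                speaker_embeddings=per_sample_embeddings)
```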
@@ -134,23 +153,18 @@ class Tacotron2(TacotronAbstract):
         return decoder_outputs, postnet_outputs, alignments, stop_tokens

     @torch.no_grad()
-    def inference(self, text, speaker_ids=None, style_mel=None):
+    def inference(self, text, speaker_ids=None, style_mel=None, speaker_embeddings=None):
         embedded_inputs = self.embedding(text).transpose(1, 2)
         encoder_outputs = self.encoder.inference(embedded_inputs)

+        if self.gst:
+            # B x gst_dim
+            encoder_outputs = self.compute_gst(encoder_outputs, style_mel)
+
         if self.num_speakers > 1:
-            embedded_speakers = self.speaker_embedding(speaker_ids)[:, None]
-            embedded_speakers = embedded_speakers.repeat(1, encoder_outputs.size(1), 1)
-            if self.gst:
-                # B x gst_dim
-                encoder_outputs = self.compute_gst(encoder_outputs, style_mel)
-                encoder_outputs = torch.cat([encoder_outputs, embedded_speakers], dim=-1)
-            else:
-                encoder_outputs = torch.cat([encoder_outputs, embedded_speakers], dim=-1)
-        else:
-            if self.gst:
-                # B x gst_dim
-                encoder_outputs = self.compute_gst(encoder_outputs, style_mel)
+            if not self.embeddings_per_sample:
+                speaker_embeddings = self.speaker_embedding(speaker_ids)[:, None]
+            encoder_outputs = self._concat_speaker_embedding(encoder_outputs, speaker_embeddings)

         decoder_outputs, alignments, stop_tokens = self.decoder.inference(
             encoder_outputs)
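At inference time the same choice applies: with `embeddings_per_sample` enabled, one embedding vector from the mapping file is passed through `speaker_embeddings` and concatenated onto the encoder outputs. A hedged sketch, again assuming a built multi-speaker `model` and tokenized `text` ids; the exact expected tensor shape is not pinned down by this hunk, so a 1 x 1 x dim layout compatible with `_concat_speaker_embedding` is assumed here:

```python
import json
import torch

# Pick one sample's d-vector from the external embedding file.
with open("speakers-vctk-en.json") as f:
    speaker_mapping = json.load(f)
embedding = speaker_mapping[list(speaker_mapping.keys())[0]]["embedding"]

# Shape assumption: 1 x 1 x speaker_embedding_dim so that
# _concat_speaker_embedding can tile it over the encoder time axis.
speaker_embeddings = torch.FloatTensor(embedding)[None, None, :]

# `model` and `text` (1 x T character/phoneme ids) are assumed to exist.
decoder_outputs, postnet_outputs, alignments, stop_tokens = model.inference(
    text, speaker_embeddings=speaker_embeddings)
```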
@@ -160,25 +174,21 @@ class Tacotron2(TacotronAbstract):
             decoder_outputs, postnet_outputs, alignments)
         return decoder_outputs, postnet_outputs, alignments, stop_tokens

-    def inference_truncated(self, text, speaker_ids=None, style_mel=None):
+    def inference_truncated(self, text, speaker_ids=None, style_mel=None, speaker_embeddings=None):
         """
         Preserve model states for continuous inference
         """
         embedded_inputs = self.embedding(text).transpose(1, 2)
         encoder_outputs = self.encoder.inference_truncated(embedded_inputs)

+        if self.gst:
+            # B x gst_dim
+            encoder_outputs = self.compute_gst(encoder_outputs, style_mel)
+
         if self.num_speakers > 1:
-            embedded_speakers = self.speaker_embedding(speaker_ids)[:, None]
-            embedded_speakers = embedded_speakers.repeat(1, encoder_outputs.size(1), 1)
-            if self.gst:
-                # B x gst_dim
-                encoder_outputs = self.compute_gst(encoder_outputs, style_mel)
-            else:
-                encoder_outputs = torch.cat([encoder_outputs, embedded_speakers], dim=-1)
-        else:
-            if self.gst:
-                # B x gst_dim
-                encoder_outputs = self.compute_gst(encoder_outputs, style_mel)
+            if not self.embeddings_per_sample:
+                speaker_embeddings = self.speaker_embedding(speaker_ids)[:, None]
+            encoder_outputs = self._concat_speaker_embedding(encoder_outputs, speaker_embeddings)

         mel_outputs, alignments, stop_tokens = self.decoder.inference_truncated(
             encoder_outputs)