Bugfix in DDC: DDC now works on Tacotron1

Edresson 2020-07-28 17:11:32 -03:00 committed by erogol
parent 70c665b9c4
commit be77e24a39
5 changed files with 52 additions and 53 deletions

mozilla_voice_tts/tts/configs/config.json

@@ -1,5 +1,5 @@
 {
-    "model": "Tacotron2",
+    "model": "Tacotron",
     "run_name": "ljspeech-ddc-bn",
     "run_description": "tacotron2 with ddc and batch-normalization",
@@ -114,7 +114,7 @@
     "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging.

     // DATA LOADING
-    "text_cleaner": "phoneme_cleaners",
+    "text_cleaner": "portuguese_cleaners",
     "enable_eos_bos_chars": false, // enable/disable beginning of sentence and end of sentence chars.
     "num_loader_workers": 4, // number of training data loader processes. Don't set it too big. 4-8 are good values.
     "num_val_loader_workers": 4, // number of evaluation data loader processes.
@@ -131,9 +131,13 @@
     "phoneme_language": "en-us", // depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages

     // MULTI-SPEAKER and GST
+<<<<<<< HEAD:mozilla_voice_tts/tts/configs/config.json
     "use_speaker_embedding": true, // use speaker embedding to enable multi-speaker learning.
     "use_external_speaker_embedding_file": false, // if true, forces the model to use external embedding per sample instead of nn.embeddings, that is, it supports external embeddings such as those used at: https://arxiv.org/abs/1806.04558
     "external_speaker_embedding_file": "../../speakers-vctk-en.json", // if not null and use_external_speaker_embedding_file is true, it is used to load a specific embedding file and thus uses these embeddings instead of nn.embeddings, that is, it supports external embeddings such as those used at: https://arxiv.org/abs/1806.04558
+=======
+    "use_speaker_embedding": true, // use speaker embedding to enable multi-speaker learning.
+>>>>>>> bugfix in DDC now DDC work on Tacotron1:TTS/tts/configs/config.json
     "use_gst": true, // use global style tokens
     "gst": { // gst parameter if gst is enabled
         "gst_style_input": null, // Condition the style input either on a

mozilla_voice_tts/tts/models/tacotron.py

@@ -42,19 +42,13 @@ class Tacotron(TacotronAbstract):
                          bidirectional_decoder, double_decoder_consistency,
                          ddc_r, gst)
         # init layer dims
         decoder_in_features = 256
         encoder_in_features = 256
-        if speaker_embedding_dim is None:
-            # if speaker_embedding_dim is None we need use the nn.Embedding, with default speaker_embedding_dim
-            self.embeddings_per_sample = False
-            speaker_embedding_dim = 256
-        else:
-            # if speaker_embedding_dim is not None we need use speaker embedding per sample
-            self.embeddings_per_sample = True
-        # speaker and gst embeddings is concat in decoder input
+        speaker_embedding_dim = 256
+        proj_speaker_dim = 80 if num_speakers > 1 else 0
         if num_speakers > 1:
             decoder_in_features = decoder_in_features + speaker_embedding_dim # add speaker embedding dim
         if self.gst:
@@ -109,6 +103,9 @@ class Tacotron(TacotronAbstract):
         input_mask, output_mask = self.compute_masks(text_lengths, mel_lengths)
         # B x T_in x embed_dim
         inputs = self.embedding(characters)
+        # B x speaker_embed_dim
+        if speaker_ids is not None:
+            self.compute_speaker_embedding(speaker_ids)
         # B x T_in x encoder_in_features
         encoder_outputs = self.encoder(inputs)
         # sequence masking
@@ -155,6 +152,9 @@ class Tacotron(TacotronAbstract):
     @torch.no_grad()
     def inference(self, characters, speaker_ids=None, style_mel=None, speaker_embeddings=None):
         inputs = self.embedding(characters)
+        self._init_states()
+        if speaker_ids is not None:
+            self.compute_speaker_embedding(speaker_ids)
         encoder_outputs = self.encoder(inputs)
         if self.gst:
             # B x gst_dim
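The fix routes multi-speaker conditioning in Tacotron through compute_speaker_embedding from TacotronAbstract. As a rough guide, the sketch below shows what that call is expected to do, inferred from the speaker_project_mel context lines visible in the tacotron_abstract.py hunk further down; it is an assumption for illustration, not code from this commit, and the class name, dimensions, and projection layer are made up.

import torch
from torch import nn

# Hedged sketch (assumption): look up an nn.Embedding row per speaker id and
# cache it on the module so the decoder can consume it later.
class SpeakerLookupSketch(nn.Module):
    def __init__(self, num_speakers=4, speaker_embedding_dim=256, proj_speaker_dim=80):
        super().__init__()
        self.speaker_embedding = nn.Embedding(num_speakers, speaker_embedding_dim)
        self.speaker_project_mel = nn.Sequential(
            nn.Linear(speaker_embedding_dim, proj_speaker_dim), nn.Tanh())
        self.speaker_embeddings = None
        self.speaker_embeddings_projected = None

    def compute_speaker_embedding(self, speaker_ids):
        # B x 1 x speaker_embedding_dim
        self.speaker_embeddings = self.speaker_embedding(speaker_ids).unsqueeze(1)
        # B x proj_speaker_dim (cf. the speaker_project_mel lines in tacotron_abstract.py)
        self.speaker_embeddings_projected = self.speaker_project_mel(
            self.speaker_embeddings).squeeze(1)

model = SpeakerLookupSketch()
model.compute_speaker_embedding(torch.tensor([0, 2]))
print(model.speaker_embeddings.shape)            # torch.Size([2, 1, 256])
print(model.speaker_embeddings_projected.shape)  # torch.Size([2, 80])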

mozilla_voice_tts/tts/models/tacotron2.py

@@ -1,9 +1,15 @@
 import torch
 from torch import nn
+<<<<<<< HEAD:mozilla_voice_tts/tts/models/tacotron2.py
 from mozilla_voice_tts.tts.layers.gst_layers import GST
 from mozilla_voice_tts.tts.layers.tacotron2 import Decoder, Encoder, Postnet
 from mozilla_voice_tts.tts.models.tacotron_abstract import TacotronAbstract
+=======
+from TTS.tts.layers.gst_layers import GST
+from TTS.tts.layers.tacotron2 import Decoder, Encoder, Postnet
+from TTS.tts.models.tacotron_abstract import TacotronAbstract
+>>>>>>> bugfix in DDC now DDC work on Tacotron1:TTS/tts/models/tacotron2.py

 # TODO: match function arguments with tacotron
 class Tacotron2(TacotronAbstract):
@@ -85,24 +91,6 @@ class Tacotron2(TacotronAbstract):
         mel_outputs_postnet = mel_outputs_postnet.transpose(1, 2)
         return mel_outputs, mel_outputs_postnet, alignments

-    def compute_gst(self, inputs, style_input):
-        """ Compute global style token """
-        device = inputs.device
-        if isinstance(style_input, dict):
-            query = torch.zeros(1, 1, self.gst_embedding_dim//2).to(device)
-            _GST = torch.tanh(self.gst_layer.style_token_layer.style_tokens)
-            gst_outputs = torch.zeros(1, 1, self.gst_embedding_dim).to(device)
-            for k_token, v_amplifier in style_input.items():
-                key = _GST[int(k_token)].unsqueeze(0).expand(1, -1, -1)
-                gst_outputs_att = self.gst_layer.style_token_layer.attention(query, key)
-                gst_outputs = gst_outputs + gst_outputs_att * v_amplifier
-        elif style_input is None:
-            gst_outputs = torch.zeros(1, 1, self.gst_embedding_dim).to(device)
-        else:
-            gst_outputs = self.gst_layer(style_input) # pylint: disable=not-callable
-        embedded_gst = gst_outputs.repeat(1, inputs.size(1), 1)
-        return inputs, embedded_gst

     def forward(self, text, text_lengths, mel_specs=None, mel_lengths=None, speaker_ids=None):
         # compute mask for padding
         # B x T_in_max (boolean)
@@ -112,20 +100,13 @@ class Tacotron2(TacotronAbstract):
         # B x T_in_max x D_en
         encoder_outputs = self.encoder(embedded_inputs, text_lengths)
+        if self.gst:
+            # B x gst_dim
+            encoder_outputs = self.compute_gst(encoder_outputs, mel_specs)
         if self.num_speakers > 1:
             embedded_speakers = self.speaker_embedding(speaker_ids)[:, None]
-            embedded_speakers = embedded_speakers.repeat(1, encoder_outputs.size(1), 1)
-            if self.gst:
-                # B x gst_dim
-                encoder_outputs, embedded_gst = self.compute_gst(encoder_outputs, mel_specs)
-                encoder_outputs = torch.cat([encoder_outputs, embedded_gst, embedded_speakers], dim=-1)
-            else:
-                encoder_outputs = torch.cat([encoder_outputs, embedded_speakers], dim=-1)
-        else:
-            if self.gst:
-                # B x gst_dim
-                encoder_outputs, embedded_gst = self.compute_gst(encoder_outputs, mel_specs)
-                encoder_outputs = torch.cat([encoder_outputs, embedded_gst], dim=-1)
+            encoder_outputs = self._concat_speaker_embedding(encoder_outputs, embedded_speakers)

         encoder_outputs = encoder_outputs * input_mask.unsqueeze(2).expand_as(encoder_outputs)
@@ -162,15 +143,14 @@ class Tacotron2(TacotronAbstract):
             embedded_speakers = embedded_speakers.repeat(1, encoder_outputs.size(1), 1)
             if self.gst:
                 # B x gst_dim
-                encoder_outputs, embedded_gst = self.compute_gst(encoder_outputs, style_mel)
-                encoder_outputs = torch.cat([encoder_outputs, embedded_gst, embedded_speakers], dim=-1)
+                encoder_outputs = self.compute_gst(encoder_outputs, style_mel)
+                encoder_outputs = torch.cat([encoder_outputs, embedded_speakers], dim=-1)
             else:
                 encoder_outputs = torch.cat([encoder_outputs, embedded_speakers], dim=-1)
         else:
             if self.gst:
                 # B x gst_dim
-                encoder_outputs, embedded_gst = self.compute_gst(encoder_outputs, style_mel)
-                encoder_outputs = torch.cat([encoder_outputs, embedded_gst], dim=-1)
+                encoder_outputs = self.compute_gst(encoder_outputs, style_mel)

         decoder_outputs, alignments, stop_tokens = self.decoder.inference(
             encoder_outputs)
@@ -192,15 +172,13 @@ class Tacotron2(TacotronAbstract):
             embedded_speakers = embedded_speakers.repeat(1, encoder_outputs.size(1), 1)
             if self.gst:
                 # B x gst_dim
-                encoder_outputs, embedded_gst = self.compute_gst(encoder_outputs, style_mel)
-                encoder_outputs = torch.cat([encoder_outputs, embedded_gst, embedded_speakers], dim=-1)
+                encoder_outputs = self.compute_gst(encoder_outputs, style_mel)
             else:
                 encoder_outputs = torch.cat([encoder_outputs, embedded_speakers], dim=-1)
         else:
             if self.gst:
                 # B x gst_dim
-                encoder_outputs, embedded_gst = self.compute_gst(encoder_outputs, style_mel)
-                encoder_outputs = torch.cat([encoder_outputs, embedded_gst], dim=-1)
+                encoder_outputs = self.compute_gst(encoder_outputs, style_mel)

         mel_outputs, alignments, stop_tokens = self.decoder.inference_truncated(
             encoder_outputs)
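In the rewritten forward pass above, the inline repeat-and-concatenate of the speaker vector is replaced by a call to _concat_speaker_embedding from TacotronAbstract. The snippet below is a minimal, self-contained sketch of what that helper is assumed to do (broadcast the per-utterance speaker vector over the encoder time axis and append it on the feature axis); the committed implementation may differ in detail, and the shapes used here are illustrative.

import torch

def concat_speaker_embedding_sketch(outputs, speaker_embeddings):
    # outputs: B x T x D_en, speaker_embeddings: B x 1 x D_spk (assumed shapes)
    speaker_embeddings_ = speaker_embeddings.expand(
        outputs.size(0), outputs.size(1), -1)
    return torch.cat([outputs, speaker_embeddings_], dim=-1)

encoder_outputs = torch.rand(2, 37, 512)   # B x T x D_en
embedded_speakers = torch.rand(2, 1, 256)  # B x 1 x D_spk
print(concat_speaker_embedding_sketch(encoder_outputs, embedded_speakers).shape)
# torch.Size([2, 37, 768])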

mozilla_voice_tts/tts/models/tacotron_abstract.py

@@ -164,11 +164,22 @@ class TacotronAbstract(ABC, nn.Module):
             self.speaker_embeddings_projected = self.speaker_project_mel(
                 self.speaker_embeddings).squeeze(1)

-    def compute_gst(self, inputs, mel_specs):
+    def compute_gst(self, inputs, style_input):
         """ Compute global style token """
-        # pylint: disable=not-callable
-        gst_outputs = self.gst_layer(mel_specs)
-        inputs = self._add_speaker_embedding(inputs, gst_outputs)
+        device = inputs.device
+        if isinstance(style_input, dict):
+            query = torch.zeros(1, 1, self.gst_embedding_dim//2).to(device)
+            _GST = torch.tanh(self.gst_layer.style_token_layer.style_tokens)
+            gst_outputs = torch.zeros(1, 1, self.gst_embedding_dim).to(device)
+            for k_token, v_amplifier in style_input.items():
+                key = _GST[int(k_token)].unsqueeze(0).expand(1, -1, -1)
+                gst_outputs_att = self.gst_layer.style_token_layer.attention(query, key)
+                gst_outputs = gst_outputs + gst_outputs_att * v_amplifier
+        elif style_input is None:
+            gst_outputs = torch.zeros(1, 1, self.gst_embedding_dim).to(device)
+        else:
+            gst_outputs = self.gst_layer(style_input) # pylint: disable=not-callable
+        inputs = self._concat_speaker_embedding(inputs, gst_outputs)
         return inputs

     @staticmethod
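With compute_gst moved into TacotronAbstract, style conditioning is no longer limited to a reference mel spectrogram. The snippet below only restates the three style_input forms the new branches handle, with dummy values; the shapes, dict keys, and amplifier weights are assumptions for illustration, not values from the commit.

import torch

# 1) dict mapping a style-token index to an amplifier weight: the chosen GST
#    tokens are attended to and summed, each scaled by its weight.
style_input = {"1": 0.25, "4": 0.15}

# 2) None: no style conditioning, a zero GST vector is concatenated instead.
style_input = None

# 3) reference mel spectrogram (B x T x num_mels), fed through the GST layer.
style_input = torch.rand(1, 120, 80)

# At inference time this is what the style_mel argument of inference() carries,
# e.g. encoder_outputs = self.compute_gst(encoder_outputs, style_mel)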

mozilla_voice_tts/tts/utils/generic_utils.py

@ -265,6 +265,12 @@ def check_config(c):
check_argument('gst_num_heads', c['gst'], restricted=True, val_type=int, min_val=2, max_val=10) check_argument('gst_num_heads', c['gst'], restricted=True, val_type=int, min_val=2, max_val=10)
check_argument('gst_style_tokens', c['gst'], restricted=True, val_type=int, min_val=1, max_val=1000) check_argument('gst_style_tokens', c['gst'], restricted=True, val_type=int, min_val=1, max_val=1000)
check_argument('gst', c, restricted=True, val_type=dict)
check_argument('gst_style_input', c['gst'], restricted=True, val_type=[str, dict])
check_argument('gst_embedding_dim', c['gst'], restricted=True, val_type=int, min_val=0, max_val=1000)
check_argument('gst_num_heads', c['gst'], restricted=True, val_type=int, min_val=2, max_val=10)
check_argument('gst_style_tokens', c['gst'], restricted=True, val_type=int, min_val=1, max_val=1000)
# datasets - checking only the first entry # datasets - checking only the first entry
check_argument('datasets', c, restricted=True, val_type=list) check_argument('datasets', c, restricted=True, val_type=list)
for dataset_entry in c['datasets']: for dataset_entry in c['datasets']:
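The added lines above repeat GST checks that appear immediately before them and rely on the check_argument helper. For readers unfamiliar with it, the sketch below approximates the contract implied by the calls (required presence when restricted, allowed types, numeric bounds); it is an assumption, not the repository's implementation.

def check_argument(name, c, restricted=False, val_type=None, min_val=None, max_val=None):
    # presence: a restricted field must exist in the config dict
    if restricted:
        assert name in c, f" [!] {name} not defined in config.json"
    if name not in c:
        return
    # type: accept a single type or a list of allowed types
    if val_type is not None:
        types = val_type if isinstance(val_type, list) else [val_type]
        assert any(isinstance(c[name], t) for t in types), \
            f" [!] {name} has the wrong type, expected one of {types}"
    # numeric bounds
    if min_val is not None:
        assert c[name] >= min_val, f" [!] {name} must be >= {min_val}"
    if max_val is not None:
        assert c[name] <= max_val, f" [!] {name} must be <= {max_val}"

check_argument('gst', {'gst': {'gst_num_heads': 4}}, restricted=True, val_type=dict)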