diff --git a/.compute b/.compute
index 24578189..1a93820d 100644
--- a/.compute
+++ b/.compute
@@ -10,7 +10,7 @@ wget https://www.dropbox.com/s/wqn5v3wkktw9lmo/install.sh?dl=0 -O install.sh
 sudo sh install.sh
 python3 setup.py develop
 # cp -R ${USER_DIR}/GermanData ../tmp/
-# python3 distribute.py --config_path config.json --data_path /data/ro/shared/data/keithito/LJSpeech-1.1/
+python3 distribute.py --config_path config.json --data_path /data/ro/shared/data/keithito/LJSpeech-1.1/
 # cp -R ${USER_DIR}/Mozilla_22050 ../tmp/
 # python3 distribute.py --config_path config_tacotron_gst.json --data_path ../tmp/Mozilla_22050/
 python3 distribute.py --config_path config.json --data_path /data/rw/home/LibriTTS/train-clean-360
diff --git a/config.json b/config.json
index 4d56c3dc..741b82ac 100644
--- a/config.json
+++ b/config.json
@@ -1,6 +1,6 @@
 {
     "run_name": "ljspeech",
-    "run_description": "gradual training with prenet frame size 1 + no maxout for cbhg + symmetric norm.",
+    "run_description": "Tacotron2",
 
     "audio":{
         // Audio processing parameters
@@ -31,7 +31,7 @@
     "reinit_layers": [],
 
-    "model": "Tacotron",    // one of the model in models/
+    "model": "Tacotron2",   // one of the model in models/
     "grad_clip": 1,         // upper limit for gradients for clipping.
     "epochs": 1000,         // total number of epochs to train.
     "lr": 0.0001,           // Initial learning rate. If Noam decay is active, maximum learning rate.
@@ -55,10 +55,10 @@
     "batch_size": 32,       // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'.
     "eval_batch_size":16,
     "r": 7,                 // Number of decoder frames to predict per iteration. Set the initial values if gradual training is enabled.
-    "gradual_training": [[0, 7, 32], [10000, 5, 32], [50000, 3, 32], [130000, 2, 16], [290000, 1, 8]], // ONLY TACOTRON - set gradual training steps [first_step, r, batch_size]. If it is null, gradual training is disabled.
+    "gradual_training": [[0, 7, 32], [1, 5, 32], [50000, 3, 32], [130000, 2, 16], [290000, 1, 8]], // ONLY TACOTRON - set gradual training steps [first_step, r, batch_size]. If it is null, gradual training is disabled.
     "wd": 0.000001,         // Weight decay weight.
     "checkpoint": true,     // If true, it saves checkpoints per "save_step"
-    "save_step": 10000,     // Number of training steps expected to save traning stats and checkpoints.
+    "save_step": 10000,     // Number of training steps expected to save traninpg stats and checkpoints.
     "print_step": 25,       // Number of steps to log traning on console.
     "batch_group_size": 0,  //Number of batches to shuffle after bucketing.
diff --git a/layers/tacotron.py b/layers/tacotron.py
index 411e7e72..04781031 100644
--- a/layers/tacotron.py
+++ b/layers/tacotron.py
@@ -406,7 +406,7 @@ class Decoder(nn.Module):
             self.memory_input = new_memory[:, :self.memory_size * self.memory_dim]
         else:
             # use only the last frame prediction
-            self.memory_input = new_memory[:, :self.memory_dim]
+            self.memory_input = new_memory[:, self.memory_dim * (self.r - 1):]
 
     def forward(self, inputs, memory, mask, speaker_embeddings=None):
         """
diff --git a/layers/tacotron2.py b/layers/tacotron2.py
index 358d1807..c87ffc78 100644
--- a/layers/tacotron2.py
+++ b/layers/tacotron2.py
@@ -101,6 +101,7 @@ class Decoder(nn.Module):
                  forward_attn_mask, location_attn, separate_stopnet):
         super(Decoder, self).__init__()
         self.mel_channels = inputs_dim
+        self.r_init = r
         self.r = r
         self.encoder_embedding_dim = in_features
         self.separate_stopnet = separate_stopnet
@@ -111,8 +112,7 @@ class Decoder(nn.Module):
         self.gate_threshold = 0.5
         self.p_attention_dropout = 0.1
         self.p_decoder_dropout = 0.1
-
-        self.prenet = Prenet(self.mel_channels * r, prenet_type,
+        self.prenet = Prenet(self.mel_channels, prenet_type,
                              prenet_dropout,
                              [self.prenet_dim, self.prenet_dim],
                              bias=False)
@@ -135,44 +135,34 @@ class Decoder(nn.Module):
                                       self.decoder_rnn_dim, 1)
 
         self.linear_projection = Linear(self.decoder_rnn_dim + in_features,
-                                        self.mel_channels * r)
+                                        self.mel_channels * self.r_init)
 
         self.stopnet = nn.Sequential(
             nn.Dropout(0.1),
             Linear(
-                self.decoder_rnn_dim + self.mel_channels * r,
+                self.decoder_rnn_dim + self.mel_channels * self.r_init,
                 1,
                 bias=True,
                 init_gain='sigmoid'))
-
-        self.attention_rnn_init = nn.Embedding(1, self.query_dim)
-        self.go_frame_init = nn.Embedding(1, self.mel_channels * r)
-        self.decoder_rnn_inits = nn.Embedding(1, self.decoder_rnn_dim)
         self.memory_truncated = None
 
+    def set_r(self, new_r):
+        self.r = new_r
+
     def get_go_frame(self, inputs):
         B = inputs.size(0)
-        memory = self.go_frame_init(inputs.data.new_zeros(B).long())
+        memory = torch.zeros(B, self.mel_channels * self.r, device=inputs.device)
         return memory
 
     def _init_states(self, inputs, mask, keep_states=False):
         B = inputs.size(0)
         # T = inputs.size(1)
-        if not keep_states:
-            self.query = self.attention_rnn_init(
-                inputs.data.new_zeros(B).long())
-            self.attention_rnn_cell_state = Variable(
-                inputs.data.new(B, self.query_dim).zero_())
-
-            self.decoder_hidden = self.decoder_rnn_inits(
-                inputs.data.new_zeros(B).long())
-            self.decoder_cell = Variable(
-                inputs.data.new(B, self.decoder_rnn_dim).zero_())
-
-            self.context = Variable(
-                inputs.data.new(B, self.encoder_embedding_dim).zero_())
-
+        self.query = torch.zeros(B, self.query_dim, device=inputs.device)
+        self.attention_rnn_cell_state = torch.zeros(B, self.query_dim, device=inputs.device)
+        self.decoder_hidden = torch.zeros(B, self.decoder_rnn_dim, device=inputs.device)
+        self.decoder_cell = torch.zeros(B, self.decoder_rnn_dim, device=inputs.device)
+        self.context = torch.zeros(B, self.encoder_embedding_dim, device=inputs.device)
         self.inputs = inputs
         self.processed_inputs = self.attention.inputs_layer(inputs)
         self.mask = mask
@@ -192,6 +182,9 @@ class Decoder(nn.Module):
         outputs = outputs.transpose(1, 2)
         return outputs, stop_tokens, alignments
 
+    def _update_memory(self, memory):
+        return memory[:, :, self.mel_channels * (self.r - 1):]
+
     def decode(self, memory):
         query_input = torch.cat((memory, self.context), -1)
         self.query, self.attention_rnn_cell_state = self.attention_rnn(
@@ -223,13 +216,14 @@ class Decoder(nn.Module):
             stop_token = self.stopnet(stopnet_input.detach())
         else:
             stop_token = self.stopnet(stopnet_input)
+        decoder_output = decoder_output[:, :self.r * self.mel_channels]
         return decoder_output, stop_token, self.attention.attention_weights
 
     def forward(self, inputs, memories, mask):
         memory = self.get_go_frame(inputs).unsqueeze(0)
         memories = self._reshape_memory(memories)
         memories = torch.cat((memory, memories), dim=0)
-        memories = self.prenet(memories)
+        memories = self.prenet(self._update_memory(memories))
 
         self._init_states(inputs, mask=mask)
         self.attention.init_states(inputs)
@@ -277,7 +271,7 @@ class Decoder(nn.Module):
                 print("   | > Decoder stopped with 'max_decoder_steps")
                 break
 
-            memory = mel_output
+            memory = self._update_memory(mel_output)
             t += 1
 
         outputs, stop_tokens, alignments = self._parse_outputs(
diff --git a/train.py b/train.py
index 13444c82..d8cdf1fb 100644
--- a/train.py
+++ b/train.py
@@ -62,7 +62,7 @@ def setup_loader(ap, is_val=False, verbose=False):
     dataset = MyDataset(
         c.r,
         c.text_cleaner,
-        meta_data=meta_data_eval if is_val else meta_data_train,
+        meta_data=meta_data_eval if is_val else meta_data_train[:64],
         ap=ap,
         batch_group_size=0 if is_val else c.batch_group_size * c.batch_size,
         min_seq_len=c.min_seq_len,
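The patch makes the Tacotron2 decoder compatible with gradual training: the output projection and stopnet stay sized for the initial reduction factor (`r_init`), each decode step slices the first `r * mel_channels` values, only the last predicted frame is fed through the prenet (`_update_memory`), and `set_r()` lets the current `r` be changed at run time. Below is a minimal sketch of how a training loop could drive `set_r()` from the `gradual_training` schedule in config.json; the helper name `gradual_training_scheduler` and the `model.decoder` attribute are illustrative assumptions, not code from this patch.

    def gradual_training_scheduler(global_step, schedule):
        """Pick the (r, batch_size) pair for the current training step.

        `schedule` follows the config format [[first_step, r, batch_size], ...],
        e.g. [[0, 7, 32], [10000, 5, 32], [50000, 3, 32], [130000, 2, 16], [290000, 1, 8]].
        Entries are assumed sorted by first_step; the last entry whose
        first_step has been reached wins.
        """
        new_r, new_batch_size = schedule[0][1], schedule[0][2]
        for first_step, r, batch_size in schedule:
            if global_step >= first_step:
                new_r, new_batch_size = r, batch_size
        return new_r, new_batch_size

    # Inside the training loop (illustrative usage):
    # new_r, new_bs = gradual_training_scheduler(global_step, c.gradual_training)
    # if new_r != model.decoder.r:
    #     model.decoder.set_r(new_r)  # projection stays mel_channels * r_init wide;
    #                                 # decode() then keeps only the first new_r frames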