From d47ba5d310377026f612bbe480c88be117c565a1 Mon Sep 17 00:00:00 2001 From: Eren Golge Date: Sat, 20 Jul 2019 12:33:21 +0200 Subject: [PATCH 01/57] gradual traning with memory queue --- config_libritts.json | 10 ++++----- layers/tacotron.py | 50 +++++++++++++++++++++++++------------------- models/tacotron.py | 2 +- train.py | 21 +++++++++++++++++++ 4 files changed, 55 insertions(+), 28 deletions(-) diff --git a/config_libritts.json b/config_libritts.json index f9a752ec..22f03dd4 100644 --- a/config_libritts.json +++ b/config_libritts.json @@ -1,6 +1,6 @@ { "run_name": "libritts-360", - "run_description": "LibriTTS 360 clean with multi speaker embedding.", + "run_description": "LibriTTS 360 gradual traning with memory queue.", "audio":{ // Audio processing parameters @@ -31,13 +31,13 @@ "reinit_layers": [], - "model": "Tacotron2", // one of the model in models/ + "model": "Tacotron", // one of the model in models/ "grad_clip": 1, // upper limit for gradients for clipping. "epochs": 1000, // total number of epochs to train. "lr": 0.0001, // Initial learning rate. If Noam decay is active, maximum learning rate. "lr_decay": false, // if true, Noam learning rate decaying is applied through training. "warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr" - "memory_size": 5, // ONLY TACOTRON - memory queue size used to queue network predictions to feed autoregressive connection. Useful if r < 5. + "memory_size": 7, // ONLY TACOTRON - memory queue size used to queue network predictions to feed autoregressive connection. Useful if r < 5. "attention_norm": "sigmoid", // softmax or sigmoid. Suggested to use softmax for Tacotron2 and sigmoid for Tacotron. "prenet_type": "original", // ONLY TACOTRON2 - "original" or "bn". "prenet_dropout": true, // ONLY TACOTRON2 - enable/disable dropout at prenet. @@ -52,9 +52,9 @@ "separate_stopnet": true, // Train stopnet seperately if 'stopnet==true'. It prevents stopnet loss to influence the rest of the model. It causes a better model, but it trains SLOWER. "tb_model_param_stats": true, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging. - "batch_size": 24, // Batch size for training. Lower values than 32 might cause hard to learn attention. + "batch_size": 32, // Batch size for training. Lower values than 32 might cause hard to learn attention. "eval_batch_size":16, - "r": 1, // Number of frames to predict for step. + "r": 7, // Number of frames to predict for step. "wd": 0.000001, // Weight decay weight. "checkpoint": true, // If true, it saves checkpoints per "save_step" "save_step": 1000, // Number of training steps expected to save traning stats and checkpoints. diff --git a/layers/tacotron.py b/layers/tacotron.py index b71ddbc3..aa6ca4a6 100644 --- a/layers/tacotron.py +++ b/layers/tacotron.py @@ -270,12 +270,14 @@ class Decoder(nn.Module): memory_size (int): size of the past window. if <= 0 memory_size = r TODO: arguments """ + # Pylint gets confused by PyTorch conventions here #pylint: disable=attribute-defined-outside-init def __init__(self, in_features, memory_dim, r, memory_size, attn_windowing, attn_norm, prenet_type, prenet_dropout, forward_attn, - trans_agent, forward_attn_mask, location_attn, separate_stopnet): + trans_agent, forward_attn_mask, location_attn, + separate_stopnet): super(Decoder, self).__init__() self.r = r self.in_features = in_features @@ -291,17 +293,18 @@ class Decoder(nn.Module): out_features=[256, 128]) # processed_inputs, processed_memory -> |Attention| -> Attention, attention, RNN_State self.attention_rnn = nn.GRUCell(in_features + 128, 256) - self.attention_layer = Attention(attention_rnn_dim=256, - embedding_dim=in_features, - attention_dim=128, - location_attention=location_attn, - attention_location_n_filters=32, - attention_location_kernel_size=31, - windowing=attn_windowing, - norm=attn_norm, - forward_attn=forward_attn, - trans_agent=trans_agent, - forward_attn_mask=forward_attn_mask) + self.attention_layer = Attention( + attention_rnn_dim=256, + embedding_dim=in_features, + attention_dim=128, + location_attention=location_attn, + attention_location_n_filters=32, + attention_location_kernel_size=31, + windowing=attn_windowing, + norm=attn_norm, + forward_attn=forward_attn, + trans_agent=trans_agent, + forward_attn_mask=forward_attn_mask) # (processed_memory | attention context) -> |Linear| -> decoder_RNN_input self.project_to_decoder_in = nn.Linear(256 + in_features, 256) # decoder_RNN_input -> |RNN| -> RNN_state @@ -324,6 +327,9 @@ class Decoder(nn.Module): self.proj_to_mel.weight, gain=torch.nn.init.calculate_gain('linear')) + def _set_r(self, new_r): + self.r = new_r + def _reshape_memory(self, memory): """ Reshape the spectrograms for given 'r' @@ -371,8 +377,11 @@ class Decoder(nn.Module): # Prenet processed_memory = self.prenet(self.memory_input) # Attention RNN - self.attention_rnn_hidden = self.attention_rnn(torch.cat((processed_memory, self.current_context_vec), -1), self.attention_rnn_hidden) - self.current_context_vec = self.attention_layer(self.attention_rnn_hidden, inputs, self.processed_inputs, mask) + self.attention_rnn_hidden = self.attention_rnn( + torch.cat((processed_memory, self.current_context_vec), -1), + self.attention_rnn_hidden) + self.current_context_vec = self.attention_layer( + self.attention_rnn_hidden, inputs, self.processed_inputs, mask) # Concat RNN output and attention context vector decoder_input = self.project_to_decoder_in( torch.cat((self.attention_rnn_hidden, self.current_context_vec), @@ -395,17 +404,14 @@ class Decoder(nn.Module): stop_token = self.stopnet(stopnet_input.detach()) else: stop_token = self.stopnet(stopnet_input) + output = output[:, : self.r * self.memory_dim] return output, stop_token, self.attention_layer.attention_weights def _update_memory_queue(self, new_memory): - if self.memory_size > 0 and new_memory.shape[-1] < self.memory_size: - self.memory_input = torch.cat([ - self.memory_input[:, self.r * self.memory_dim:].clone(), - new_memory - ], - dim=-1) - else: - self.memory_input = new_memory + self.memory_input = torch.cat([ + self.memory_input[:, self.r * self.memory_dim:].clone(), new_memory + ], + dim=-1) def forward(self, inputs, memory, mask): """ diff --git a/models/tacotron.py b/models/tacotron.py index b7f40683..18c74904 100644 --- a/models/tacotron.py +++ b/models/tacotron.py @@ -39,7 +39,7 @@ class Tacotron(nn.Module): self.last_linear = nn.Sequential( nn.Linear(self.postnet.cbhg.gru_features * 2, linear_dim), nn.Sigmoid()) - + def forward(self, characters, text_lengths, mel_specs, speaker_ids=None): B = characters.size(0) mask = sequence_mask(text_lengths).to(characters.device) diff --git a/train.py b/train.py index 815a0a32..730d7389 100644 --- a/train.py +++ b/train.py @@ -81,6 +81,20 @@ def setup_loader(ap, is_val=False, verbose=False): return loader +def gradual_training_scheduler(global_step): + if global_step < 10000: + r, batch_size = 7, 32 + elif global_step < 50000: + r, batch_size = 5, 32 + elif global_step < 130000: + r, batch_size = 3, 32 + elif global_step < 290000: + r, batch_size = 2, 16 + else: + r, batch_size = 1, 16 + return r, batch_size + + def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler, ap, epoch): data_loader = setup_loader(ap, is_val=False, verbose=(epoch == 0)) @@ -524,7 +538,14 @@ def main(args): #pylint: disable=redefined-outer-name if 'best_loss' not in locals(): best_loss = float('inf') + current_step = 0 for epoch in range(0, c.epochs): + # set gradual training + r, c.batch_size = gradual_training_scheduler(current_step) + c.r = r + model.decoder._set_r(r) + print(" > Number of outputs per iteration:", model.decoder.r) + train_loss, current_step = train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler, ap, epoch) From 721e781216e782722b2603fbed8b5e2fe265e0c1 Mon Sep 17 00:00:00 2001 From: Eren Golge Date: Sun, 21 Jul 2019 16:32:56 +0200 Subject: [PATCH 02/57] config update --- config.json | 38 ++++++++++++++++++++------------------ 1 file changed, 20 insertions(+), 18 deletions(-) diff --git a/config.json b/config.json index 17b9207f..ea65c9f0 100644 --- a/config.json +++ b/config.json @@ -1,6 +1,6 @@ { - "run_name": "mozilla-no-loc-fattn-stopnet-sigmoid-loss_masking", - "run_description": "using forward attention, with original prenet, loss masking,separate stopnet, sigmoid. Compare this with 4817. Pytorch DPP", + "run_name": "ljspeech", + "run_description": "gradual training with prenet frame size 1. Comparing to memory queue in gradual training. ", "audio":{ // Audio processing parameters @@ -31,43 +31,45 @@ "reinit_layers": [], - "model": "Tacotron2", // one of the model in models/ + "model": "Tacotron", // one of the model in models/ "grad_clip": 1, // upper limit for gradients for clipping. "epochs": 1000, // total number of epochs to train. "lr": 0.0001, // Initial learning rate. If Noam decay is active, maximum learning rate. "lr_decay": false, // if true, Noam learning rate decaying is applied through training. "warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr" - "windowing": false, // Enables attention windowing. Used only in eval mode. - "memory_size": 5, // ONLY TACOTRON - memory queue size used to queue network predictions to feed autoregressive connection. Useful if r < 5. + "memory_size": -1, // ONLY TACOTRON - size of the memory queue used fro storing last decoder predictions for auto-regression. If < 0, memory queue is disabled and decoder only uses the last prediction frame. "attention_norm": "sigmoid", // softmax or sigmoid. Suggested to use softmax for Tacotron2 and sigmoid for Tacotron. "prenet_type": "original", // ONLY TACOTRON2 - "original" or "bn". "prenet_dropout": true, // ONLY TACOTRON2 - enable/disable dropout at prenet. - "use_forward_attn": true, // ONLY TACOTRON2 - if it uses forward attention. In general, it aligns faster. + "windowing": false, // Enables attention windowing. Used only in eval mode. + "use_forward_attn": false, // ONLY TACOTRON2 - if it uses forward attention. In general, it aligns faster. + "forward_attn_mask": false, "transition_agent": false, // ONLY TACOTRON2 - enable/disable transition agent of forward attention. - "location_attn": false, // ONLY TACOTRON2 - enable_disable location sensitive attention. It is enabled for TACOTRON by default. + "location_attn": true, // ONLY TACOTRON2 - enable_disable location sensitive attention. It is enabled for TACOTRON by default. "loss_masking": true, // enable / disable loss masking against the sequence padding. "enable_eos_bos_chars": false, // enable/disable beginning of sentence and end of sentence chars. "stopnet": true, // Train stopnet predicting the end of synthesis. "separate_stopnet": true, // Train stopnet seperately if 'stopnet==true'. It prevents stopnet loss to influence the rest of the model. It causes a better model, but it trains SLOWER. - "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging. + "tb_model_param_stats": true, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging. - "batch_size": 32, // Batch size for training. Lower values than 32 might cause hard to learn attention. + "batch_size": 32, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'. "eval_batch_size":16, - "r": 1, // Number of frames to predict for step. + "r": 7, // Number of decoder frames to predict per iteration. Set the initial values if gradual training is enabled. + "gradual_training": [[0, 7, 32], [10000, 5, 32], [50000, 3, 32], [130000, 2, 16], [290000, 1, 8]], // set gradual training steps [first_step, r, batch_size]. If it is null, gradual training is disabled. "wd": 0.000001, // Weight decay weight. "checkpoint": true, // If true, it saves checkpoints per "save_step" - "save_step": 1000, // Number of training steps expected to save traning stats and checkpoints. - "print_step": 10, // Number of steps to log traning on console. + "save_step": 10000, // Number of training steps expected to save traning stats and checkpoints. + "print_step": 25, // Number of steps to log traning on console. "batch_group_size": 0, //Number of batches to shuffle after bucketing. "run_eval": true, "test_delay_epochs": 5, //Until attention is aligned, testing only wastes computation time. "test_sentences_file": null, // set a file to load sentences to be used for testing. If it is null then we use default english sentences. - "data_path": "/media/erogol/data_ssd/Data/Mozilla/", // DATASET-RELATED: can overwritten from command argument - "meta_file_train": "metadata_train.txt", // DATASET-RELATED: metafile for training dataloader. - "meta_file_val": "metadata_val.txt", // DATASET-RELATED: metafile for evaluation dataloader. - "dataset": "mozilla", // DATASET-RELATED: one of TTS.dataset.preprocessors depending on your target dataset. Use "tts_cache" for pre-computed dataset by extract_features.py - "min_seq_len": 0, // DATASET-RELATED: minimum text length to use in training + "data_path": "/home/erogol/Data/LJSpeech-1.1/", // DATASET-RELATED: can overwritten from command argument + "meta_file_train": "metadata_train.csv", // DATASET-RELATED: metafile for training dataloader. + "meta_file_val": "metadata_val.csv", // DATASET-RELATED: metafile for evaluation dataloader. + "dataset": "ljspeech", // DATASET-RELATED: one of TTS.dataset.preprocessors depending on your target dataset. Use "tts_cache" for pre-computed dataset by extract_features.py + "min_seq_len": 6, // DATASET-RELATED: minimum text length to use in training "max_seq_len": 150, // DATASET-RELATED: maximum text length "output_path": "../keep/", // DATASET-RELATED: output path for all training outputs. "num_loader_workers": 4, // number of training data loader processes. Don't set it too big. 4-8 are good values. @@ -76,6 +78,6 @@ "use_phonemes": true, // use phonemes instead of raw characters. It is suggested for better pronounciation. "phoneme_language": "en-us", // depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages "text_cleaner": "phoneme_cleaners", - "use_speaker_embedding": false // whether to use additional embeddings for separate speakers + "use_speaker_embedding": false } From f4eaec12648addb76b56e0c74faa3ddb4c60cad9 Mon Sep 17 00:00:00 2001 From: Eren Golge Date: Mon, 22 Jul 2019 02:06:26 +0200 Subject: [PATCH 03/57] compute update --- .compute | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.compute b/.compute index 63dea7a7..002f9136 100644 --- a/.compute +++ b/.compute @@ -10,7 +10,7 @@ wget https://www.dropbox.com/s/wqn5v3wkktw9lmo/install.sh?dl=0 -O install.sh sudo sh install.sh python3 setup.py develop # cp -R ${USER_DIR}/GermanData ../tmp/ -python3 distribute.py --config_path config_libritts.json --data_path /data/rw/home/LibriTTS/train-clean-360/ +python3 distribute.py --config_path config.json --data_path /data/ro/shared/data/keithito/LJSpeech-1.1/ # cp -R ${USER_DIR}/Mozilla_22050 ../tmp/ # python3 distribute.py --config_path config_tacotron_gst.json --data_path ../tmp/Mozilla_22050/ while true; do sleep 1000000; done From f038b1aa3fd670b406d7ad6898a613e4870c6c60 Mon Sep 17 00:00:00 2001 From: Eren Golge Date: Mon, 22 Jul 2019 02:10:21 +0200 Subject: [PATCH 04/57] new way of handling memory queue and enable/disable queuing in right/wrong conditions --- layers/tacotron.py | 54 +++++++++++++++++++++++----------------------- 1 file changed, 27 insertions(+), 27 deletions(-) diff --git a/layers/tacotron.py b/layers/tacotron.py index aa6ca4a6..474c1646 100644 --- a/layers/tacotron.py +++ b/layers/tacotron.py @@ -279,15 +279,17 @@ class Decoder(nn.Module): trans_agent, forward_attn_mask, location_attn, separate_stopnet): super(Decoder, self).__init__() + self.r_init = r self.r = r self.in_features = in_features self.max_decoder_steps = 500 + self.use_memory_queue = memory_size > 0 self.memory_size = memory_size if memory_size > 0 else r self.memory_dim = memory_dim self.separate_stopnet = separate_stopnet # memory -> |Prenet| -> processed_memory self.prenet = Prenet( - memory_dim * self.memory_size, + memory_dim * self.memory_size if self.use_memory_queue else memory_dim, prenet_type, prenet_dropout, out_features=[256, 128]) @@ -311,21 +313,9 @@ class Decoder(nn.Module): self.decoder_rnns = nn.ModuleList( [nn.GRUCell(256, 256) for _ in range(2)]) # RNN_state -> |Linear| -> mel_spec - self.proj_to_mel = nn.Linear(256, memory_dim * r) + self.proj_to_mel = nn.Linear(256, memory_dim * self.r_init) # learn init values instead of zero init. - self.attention_rnn_init = nn.Embedding(1, 256) - self.memory_init = nn.Embedding(1, self.memory_size * memory_dim) - self.decoder_rnn_inits = nn.Embedding(2, 256) - self.stopnet = StopNet(256 + memory_dim * r) - # self.init_layers() - - def init_layers(self): - torch.nn.init.xavier_uniform_( - self.project_to_decoder_in.weight, - gain=torch.nn.init.calculate_gain('linear')) - torch.nn.init.xavier_uniform_( - self.proj_to_mel.weight, - gain=torch.nn.init.calculate_gain('linear')) + self.stopnet = StopNet(256 + memory_dim * self.r_init) def _set_r(self, new_r): self.r = new_r @@ -350,13 +340,14 @@ class Decoder(nn.Module): B = inputs.size(0) T = inputs.size(1) # go frame as zeros matrix - self.memory_input = self.memory_init(inputs.data.new_zeros(B).long()) - + if self.use_memory_queue: + self.memory_input = torch.zeros(B, self.memory_dim * self.memory_size, device=inputs.device) + else: + self.memory_input = torch.zeros(B, self.memory_dim, device=inputs.device) # decoder states - self.attention_rnn_hidden = self.attention_rnn_init( - inputs.data.new_zeros(B).long()) + self.attention_rnn_hidden = torch.zeros(B, 256, device=inputs.device) self.decoder_rnn_hiddens = [ - self.decoder_rnn_inits(inputs.data.new_tensor([idx] * B).long()) + torch.zeros(B, 256, device=inputs.device) for idx in range(len(self.decoder_rnns)) ] self.current_context_vec = inputs.data.new(B, self.in_features).zero_() @@ -407,11 +398,20 @@ class Decoder(nn.Module): output = output[:, : self.r * self.memory_dim] return output, stop_token, self.attention_layer.attention_weights - def _update_memory_queue(self, new_memory): - self.memory_input = torch.cat([ - self.memory_input[:, self.r * self.memory_dim:].clone(), new_memory - ], - dim=-1) + def _update_memory_input(self, new_memory): + if self.use_memory_queue: + if self.memory_size > self.r: + # memory queue size is larger than number of frames per decoder iter + self.memory_input = torch.cat([ + self.memory_input[:, self.r * self.memory_dim:].clone(), new_memory + ], + dim=-1) + else: + # memory queue size smaller than number of frames per decoder iter + self.memory_input = new_memory[:, (self.r - self.memory_size)*self.memory_dim:] + else: + # use only the last frame prediction + self.memory_input = new_memory[:, (self.r-1) * self.memory_dim:] def forward(self, inputs, memory, mask): """ @@ -437,7 +437,7 @@ class Decoder(nn.Module): while len(outputs) < memory.size(0): if t > 0: new_memory = memory[t - 1] - self._update_memory_queue(new_memory) + self._update_memory_input(new_memory) output, stop_token, attention = self.decode(inputs, mask) outputs += [output] attentions += [attention] @@ -464,7 +464,7 @@ class Decoder(nn.Module): while True: if t > 0: new_memory = outputs[-1] - self._update_memory_queue(new_memory) + self._update_memory_input(new_memory) output, stop_token, attention = self.decode(inputs, None) stop_token = torch.sigmoid(stop_token.data) outputs += [output] From ee706b50f68896cfb20f5163455f09af364cf449 Mon Sep 17 00:00:00 2001 From: Eren Golge Date: Mon, 22 Jul 2019 02:11:20 +0200 Subject: [PATCH 05/57] enalbe graudal training by config.json --- train.py | 46 +++++++++++++++++++----------------------- utils/generic_utils.py | 7 +++++++ 2 files changed, 28 insertions(+), 25 deletions(-) diff --git a/train.py b/train.py index 730d7389..591cbd76 100644 --- a/train.py +++ b/train.py @@ -20,7 +20,7 @@ from utils.generic_utils import (NoamLR, check_update, count_parameters, load_config, remove_experiment_folder, save_best_model, save_checkpoint, weight_decay, set_init_dict, copy_config_file, setup_model, - split_dataset) + split_dataset, gradual_training_scheduler) from utils.logger import Logger from utils.speakers import load_speaker_mapping, save_speaker_mapping, \ get_speakers @@ -81,20 +81,6 @@ def setup_loader(ap, is_val=False, verbose=False): return loader -def gradual_training_scheduler(global_step): - if global_step < 10000: - r, batch_size = 7, 32 - elif global_step < 50000: - r, batch_size = 5, 32 - elif global_step < 130000: - r, batch_size = 3, 32 - elif global_step < 290000: - r, batch_size = 2, 16 - else: - r, batch_size = 1, 16 - return r, batch_size - - def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler, ap, epoch): data_loader = setup_loader(ap, is_val=False, verbose=(epoch == 0)) @@ -106,8 +92,10 @@ def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler, avg_decoder_loss = 0 avg_stop_loss = 0 avg_step_time = 0 + avg_loader_time = 0 print("\n > Epoch {}/{}".format(epoch, c.epochs), flush=True) batch_n_iter = int(len(data_loader.dataset) / (c.batch_size * num_gpus)) + end_time = time.time() for num_iter, data in enumerate(data_loader): start_time = time.time() @@ -121,6 +109,7 @@ def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler, stop_targets = data[6] avg_text_length = torch.mean(text_lengths.float()) avg_spec_length = torch.mean(mel_lengths.float()) + loader_time = time.time() - end_time if c.use_speaker_embedding: speaker_ids = [speaker_mapping[speaker_name] @@ -191,17 +180,16 @@ def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler, else: grad_norm_st = 0 - step_time = time.time() - start_time - epoch_time += step_time - if current_step % c.print_step == 0: print( " | > Step:{}/{} GlobalStep:{} TotalLoss:{:.5f} PostnetLoss:{:.5f} " "DecoderLoss:{:.5f} StopLoss:{:.5f} GradNorm:{:.5f} " - "GradNormST:{:.5f} AvgTextLen:{:.1f} AvgSpecLen:{:.1f} StepTime:{:.2f} LR:{:.6f}".format( + "GradNormST:{:.5f} AvgTextLen:{:.1f} AvgSpecLen:{:.1f} StepTime:{:.2f} " + "LoaderTime:{:.2f} LR:{:.6f}".format( num_iter, batch_n_iter, current_step, loss.item(), postnet_loss.item(), decoder_loss.item(), stop_loss.item(), - grad_norm, grad_norm_st, avg_text_length, avg_spec_length, step_time, current_lr), + grad_norm, grad_norm_st, avg_text_length, avg_spec_length, step_time, + loader_time, current_lr), flush=True) # aggregate losses from processes @@ -216,6 +204,7 @@ def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler, avg_decoder_loss += float(decoder_loss.item()) avg_stop_loss += stop_loss if isinstance(stop_loss, float) else float(stop_loss.item()) avg_step_time += step_time + avg_loader_time += loader_time # Plot Training Iter Stats iter_stats = {"loss_posnet": postnet_loss.item(), @@ -254,11 +243,15 @@ def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler, {'TrainAudio': train_audio}, c.audio["sample_rate"]) + step_time = end_time - start_time + epoch_time += step_time + avg_postnet_loss /= (num_iter + 1) avg_decoder_loss /= (num_iter + 1) avg_stop_loss /= (num_iter + 1) avg_total_loss = avg_decoder_loss + avg_postnet_loss + avg_stop_loss avg_step_time /= (num_iter + 1) + avg_loader_time /= (num_iter + 1) # print epoch stats print( @@ -267,7 +260,8 @@ def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler, "AvgStopLoss:{:.5f} EpochTime:{:.2f} " "AvgStepTime:{:.2f}".format(current_step, avg_total_loss, avg_postnet_loss, avg_decoder_loss, - avg_stop_loss, epoch_time, avg_step_time), + avg_stop_loss, epoch_time, avg_step_time, + avg_loader_time), flush=True) # Plot Epoch Stats @@ -281,6 +275,7 @@ def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler, if c.tb_model_param_stats: tb_logger.tb_model_weights(model, current_step) + end_time = time.time() return avg_postnet_loss, current_step @@ -541,9 +536,10 @@ def main(args): #pylint: disable=redefined-outer-name current_step = 0 for epoch in range(0, c.epochs): # set gradual training - r, c.batch_size = gradual_training_scheduler(current_step) - c.r = r - model.decoder._set_r(r) + if c.gradual_training is not None: + r, c.batch_size = gradual_training_scheduler(current_step, c) + c.r = r + model.decoder._set_r(r) print(" > Number of outputs per iteration:", model.decoder.r) train_loss, current_step = train(model, criterion, criterion_st, @@ -592,7 +588,7 @@ if __name__ == '__main__': '--output_folder', type=str, default='', - help='folder name for traning outputs.' + help='folder name for training outputs.' ) # DISTRUBUTED diff --git a/utils/generic_utils.py b/utils/generic_utils.py index 64414765..8a64dbae 100644 --- a/utils/generic_utils.py +++ b/utils/generic_utils.py @@ -305,3 +305,10 @@ def split_dataset(items): else: return items[:eval_split_size], items[eval_split_size:] + +def gradual_training_scheduler(global_step, config): + new_values = None + for values in config.gradual_training: + if global_step >= values[0]: + new_values = values + return new_values[1], new_values[2] \ No newline at end of file From 2bbb3f7a400b92780451caf12d6a4c7729b5302d Mon Sep 17 00:00:00 2001 From: Eren Golge Date: Mon, 22 Jul 2019 15:09:05 +0200 Subject: [PATCH 06/57] don't use sigmoid output for tacotron, fix bug for memory queue handling, remove maxout --- config.json | 8 ++++---- layers/tacotron.py | 12 ++++-------- models/tacotron.py | 4 +--- 3 files changed, 9 insertions(+), 15 deletions(-) diff --git a/config.json b/config.json index ea65c9f0..ae29c287 100644 --- a/config.json +++ b/config.json @@ -1,6 +1,6 @@ { "run_name": "ljspeech", - "run_description": "gradual training with prenet frame size 1. Comparing to memory queue in gradual training. ", + "run_description": "gradual training with prenet frame size 1 + no maxout for cbhg + symmetric norm.", "audio":{ // Audio processing parameters @@ -16,8 +16,8 @@ "griffin_lim_iters": 60,// #griffin-lim iterations. 30-60 is a good range. Larger the value, slower the generation. // Normalization parameters "signal_norm": true, // normalize the spec values in range [0, 1] - "symmetric_norm": false, // move normalization to range [-1, 1] - "max_norm": 1, // scale normalization to range [-max_norm, max_norm] or [0, max_norm] + "symmetric_norm": true, // move normalization to range [-1, 1] + "max_norm": 4, // scale normalization to range [-max_norm, max_norm] or [0, max_norm] "clip_norm": true, // clip normalized values into the range. "mel_fmin": 0.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!! "mel_fmax": 8000.0, // maximum freq level for mel-spec. Tune for dataset!! @@ -71,7 +71,7 @@ "dataset": "ljspeech", // DATASET-RELATED: one of TTS.dataset.preprocessors depending on your target dataset. Use "tts_cache" for pre-computed dataset by extract_features.py "min_seq_len": 6, // DATASET-RELATED: minimum text length to use in training "max_seq_len": 150, // DATASET-RELATED: maximum text length - "output_path": "../keep/", // DATASET-RELATED: output path for all training outputs. + "output_path": "/media/erogol/data_ssd/Models/libri_tts/", // DATASET-RELATED: output path for all training outputs. "num_loader_workers": 4, // number of training data loader processes. Don't set it too big. 4-8 are good values. "num_val_loader_workers": 4, // number of evaluation data loader processes. "phoneme_cache_path": "mozilla_us_phonemes", // phoneme computation is slow, therefore, it caches results in the given folder. diff --git a/layers/tacotron.py b/layers/tacotron.py index 474c1646..1cc32a0b 100644 --- a/layers/tacotron.py +++ b/layers/tacotron.py @@ -135,9 +135,6 @@ class CBHG(nn.Module): ]) # max pooling of conv bank, with padding # TODO: try average pooling OR larger kernel size - self.max_pool1d = nn.Sequential( - nn.ConstantPad1d([0, 1], value=0), - nn.MaxPool1d(kernel_size=2, stride=1, padding=0)) out_features = [K * conv_bank_features] + conv_projections[:-1] activations = [self.relu] * (len(conv_projections) - 1) activations += [None] @@ -186,7 +183,6 @@ class CBHG(nn.Module): outs.append(out) x = torch.cat(outs, dim=1) assert x.size(1) == self.conv_bank_features * len(self.conv1d_banks) - x = self.max_pool1d(x) for conv1d in self.conv1d_projections: x = conv1d(x) # (B, T_in, hid_feature) @@ -387,7 +383,7 @@ class Decoder(nn.Module): del decoder_input # predict mel vectors from decoder vectors output = self.proj_to_mel(decoder_output) - output = torch.sigmoid(output) + # output = torch.sigmoid(output) # predict stop token stopnet_input = torch.cat([decoder_output, output], -1) del decoder_output @@ -403,15 +399,15 @@ class Decoder(nn.Module): if self.memory_size > self.r: # memory queue size is larger than number of frames per decoder iter self.memory_input = torch.cat([ - self.memory_input[:, self.r * self.memory_dim:].clone(), new_memory + new_memory, self.memory_input[:, :(self.memory_size - self.r) * self.memory_dim].clone() ], dim=-1) else: # memory queue size smaller than number of frames per decoder iter - self.memory_input = new_memory[:, (self.r - self.memory_size)*self.memory_dim:] + self.memory_input = new_memory[:, :self.memory_size * self.memory_dim] else: # use only the last frame prediction - self.memory_input = new_memory[:, (self.r-1) * self.memory_dim:] + self.memory_input = new_memory[:, :self.memory_dim] def forward(self, inputs, memory, mask): """ diff --git a/models/tacotron.py b/models/tacotron.py index 18c74904..bf312db4 100644 --- a/models/tacotron.py +++ b/models/tacotron.py @@ -36,9 +36,7 @@ class Tacotron(nn.Module): forward_attn, trans_agent, forward_attn_mask, location_attn, separate_stopnet) self.postnet = PostCBHG(mel_dim) - self.last_linear = nn.Sequential( - nn.Linear(self.postnet.cbhg.gru_features * 2, linear_dim), - nn.Sigmoid()) + self.last_linear = nn.Linear(self.postnet.cbhg.gru_features * 2, linear_dim) def forward(self, characters, text_lengths, mel_specs, speaker_ids=None): B = characters.size(0) From 1827f77752c279e489a9fcee44f68d28b32d7412 Mon Sep 17 00:00:00 2001 From: Eren Golge Date: Mon, 22 Jul 2019 15:10:06 +0200 Subject: [PATCH 07/57] demo server update --- server/conf.json | 13 +++++++------ server/synthesizer.py | 17 ++++++++++++++--- 2 files changed, 21 insertions(+), 9 deletions(-) diff --git a/server/conf.json b/server/conf.json index ba8d5016..6341596d 100644 --- a/server/conf.json +++ b/server/conf.json @@ -1,11 +1,12 @@ { - "tts_path":"/media/erogol/data_ssd/Data/models/ljspeech_models/ljspeech-April-08-2019_07+32PM-8a47b46/", // tts model root folder - "tts_file":"checkpoint_261000.pth.tar", // tts checkpoint file + "tts_path":"/media/erogol/data_ssd/Models/libri_tts/ljspeech-July-22-2019_10+45AM-ee706b5/", // tts model root folder + "tts_file":"best_model.pth.tar", // tts checkpoint file "tts_config":"config.json", // tts config.json file - "wavernn_lib_path": "/home/erogol/projects/", // Rootpath to wavernn project folder to be important. If this is none, model uses GL for speech synthesis. - "wavernn_path":"/media/erogol/data_ssd/Data/models/wavernn/ljspeech/mold_ljspeech_best_model/", // wavernn model root path - "wavernn_file":"checkpoint_433000.pth.tar", // wavernn checkpoint file name - "wavernn_config":"config.json", // wavernn config file + "tts_speakers": null, // json file listing speaker ids. null if no speaker embedding. + "wavernn_lib_path": "/home/erogol/projects/", // Rootpath to wavernn project folder to be imported. If this is null, model uses GL for speech synthesis. + "wavernn_path":"/media/erogol/data_ssd/Models/wavernn/universal/4910/", // wavernn model root path + "wavernn_file":"best_model_16K.pth.tar", // wavernn checkpoint file name + "wavernn_config":"config_16K.json", // wavernn config file "is_wavernn_batched":true, "port": 5002, "use_cuda": true, diff --git a/server/synthesizer.py b/server/synthesizer.py index 29895b73..bdfd8c6c 100644 --- a/server/synthesizer.py +++ b/server/synthesizer.py @@ -8,6 +8,7 @@ import sys from utils.audio import AudioProcessor from utils.generic_utils import load_config, setup_model from utils.text import phoneme_to_sequence, phonemes, symbols, text_to_sequence, sequence_to_phoneme +from utils.speakers import load_speaker_mapping import re alphabets = r"([A-Za-z])" @@ -44,7 +45,13 @@ class Synthesizer(object): else: self.input_size = len(symbols) self.input_adapter = lambda sen: text_to_sequence(sen, [self.tts_config.text_cleaner]) - self.tts_model = setup_model(self.input_size, c=self.tts_config) #FIXME: missing num_speakers argument to setup_model + # load speakers + if self.config.tts_speakers is not None: + self.tts_speakers = load_speaker_mapping(os.path.join(model_path, self.config.tts_speakers)) + num_speakers = len(self.tts_speakers) + else: + num_speakers = 0 + self.tts_model = setup_model(self.input_size, num_speakers=num_speakers , c=self.tts_config) # load model state if use_cuda: cp = torch.load(self.model_file) @@ -58,6 +65,7 @@ class Synthesizer(object): self.tts_model.decoder.max_decoder_steps = 3000 def load_wavernn(self, lib_path, model_path, model_file, model_config, use_cuda): + # TODO: set a function in wavernn code base for model setup and call it here. sys.path.append(lib_path) # set this if TTS is not installed globally from WaveRNN.models.wavernn import Model wavernn_config = os.path.join(model_path, model_config) @@ -70,8 +78,11 @@ class Synthesizer(object): rnn_dims=512, fc_dims=512, mode=self.wavernn_config.mode, - pad=2, - upsample_factors=self.wavernn_config.upsample_factors, # set this depending on dataset + mulaw=self.wavernn_config.mulaw, + pad=self.wavernn_config.pad, + use_aux_net=self.wavernn_config.use_aux_net, + use_upsample_net = self.wavernn_config.use_upsample_net, + upsample_factors=self.wavernn_config.upsample_factors, feat_dims=80, compute_dims=128, res_out_dims=128, From 713b3df7924850713384f8574aa84a36dcdb70df Mon Sep 17 00:00:00 2001 From: Eren Golge Date: Mon, 22 Jul 2019 15:10:19 +0200 Subject: [PATCH 08/57] prompt data loade time per iteartion --- train.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/train.py b/train.py index 591cbd76..bbc5ff77 100644 --- a/train.py +++ b/train.py @@ -179,6 +179,9 @@ def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler, optimizer_st.step() else: grad_norm_st = 0 + + step_time = time.time() - start_time + epoch_time += step_time if current_step % c.print_step == 0: print( @@ -242,9 +245,7 @@ def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler, tb_logger.tb_train_audios(current_step, {'TrainAudio': train_audio}, c.audio["sample_rate"]) - - step_time = end_time - start_time - epoch_time += step_time + end_time = time.time() avg_postnet_loss /= (num_iter + 1) avg_decoder_loss /= (num_iter + 1) @@ -274,8 +275,6 @@ def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler, tb_logger.tb_train_epoch_stats(current_step, epoch_stats) if c.tb_model_param_stats: tb_logger.tb_model_weights(model, current_step) - - end_time = time.time() return avg_postnet_loss, current_step From 78c3897599d415d9312f57d61fde715feacef595 Mon Sep 17 00:00:00 2001 From: Thomas Werkmeister Date: Tue, 23 Jul 2019 09:47:52 +0200 Subject: [PATCH 09/57] root path speaker matching added data root path in speaker matching for mailabs, this way you don't need to start at the very bottom of the folder hierarchy if you want to explicitly define metafiles. --- datasets/preprocess.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/datasets/preprocess.py b/datasets/preprocess.py index 9dd7a610..a3701c4d 100644 --- a/datasets/preprocess.py +++ b/datasets/preprocess.py @@ -82,14 +82,14 @@ def mailabs(root_path, meta_files=None): # meta_files = [f.strip() for f in meta_files.split(",")] items = [] for idx, csv_file in enumerate(csv_files): + txt_file = os.path.join(root_path, csv_file) # determine speaker based on folder structure... - speaker_name_match = speaker_regex.search(csv_file) + speaker_name_match = speaker_regex.search(txt_file) if speaker_name_match is None: continue speaker_name = speaker_name_match.group("speaker_name") print(" | > {}".format(csv_file)) folder = folders[idx] - txt_file = os.path.join(root_path, csv_file) with open(txt_file, 'r') as ttf: for line in ttf: cols = line.split('|') From 537879482dc31592f51fb8cae79919615091d49a Mon Sep 17 00:00:00 2001 From: Thomas Werkmeister Date: Tue, 23 Jul 2019 13:31:10 +0200 Subject: [PATCH 10/57] fixed config comment strings for attention parameters --- config.json | 12 ++++++------ config_libritts.json | 10 +++++----- config_tacotron.json | 4 ++-- config_tacotron2.json | 10 +++++----- config_tacotron_de.json | 12 ++++++------ config_tacotron_gst.json | 4 ++-- 6 files changed, 26 insertions(+), 26 deletions(-) diff --git a/config.json b/config.json index 807c4c60..24d26e16 100644 --- a/config.json +++ b/config.json @@ -40,12 +40,12 @@ "windowing": false, // Enables attention windowing. Used only in eval mode. "memory_size": 5, // ONLY TACOTRON - memory queue size used to queue network predictions to feed autoregressive connection. Useful if r < 5. "attention_norm": "sigmoid", // softmax or sigmoid. Suggested to use softmax for Tacotron2 and sigmoid for Tacotron. - "prenet_type": "original", // ONLY TACOTRON2 - "original" or "bn". - "prenet_dropout": true, // ONLY TACOTRON2 - enable/disable dropout at prenet. - "use_forward_attn": true, // ONLY TACOTRON2 - if it uses forward attention. In general, it aligns faster. - "forward_attn_mask": false, - "transition_agent": false, // ONLY TACOTRON2 - enable/disable transition agent of forward attention. - "location_attn": false, // ONLY TACOTRON2 - enable_disable location sensitive attention. It is enabled for TACOTRON by default. + "prenet_type": "original", // "original" or "bn". + "prenet_dropout": true, // enable/disable dropout at prenet. + "use_forward_attn": true, // enable/disable forward attention. In general, it aligns faster. + "forward_attn_mask": false, // Apply forward attention mask at inference to prevent bad modes. Try it if your model does not align well. + "transition_agent": false, // enable/disable transition agent of forward attention. + "location_attn": false, // enable_disable location sensitive attention. "loss_masking": true, // enable / disable loss masking against the sequence padding. "enable_eos_bos_chars": false, // enable/disable beginning of sentence and end of sentence chars. "stopnet": true, // Train stopnet predicting the end of synthesis. diff --git a/config_libritts.json b/config_libritts.json index f9a752ec..5579e565 100644 --- a/config_libritts.json +++ b/config_libritts.json @@ -39,13 +39,13 @@ "warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr" "memory_size": 5, // ONLY TACOTRON - memory queue size used to queue network predictions to feed autoregressive connection. Useful if r < 5. "attention_norm": "sigmoid", // softmax or sigmoid. Suggested to use softmax for Tacotron2 and sigmoid for Tacotron. - "prenet_type": "original", // ONLY TACOTRON2 - "original" or "bn". - "prenet_dropout": true, // ONLY TACOTRON2 - enable/disable dropout at prenet. + "prenet_type": "original", // "original" or "bn". + "prenet_dropout": true, // enable/disable dropout at prenet. "windowing": false, // Enables attention windowing. Used only in eval mode. - "use_forward_attn": false, // ONLY TACOTRON2 - if it uses forward attention. In general, it aligns faster. + "use_forward_attn": false, // enable/disable forward attention. In general, it aligns faster. "forward_attn_mask": false, - "transition_agent": false, // ONLY TACOTRON2 - enable/disable transition agent of forward attention. - "location_attn": true, // ONLY TACOTRON2 - enable_disable location sensitive attention. It is enabled for TACOTRON by default. + "transition_agent": false, // enable/disable transition agent of forward attention. + "location_attn": true, // enable_disable location sensitive attention. "loss_masking": true, // enable / disable loss masking against the sequence padding. "enable_eos_bos_chars": false, // enable/disable beginning of sentence and end of sentence chars. "stopnet": true, // Train stopnet predicting the end of synthesis. diff --git a/config_tacotron.json b/config_tacotron.json index 127a4b3d..92ee3909 100644 --- a/config_tacotron.json +++ b/config_tacotron.json @@ -42,10 +42,10 @@ "attention_norm": "sigmoid", // softmax or sigmoid. Suggested to use softmax for Tacotron2 and sigmoid for Tacotron. "prenet_type": "original", // "original" or "bn". "prenet_dropout": true, // enable/disable dropout at prenet. - "use_forward_attn": true, // if it uses forward attention. In general, it aligns faster. + "use_forward_attn": true, // enable/disable forward attention. In general, it aligns faster. "forward_attn_mask": false, // Apply forward attention mask af inference to prevent bad modes. Try it if your model does not align well. "transition_agent": true, // enable/disable transition agent of forward attention. - "location_attn": false, // enable_disable location sensitive attention. It is enabled for TACOTRON by default. + "location_attn": false, // enable_disable location sensitive attention. "loss_masking": true, // enable / disable loss masking against the sequence padding. "enable_eos_bos_chars": false, // enable/disable beginning of sentence and end of sentence chars. "stopnet": true, // Train stopnet predicting the end of synthesis. diff --git a/config_tacotron2.json b/config_tacotron2.json index fd188d20..02b4341b 100644 --- a/config_tacotron2.json +++ b/config_tacotron2.json @@ -39,12 +39,12 @@ "warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr" "memory_size": 5, // ONLY TACOTRON - memory queue size used to queue network predictions to feed autoregressive connection. Useful if r < 5. "attention_norm": "sigmoid", // softmax or sigmoid. Suggested to use softmax for Tacotron2 and sigmoid for Tacotron. - "prenet_type": "original", // ONLY TACOTRON2 - "original" or "bn". - "prenet_dropout": true, // ONLY TACOTRON2 - enable/disable dropout at prenet. - "use_forward_attn": true, // ONLY TACOTRON2 - if it uses forward attention. In general, it aligns faster. + "prenet_type": "original", // "original" or "bn". + "prenet_dropout": true, // enable/disable dropout at prenet. + "use_forward_attn": true, // enable/disable forward attention. In general, it aligns faster. "forward_attn_mask": false, // Apply forward attention mask af inference to prevent bad modes. Try it if your model does not align well. - "transition_agent": false, // ONLY TACOTRON2 - enable/disable transition agent of forward attention. - "location_attn": false, // ONLY TACOTRON2 - enable_disable location sensitive attention. It is enabled for TACOTRON by default. + "transition_agent": false, // enable/disable transition agent of forward attention. + "location_attn": false, // enable_disable location sensitive attention. It is enabled for TACOTRON by default. "loss_masking": true, // enable / disable loss masking against the sequence padding. "enable_eos_bos_chars": false, // enable/disable beginning of sentence and end of sentence chars. "stopnet": true, // Train stopnet predicting the end of synthesis. diff --git a/config_tacotron_de.json b/config_tacotron_de.json index 834bfed4..fc3efbec 100644 --- a/config_tacotron_de.json +++ b/config_tacotron_de.json @@ -40,12 +40,12 @@ "windowing": false, // Enables attention windowing. Used only in eval mode. "memory_size": 5, // ONLY TACOTRON - memory queue size used to queue network predictions to feed autoregressive connection. Useful if r < 5. "attention_norm": "sigmoid", // softmax or sigmoid. Suggested to use softmax for Tacotron2 and sigmoid for Tacotron. - "prenet_type": "original", // ONLY TACOTRON2 - "original" or "bn". - "prenet_dropout": true, // ONLY TACOTRON2 - enable/disable dropout at prenet. - "use_forward_attn": false, // ONLY TACOTRON2 - if it uses forward attention. In general, it aligns faster. - "transition_agent": false, // ONLY TACOTRON2 - enable/disable transition agent of forward attention. - "forward_attn_mask": false, - "location_attn": true, // ONLY TACOTRON2 - enable_disable location sensitive attention. It is enabled for TACOTRON by default. + "prenet_type": "original", // "original" or "bn". + "prenet_dropout": true, // enable/disable dropout at prenet. + "use_forward_attn": false, // enable/disable forward attention. In general, it aligns faster. + "transition_agent": false, // enable/disable transition agent of forward attention. + "forward_attn_mask": false, // Apply forward attention mask at inference to prevent bad modes. Try it if your model does not align well. + "location_attn": true, // enable_disable location sensitive attention. It is enabled for TACOTRON by default. "loss_masking": true, // enable / disable loss masking against the sequence padding. "enable_eos_bos_chars": false, // enable/disable beginning of sentence and end of sentence chars. "stopnet": true, // Train stopnet predicting the end of synthesis. diff --git a/config_tacotron_gst.json b/config_tacotron_gst.json index 98fafa54..5a0f2c09 100644 --- a/config_tacotron_gst.json +++ b/config_tacotron_gst.json @@ -42,8 +42,8 @@ "attention_norm": "sigmoid", // softmax or sigmoid. Suggested to use softmax for Tacotron2 and sigmoid for Tacotron. "prenet_type": "original", // "original" or "bn". "prenet_dropout": true, // enable/disable dropout at prenet. - "use_forward_attn": true, // if it uses forward attention. In general, it aligns faster. - "forward_attn_mask": false, // Apply forward attention mask af inference to prevent bad modes. Try it if your model does not align well. + "use_forward_attn": true, // enable/disable forward attention. In general, it aligns faster. + "forward_attn_mask": false, // Apply forward attention mask at inference to prevent bad modes. Try it if your model does not align well. "transition_agent": false, // enable/disable transition agent of forward attention. "location_attn": false, // enable_disable location sensitive attention. It is enabled for TACOTRON by default. "loss_masking": true, // enable / disable loss masking against the sequence padding. From 4c9fbeeaf81c2df8461fe5f35225ae9ecd0728a9 Mon Sep 17 00:00:00 2001 From: Thomas Werkmeister Date: Tue, 23 Jul 2019 14:23:36 +0200 Subject: [PATCH 11/57] simplified folder variable --- datasets/preprocess.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/datasets/preprocess.py b/datasets/preprocess.py index a3701c4d..a86f8e5d 100644 --- a/datasets/preprocess.py +++ b/datasets/preprocess.py @@ -75,21 +75,19 @@ def mailabs(root_path, meta_files=None): speaker_regex = re.compile("by_book/(male|female)/(?P[^/]+)/") if meta_files is None: csv_files = glob(root_path+"/**/metadata.csv", recursive=True) - folders = [os.path.dirname(f) for f in csv_files] else: csv_files = meta_files - folders = [f.strip().split("by_book")[1][1:] for f in csv_files] # meta_files = [f.strip() for f in meta_files.split(",")] items = [] for idx, csv_file in enumerate(csv_files): txt_file = os.path.join(root_path, csv_file) + folder = os.path.dirname(txt_file) # determine speaker based on folder structure... speaker_name_match = speaker_regex.search(txt_file) if speaker_name_match is None: continue speaker_name = speaker_name_match.group("speaker_name") print(" | > {}".format(csv_file)) - folder = folders[idx] with open(txt_file, 'r') as ttf: for line in ttf: cols = line.split('|') From d4045fd47b1dc5939d6100c9f8a2faf3863fc1fc Mon Sep 17 00:00:00 2001 From: Thomas Werkmeister Date: Tue, 23 Jul 2019 14:30:06 +0200 Subject: [PATCH 12/57] unused var --- datasets/preprocess.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datasets/preprocess.py b/datasets/preprocess.py index a86f8e5d..e5f4e1a2 100644 --- a/datasets/preprocess.py +++ b/datasets/preprocess.py @@ -79,7 +79,7 @@ def mailabs(root_path, meta_files=None): csv_files = meta_files # meta_files = [f.strip() for f in meta_files.split(",")] items = [] - for idx, csv_file in enumerate(csv_files): + for csv_file in csv_files: txt_file = os.path.join(root_path, csv_file) folder = os.path.dirname(txt_file) # determine speaker based on folder structure... From 98edb7a4f8d7e99ab7dcdca036d27762f27e4dd9 Mon Sep 17 00:00:00 2001 From: Thomas Werkmeister Date: Tue, 23 Jul 2019 18:38:09 +0200 Subject: [PATCH 13/57] renamed attention_rnn to query_rnn --- layers/common_layers.py | 24 +++++++------- layers/tacotron.py | 57 ++++++++++++++++---------------- layers/tacotron2.py | 72 ++++++++++++++++++++--------------------- 3 files changed, 79 insertions(+), 74 deletions(-) diff --git a/layers/common_layers.py b/layers/common_layers.py index 2edf0dab..77ce4f4a 100644 --- a/layers/common_layers.py +++ b/layers/common_layers.py @@ -108,19 +108,19 @@ class LocationLayer(nn.Module): class Attention(nn.Module): # Pylint gets confused by PyTorch conventions here #pylint: disable=attribute-defined-outside-init - def __init__(self, attention_rnn_dim, embedding_dim, attention_dim, + def __init__(self, query_dim, embedding_dim, attention_dim, location_attention, attention_location_n_filters, attention_location_kernel_size, windowing, norm, forward_attn, trans_agent, forward_attn_mask): super(Attention, self).__init__() self.query_layer = Linear( - attention_rnn_dim, attention_dim, bias=False, init_gain='tanh') + query_dim, attention_dim, bias=False, init_gain='tanh') self.inputs_layer = Linear( embedding_dim, attention_dim, bias=False, init_gain='tanh') self.v = Linear(attention_dim, 1, bias=True) if trans_agent: self.ta = nn.Linear( - attention_rnn_dim + embedding_dim, 1, bias=True) + query_dim + embedding_dim, 1, bias=True) if location_attention: self.location_layer = LocationLayer( attention_dim, @@ -203,11 +203,12 @@ class Attention(nn.Module): def apply_forward_attention(self, inputs, alignment, query): # forward attention - prev_alpha = F.pad(self.alpha[:, :-1].clone(), - (1, 0, 0, 0)).to(inputs.device) + prev_alpha = F.pad(self.alpha[:, :-1].clone().to(inputs.device), + (1, 0, 0, 0)) # compute transition potentials - alpha = (((1 - self.u) * self.alpha.clone().to(inputs.device) + - self.u * prev_alpha) + 1e-8) * alignment + alpha = ((1 - self.u) * self.alpha + + self.u * prev_alpha + + 1e-8) * alignment # force incremental alignment if not self.training and self.forward_attn_mask: _, n = prev_alpha.max(1) @@ -231,19 +232,20 @@ class Attention(nn.Module): self.u = torch.sigmoid(self.ta(ta_input)) return context, self.alpha - def forward(self, attention_hidden_state, inputs, processed_inputs, mask): + def forward(self, query, inputs, processed_inputs, mask): if self.location_attention: attention, processed_query = self.get_location_attention( - attention_hidden_state, processed_inputs) + query, processed_inputs) else: attention, processed_query = self.get_attention( - attention_hidden_state, processed_inputs) + query, processed_inputs) # apply masking if mask is not None: attention.data.masked_fill_(1 - mask, self._mask_value) # apply windowing - only in eval mode if not self.training and self.windowing: attention = self.apply_windowing(attention, inputs) + # normalize attention values if self.norm == "softmax": alignment = torch.softmax(attention, dim=-1) @@ -258,7 +260,7 @@ class Attention(nn.Module): # apply forward attention if enabled if self.forward_attn: context, self.attention_weights = self.apply_forward_attention( - inputs, alignment, attention_hidden_state) + inputs, alignment, query) else: context = torch.bmm(alignment.unsqueeze(1), inputs) context = context.squeeze(1) diff --git a/layers/tacotron.py b/layers/tacotron.py index b71ddbc3..068ae7cc 100644 --- a/layers/tacotron.py +++ b/layers/tacotron.py @@ -283,6 +283,7 @@ class Decoder(nn.Module): self.memory_size = memory_size if memory_size > 0 else r self.memory_dim = memory_dim self.separate_stopnet = separate_stopnet + self.query_dim = 256 # memory -> |Prenet| -> processed_memory self.prenet = Prenet( memory_dim * self.memory_size, @@ -290,18 +291,18 @@ class Decoder(nn.Module): prenet_dropout, out_features=[256, 128]) # processed_inputs, processed_memory -> |Attention| -> Attention, attention, RNN_State - self.attention_rnn = nn.GRUCell(in_features + 128, 256) - self.attention_layer = Attention(attention_rnn_dim=256, - embedding_dim=in_features, - attention_dim=128, - location_attention=location_attn, - attention_location_n_filters=32, - attention_location_kernel_size=31, - windowing=attn_windowing, - norm=attn_norm, - forward_attn=forward_attn, - trans_agent=trans_agent, - forward_attn_mask=forward_attn_mask) + self.query_rnn = nn.GRUCell(in_features + 128, self.query_dim) + self.attention = Attention(query_dim=self.query_dim, + embedding_dim=in_features, + attention_dim=128, + location_attention=location_attn, + attention_location_n_filters=32, + attention_location_kernel_size=31, + windowing=attn_windowing, + norm=attn_norm, + forward_attn=forward_attn, + trans_agent=trans_agent, + forward_attn_mask=forward_attn_mask) # (processed_memory | attention context) -> |Linear| -> decoder_RNN_input self.project_to_decoder_in = nn.Linear(256 + in_features, 256) # decoder_RNN_input -> |RNN| -> RNN_state @@ -310,7 +311,7 @@ class Decoder(nn.Module): # RNN_state -> |Linear| -> mel_spec self.proj_to_mel = nn.Linear(256, memory_dim * r) # learn init values instead of zero init. - self.attention_rnn_init = nn.Embedding(1, 256) + self.query_rnn_init = nn.Embedding(1, 256) self.memory_init = nn.Embedding(1, self.memory_size * memory_dim) self.decoder_rnn_inits = nn.Embedding(2, 256) self.stopnet = StopNet(256 + memory_dim * r) @@ -347,18 +348,18 @@ class Decoder(nn.Module): self.memory_input = self.memory_init(inputs.data.new_zeros(B).long()) # decoder states - self.attention_rnn_hidden = self.attention_rnn_init( + self.query = self.query_rnn_init( inputs.data.new_zeros(B).long()) self.decoder_rnn_hiddens = [ self.decoder_rnn_inits(inputs.data.new_tensor([idx] * B).long()) for idx in range(len(self.decoder_rnns)) ] - self.current_context_vec = inputs.data.new(B, self.in_features).zero_() + self.context_vec = inputs.data.new(B, self.in_features).zero_() # attention states self.attention = inputs.data.new(B, T).zero_() self.attention_cum = inputs.data.new(B, T).zero_() # cache attention inputs - self.processed_inputs = self.attention_layer.inputs_layer(inputs) + self.processed_inputs = self.attention.inputs_layer(inputs) def _parse_outputs(self, outputs, attentions, stop_tokens): # Back to batch first @@ -370,13 +371,15 @@ class Decoder(nn.Module): def decode(self, inputs, mask=None): # Prenet processed_memory = self.prenet(self.memory_input) + # Attention RNN - self.attention_rnn_hidden = self.attention_rnn(torch.cat((processed_memory, self.current_context_vec), -1), self.attention_rnn_hidden) - self.current_context_vec = self.attention_layer(self.attention_rnn_hidden, inputs, self.processed_inputs, mask) - # Concat RNN output and attention context vector + self.query = self.query_rnn(torch.cat((processed_memory, self.context_vec), -1), self.query) + self.context_vec = self.attention(self.query, inputs, self.processed_inputs, mask) + + # Concat query and attention context vector decoder_input = self.project_to_decoder_in( - torch.cat((self.attention_rnn_hidden, self.current_context_vec), - -1)) + torch.cat((self.query, self.context_vec), -1)) + # Pass through the decoder RNNs for idx in range(len(self.decoder_rnns)): self.decoder_rnn_hiddens[idx] = self.decoder_rnns[idx]( @@ -384,18 +387,18 @@ class Decoder(nn.Module): # Residual connection decoder_input = self.decoder_rnn_hiddens[idx] + decoder_input decoder_output = decoder_input - del decoder_input + # predict mel vectors from decoder vectors output = self.proj_to_mel(decoder_output) output = torch.sigmoid(output) + # predict stop token stopnet_input = torch.cat([decoder_output, output], -1) - del decoder_output if self.separate_stopnet: stop_token = self.stopnet(stopnet_input.detach()) else: stop_token = self.stopnet(stopnet_input) - return output, stop_token, self.attention_layer.attention_weights + return output, stop_token, self.attention.attention_weights def _update_memory_queue(self, new_memory): if self.memory_size > 0 and new_memory.shape[-1] < self.memory_size: @@ -427,7 +430,7 @@ class Decoder(nn.Module): stop_tokens = [] t = 0 self._init_states(inputs) - self.attention_layer.init_states(inputs) + self.attention.init_states(inputs) while len(outputs) < memory.size(0): if t > 0: new_memory = memory[t - 1] @@ -453,8 +456,8 @@ class Decoder(nn.Module): stop_tokens = [] t = 0 self._init_states(inputs) - self.attention_layer.init_win_idx() - self.attention_layer.init_states(inputs) + self.attention.init_win_idx() + self.attention.init_states(inputs) while True: if t > 0: new_memory = outputs[-1] diff --git a/layers/tacotron2.py b/layers/tacotron2.py index 802f158e..ba52abe2 100644 --- a/layers/tacotron2.py +++ b/layers/tacotron2.py @@ -104,7 +104,7 @@ class Decoder(nn.Module): self.r = r self.encoder_embedding_dim = in_features self.separate_stopnet = separate_stopnet - self.attention_rnn_dim = 1024 + self.query_dim = 1024 self.decoder_rnn_dim = 1024 self.prenet_dim = 256 self.max_decoder_steps = 1000 @@ -116,22 +116,22 @@ class Decoder(nn.Module): prenet_dropout, [self.prenet_dim, self.prenet_dim], bias=False) - self.attention_rnn = nn.LSTMCell(self.prenet_dim + in_features, - self.attention_rnn_dim) + self.query_rnn = nn.LSTMCell(self.prenet_dim + in_features, + self.query_dim) - self.attention_layer = Attention(attention_rnn_dim=self.attention_rnn_dim, - embedding_dim=in_features, - attention_dim=128, - location_attention=location_attn, - attention_location_n_filters=32, - attention_location_kernel_size=31, - windowing=attn_win, - norm=attn_norm, - forward_attn=forward_attn, - trans_agent=trans_agent, - forward_attn_mask=forward_attn_mask) + self.attention = Attention(query_dim=self.query_dim, + embedding_dim=in_features, + attention_dim=128, + location_attention=location_attn, + attention_location_n_filters=32, + attention_location_kernel_size=31, + windowing=attn_win, + norm=attn_norm, + forward_attn=forward_attn, + trans_agent=trans_agent, + forward_attn_mask=forward_attn_mask) - self.decoder_rnn = nn.LSTMCell(self.attention_rnn_dim + in_features, + self.decoder_rnn = nn.LSTMCell(self.query_dim + in_features, self.decoder_rnn_dim, 1) self.linear_projection = Linear(self.decoder_rnn_dim + in_features, @@ -145,7 +145,7 @@ class Decoder(nn.Module): bias=True, init_gain='sigmoid')) - self.attention_rnn_init = nn.Embedding(1, self.attention_rnn_dim) + self.query_rnn_init = nn.Embedding(1, self.query_dim) self.go_frame_init = nn.Embedding(1, self.mel_channels * r) self.decoder_rnn_inits = nn.Embedding(1, self.decoder_rnn_dim) self.memory_truncated = None @@ -160,10 +160,10 @@ class Decoder(nn.Module): # T = inputs.size(1) if not keep_states: - self.attention_hidden = self.attention_rnn_init( + self.query = self.query_rnn_init( inputs.data.new_zeros(B).long()) - self.attention_cell = Variable( - inputs.data.new(B, self.attention_rnn_dim).zero_()) + self.query_rnn_cell_state = Variable( + inputs.data.new(B, self.query_dim).zero_()) self.decoder_hidden = self.decoder_rnn_inits( inputs.data.new_zeros(B).long()) @@ -174,7 +174,7 @@ class Decoder(nn.Module): inputs.data.new(B, self.encoder_embedding_dim).zero_()) self.inputs = inputs - self.processed_inputs = self.attention_layer.inputs_layer(inputs) + self.processed_inputs = self.attention.inputs_layer(inputs) self.mask = mask def _reshape_memory(self, memories): @@ -193,18 +193,18 @@ class Decoder(nn.Module): return outputs, stop_tokens, alignments def decode(self, memory): - cell_input = torch.cat((memory, self.context), -1) - self.attention_hidden, self.attention_cell = self.attention_rnn( - cell_input, (self.attention_hidden, self.attention_cell)) - self.attention_hidden = F.dropout( - self.attention_hidden, self.p_attention_dropout, self.training) - self.attention_cell = F.dropout( - self.attention_cell, self.p_attention_dropout, self.training) + query_input = torch.cat((memory, self.context), -1) + self.query, self.query_rnn_cell_state = self.query_rnn( + query_input, (self.query, self.query_rnn_cell_state)) + self.query = F.dropout( + self.query, self.p_attention_dropout, self.training) + self.query_rnn_cell_state = F.dropout( + self.query_rnn_cell_state, self.p_attention_dropout, self.training) - self.context = self.attention_layer(self.attention_hidden, self.inputs, - self.processed_inputs, self.mask) + self.context = self.attention(self.query, self.inputs, + self.processed_inputs, self.mask) - memory = torch.cat((self.attention_hidden, self.context), -1) + memory = torch.cat((self.query, self.context), -1) self.decoder_hidden, self.decoder_cell = self.decoder_rnn( memory, (self.decoder_hidden, self.decoder_cell)) self.decoder_hidden = F.dropout(self.decoder_hidden, @@ -223,7 +223,7 @@ class Decoder(nn.Module): stop_token = self.stopnet(stopnet_input.detach()) else: stop_token = self.stopnet(stopnet_input) - return decoder_output, stop_token, self.attention_layer.attention_weights + return decoder_output, stop_token, self.attention.attention_weights def forward(self, inputs, memories, mask): memory = self.get_go_frame(inputs).unsqueeze(0) @@ -232,7 +232,7 @@ class Decoder(nn.Module): memories = self.prenet(memories) self._init_states(inputs, mask=mask) - self.attention_layer.init_states(inputs) + self.attention.init_states(inputs) outputs, stop_tokens, alignments = [], [], [] while len(outputs) < memories.size(0) - 1: @@ -251,8 +251,8 @@ class Decoder(nn.Module): memory = self.get_go_frame(inputs) self._init_states(inputs, mask=None) - self.attention_layer.init_win_idx() - self.attention_layer.init_states(inputs) + self.attention.init_win_idx() + self.attention.init_states(inputs) outputs, stop_tokens, alignments, t = [], [], [], 0 stop_flags = [True, False, False] @@ -295,8 +295,8 @@ class Decoder(nn.Module): else: self._init_states(inputs, mask=None, keep_states=True) - self.attention_layer.init_win_idx() - self.attention_layer.init_states(inputs) + self.attention.init_win_idx() + self.attention.init_states(inputs) outputs, stop_tokens, alignments, t = [], [], [], 0 stop_flags = [True, False, False] stop_count = 0 From 82db35530f327afbaea99f249a23369faba30513 Mon Sep 17 00:00:00 2001 From: Thomas Werkmeister Date: Tue, 23 Jul 2019 19:33:56 +0200 Subject: [PATCH 14/57] unused var --- layers/common_layers.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/layers/common_layers.py b/layers/common_layers.py index 77ce4f4a..bc353be3 100644 --- a/layers/common_layers.py +++ b/layers/common_layers.py @@ -234,10 +234,10 @@ class Attention(nn.Module): def forward(self, query, inputs, processed_inputs, mask): if self.location_attention: - attention, processed_query = self.get_location_attention( + attention, _ = self.get_location_attention( query, processed_inputs) else: - attention, processed_query = self.get_attention( + attention, _ = self.get_attention( query, processed_inputs) # apply masking if mask is not None: From fb7c5b1996532ccc5b214b9b6e462c574b037dbd Mon Sep 17 00:00:00 2001 From: Thomas Werkmeister Date: Tue, 23 Jul 2019 20:02:31 +0200 Subject: [PATCH 15/57] unused instance vars --- layers/tacotron.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/layers/tacotron.py b/layers/tacotron.py index 068ae7cc..31d6cd84 100644 --- a/layers/tacotron.py +++ b/layers/tacotron.py @@ -355,9 +355,6 @@ class Decoder(nn.Module): for idx in range(len(self.decoder_rnns)) ] self.context_vec = inputs.data.new(B, self.in_features).zero_() - # attention states - self.attention = inputs.data.new(B, T).zero_() - self.attention_cum = inputs.data.new(B, T).zero_() # cache attention inputs self.processed_inputs = self.attention.inputs_layer(inputs) From a6118564d578f314dfa787a80cc288dba2228dfa Mon Sep 17 00:00:00 2001 From: Thomas Werkmeister Date: Wed, 24 Jul 2019 11:46:34 +0200 Subject: [PATCH 16/57] renamed query_rnn back to attention_rnn --- layers/tacotron.py | 12 +++++++----- layers/tacotron2.py | 18 +++++++++--------- 2 files changed, 16 insertions(+), 14 deletions(-) diff --git a/layers/tacotron.py b/layers/tacotron.py index 31d6cd84..40225fa5 100644 --- a/layers/tacotron.py +++ b/layers/tacotron.py @@ -291,7 +291,9 @@ class Decoder(nn.Module): prenet_dropout, out_features=[256, 128]) # processed_inputs, processed_memory -> |Attention| -> Attention, attention, RNN_State - self.query_rnn = nn.GRUCell(in_features + 128, self.query_dim) + # attention_rnn generates queries for the attention mechanism + self.attention_rnn = nn.GRUCell(in_features + 128, self.query_dim) + self.attention = Attention(query_dim=self.query_dim, embedding_dim=in_features, attention_dim=128, @@ -311,7 +313,7 @@ class Decoder(nn.Module): # RNN_state -> |Linear| -> mel_spec self.proj_to_mel = nn.Linear(256, memory_dim * r) # learn init values instead of zero init. - self.query_rnn_init = nn.Embedding(1, 256) + self.attention_rnn_init = nn.Embedding(1, 256) self.memory_init = nn.Embedding(1, self.memory_size * memory_dim) self.decoder_rnn_inits = nn.Embedding(2, 256) self.stopnet = StopNet(256 + memory_dim * r) @@ -348,7 +350,7 @@ class Decoder(nn.Module): self.memory_input = self.memory_init(inputs.data.new_zeros(B).long()) # decoder states - self.query = self.query_rnn_init( + self.query = self.attention_rnn_init( inputs.data.new_zeros(B).long()) self.decoder_rnn_hiddens = [ self.decoder_rnn_inits(inputs.data.new_tensor([idx] * B).long()) @@ -369,8 +371,8 @@ class Decoder(nn.Module): # Prenet processed_memory = self.prenet(self.memory_input) - # Attention RNN - self.query = self.query_rnn(torch.cat((processed_memory, self.context_vec), -1), self.query) + # Attention + self.query = self.attention_rnn(torch.cat((processed_memory, self.context_vec), -1), self.query) self.context_vec = self.attention(self.query, inputs, self.processed_inputs, mask) # Concat query and attention context vector diff --git a/layers/tacotron2.py b/layers/tacotron2.py index ba52abe2..358d1807 100644 --- a/layers/tacotron2.py +++ b/layers/tacotron2.py @@ -116,8 +116,8 @@ class Decoder(nn.Module): prenet_dropout, [self.prenet_dim, self.prenet_dim], bias=False) - self.query_rnn = nn.LSTMCell(self.prenet_dim + in_features, - self.query_dim) + self.attention_rnn = nn.LSTMCell(self.prenet_dim + in_features, + self.query_dim) self.attention = Attention(query_dim=self.query_dim, embedding_dim=in_features, @@ -145,7 +145,7 @@ class Decoder(nn.Module): bias=True, init_gain='sigmoid')) - self.query_rnn_init = nn.Embedding(1, self.query_dim) + self.attention_rnn_init = nn.Embedding(1, self.query_dim) self.go_frame_init = nn.Embedding(1, self.mel_channels * r) self.decoder_rnn_inits = nn.Embedding(1, self.decoder_rnn_dim) self.memory_truncated = None @@ -160,9 +160,9 @@ class Decoder(nn.Module): # T = inputs.size(1) if not keep_states: - self.query = self.query_rnn_init( + self.query = self.attention_rnn_init( inputs.data.new_zeros(B).long()) - self.query_rnn_cell_state = Variable( + self.attention_rnn_cell_state = Variable( inputs.data.new(B, self.query_dim).zero_()) self.decoder_hidden = self.decoder_rnn_inits( @@ -194,12 +194,12 @@ class Decoder(nn.Module): def decode(self, memory): query_input = torch.cat((memory, self.context), -1) - self.query, self.query_rnn_cell_state = self.query_rnn( - query_input, (self.query, self.query_rnn_cell_state)) + self.query, self.attention_rnn_cell_state = self.attention_rnn( + query_input, (self.query, self.attention_rnn_cell_state)) self.query = F.dropout( self.query, self.p_attention_dropout, self.training) - self.query_rnn_cell_state = F.dropout( - self.query_rnn_cell_state, self.p_attention_dropout, self.training) + self.attention_rnn_cell_state = F.dropout( + self.attention_rnn_cell_state, self.p_attention_dropout, self.training) self.context = self.attention(self.query, self.inputs, self.processed_inputs, self.mask) From 40f56f9b000bb03384ebe883c03380b260a6a205 Mon Sep 17 00:00:00 2001 From: Thomas Werkmeister Date: Wed, 24 Jul 2019 11:47:06 +0200 Subject: [PATCH 17/57] simplified code for fwd attn --- layers/common_layers.py | 42 ++++++++++++++++++++--------------------- 1 file changed, 20 insertions(+), 22 deletions(-) diff --git a/layers/common_layers.py b/layers/common_layers.py index bc353be3..bfdd6775 100644 --- a/layers/common_layers.py +++ b/layers/common_layers.py @@ -201,17 +201,17 @@ class Attention(nn.Module): self.win_idx = torch.argmax(attention, 1).long()[0].item() return attention - def apply_forward_attention(self, inputs, alignment, query): + def apply_forward_attention(self, alignment): # forward attention - prev_alpha = F.pad(self.alpha[:, :-1].clone().to(inputs.device), - (1, 0, 0, 0)) + fwd_shifted_alpha = F.pad(self.alpha[:, :-1].clone().to(alignment.device), + (1, 0, 0, 0)) # compute transition potentials alpha = ((1 - self.u) * self.alpha - + self.u * prev_alpha + + self.u * fwd_shifted_alpha + 1e-8) * alignment # force incremental alignment if not self.training and self.forward_attn_mask: - _, n = prev_alpha.max(1) + _, n = fwd_shifted_alpha.max(1) val, n2 = alpha.max(1) for b in range(alignment.shape[0]): alpha[b, n[b] + 3:] = 0 @@ -221,16 +221,9 @@ class Attention(nn.Module): alpha[b, (n[b] - 2 )] = 0.01 * val[b] # smoothing factor for the prev step - # compute attention weights - self.alpha = alpha / alpha.sum(dim=1).unsqueeze(1) - # compute context - context = torch.bmm(self.alpha.unsqueeze(1), inputs) - context = context.squeeze(1) - # compute transition agent - if self.trans_agent: - ta_input = torch.cat([context, query.squeeze(1)], dim=-1) - self.u = torch.sigmoid(self.ta(ta_input)) - return context, self.alpha + # renormalize attention weights + alpha = alpha / alpha.sum(dim=1, keepdim=True) + return alpha def forward(self, query, inputs, processed_inputs, mask): if self.location_attention: @@ -254,15 +247,20 @@ class Attention(nn.Module): attention).sum( dim=1, keepdim=True) else: - raise RuntimeError("Unknown value for attention norm type") + raise ValueError("Unknown value for attention norm type") if self.location_attention: self.update_location_attention(alignment) # apply forward attention if enabled if self.forward_attn: - context, self.attention_weights = self.apply_forward_attention( - inputs, alignment, query) - else: - context = torch.bmm(alignment.unsqueeze(1), inputs) - context = context.squeeze(1) - self.attention_weights = alignment + alignment = self.apply_forward_attention(alignment) + self.alpha = alignment + + context = torch.bmm(alignment.unsqueeze(1), inputs) + context = context.squeeze(1) + self.attention_weights = alignment + + # compute transition agent + if self.forward_attn and self.trans_agent: + ta_input = torch.cat([context, query.squeeze(1)], dim=-1) + self.u = torch.sigmoid(self.ta(ta_input)) return context From f3dac0aa840a893f7222b6444d5bc7f5f40d623d Mon Sep 17 00:00:00 2001 From: Thomas Werkmeister Date: Wed, 24 Jul 2019 11:49:07 +0200 Subject: [PATCH 18/57] updating location attn after calculating fwd attention --- layers/common_layers.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/layers/common_layers.py b/layers/common_layers.py index bfdd6775..a652b8a6 100644 --- a/layers/common_layers.py +++ b/layers/common_layers.py @@ -248,13 +248,14 @@ class Attention(nn.Module): dim=1, keepdim=True) else: raise ValueError("Unknown value for attention norm type") - if self.location_attention: - self.update_location_attention(alignment) # apply forward attention if enabled if self.forward_attn: alignment = self.apply_forward_attention(alignment) self.alpha = alignment + if self.location_attention: + self.update_location_attention(alignment) + context = torch.bmm(alignment.unsqueeze(1), inputs) context = context.squeeze(1) self.attention_weights = alignment From 4a23354d3c6c9941eb401ebfc4f4b5f154fb51af Mon Sep 17 00:00:00 2001 From: Thomas Werkmeister Date: Wed, 24 Jul 2019 12:17:08 +0200 Subject: [PATCH 19/57] stylewav for testing inference --- config_tacotron_gst.json | 3 ++- train.py | 4 +++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/config_tacotron_gst.json b/config_tacotron_gst.json index 98fafa54..3c872730 100644 --- a/config_tacotron_gst.json +++ b/config_tacotron_gst.json @@ -77,6 +77,7 @@ "use_phonemes": true, // use phonemes instead of raw characters. It is suggested for better pronounciation. "phoneme_language": "en-us", // depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages "text_cleaner": "phoneme_cleaners", - "use_speaker_embedding": false // whether to use additional embeddings for separate speakers + "use_speaker_embedding": false, // whether to use additional embeddings for separate speakers + "style_wav_for_test": null // path to wav for styling the inference tests when using GST } \ No newline at end of file diff --git a/train.py b/train.py index 815a0a32..c893cb36 100644 --- a/train.py +++ b/train.py @@ -409,11 +409,13 @@ def evaluate(model, criterion, criterion_st, ap, current_step, epoch): test_figures = {} print(" | > Synthesizing test sentences") speaker_id = 0 if c.use_speaker_embedding else None + style_wav = c.get("style_wav_for_test") for idx, test_sentence in enumerate(test_sentences): try: wav, alignment, decoder_output, postnet_output, stop_tokens = synthesis( model, test_sentence, c, use_cuda, ap, - speaker_id=speaker_id) + speaker_id=speaker_id, + style_wav=style_wav) file_path = os.path.join(AUDIO_PATH, str(current_step)) os.makedirs(file_path, exist_ok=True) file_path = os.path.join(file_path, From ab42396fbfbd647f8f7f67f660250d9f75219643 Mon Sep 17 00:00:00 2001 From: Thomas Werkmeister Date: Thu, 25 Jul 2019 13:04:41 +0200 Subject: [PATCH 20/57] undo loc attn after fwd attn --- layers/common_layers.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/layers/common_layers.py b/layers/common_layers.py index a652b8a6..0a7216ef 100644 --- a/layers/common_layers.py +++ b/layers/common_layers.py @@ -248,14 +248,15 @@ class Attention(nn.Module): dim=1, keepdim=True) else: raise ValueError("Unknown value for attention norm type") + + if self.location_attention: + self.update_location_attention(alignment) + # apply forward attention if enabled if self.forward_attn: alignment = self.apply_forward_attention(alignment) self.alpha = alignment - if self.location_attention: - self.update_location_attention(alignment) - context = torch.bmm(alignment.unsqueeze(1), inputs) context = context.squeeze(1) self.attention_weights = alignment From 215eb014ca67b6945eaffaa6f8c0c5cfc9453536 Mon Sep 17 00:00:00 2001 From: Thomas Werkmeister Date: Fri, 26 Jul 2019 13:40:58 +0200 Subject: [PATCH 21/57] enforce list append semantic; prevents numpy add --- datasets/TTSDataset.py | 1 + utils/text/__init__.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/datasets/TTSDataset.py b/datasets/TTSDataset.py index ecf8e9ea..0305da7f 100644 --- a/datasets/TTSDataset.py +++ b/datasets/TTSDataset.py @@ -102,6 +102,7 @@ class MyDataset(Dataset): cache_path) if self.enable_eos_bos: phonemes = pad_with_eos_bos(phonemes) + phonemes = np.asarray(phonemes, dtype=np.int32) return phonemes diff --git a/utils/text/__init__.py b/utils/text/__init__.py index 332163d2..77cc23a5 100644 --- a/utils/text/__init__.py +++ b/utils/text/__init__.py @@ -47,7 +47,7 @@ def text2phone(text, language): def pad_with_eos_bos(phoneme_sequence): - return [_PHONEMES_TO_ID[_bos]] + phoneme_sequence + [_PHONEMES_TO_ID[_eos]] + return [_PHONEMES_TO_ID[_bos]] + list(phoneme_sequence) + [_PHONEMES_TO_ID[_eos]] def phoneme_to_sequence(text, cleaner_names, language, enable_eos_bos=False): From bea9701d93cd338f6fd3d5be8ff3f59ab06f779d Mon Sep 17 00:00:00 2001 From: Eren Golge Date: Tue, 13 Aug 2019 11:53:56 +0200 Subject: [PATCH 22/57] change the computation of the global step --- train.py | 53 ++++++++++++++++++++++++++--------------------------- 1 file changed, 26 insertions(+), 27 deletions(-) diff --git a/train.py b/train.py index bbc5ff77..565713ed 100644 --- a/train.py +++ b/train.py @@ -82,7 +82,7 @@ def setup_loader(ap, is_val=False, verbose=False): def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler, - ap, epoch): + ap, global_step, epoch): data_loader = setup_loader(ap, is_val=False, verbose=(epoch == 0)) if c.use_speaker_embedding: speaker_mapping = load_speaker_mapping(OUT_PATH) @@ -123,8 +123,7 @@ def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler, stop_targets.size(1) // c.r, -1) stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze(2) - current_step = num_iter + args.restore_step + \ - epoch * len(data_loader) + 1 + global_step += 1 # setup lr if c.lr_decay: @@ -183,13 +182,13 @@ def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler, step_time = time.time() - start_time epoch_time += step_time - if current_step % c.print_step == 0: + if global_step % c.print_step == 0: print( " | > Step:{}/{} GlobalStep:{} TotalLoss:{:.5f} PostnetLoss:{:.5f} " "DecoderLoss:{:.5f} StopLoss:{:.5f} GradNorm:{:.5f} " "GradNormST:{:.5f} AvgTextLen:{:.1f} AvgSpecLen:{:.1f} StepTime:{:.2f} " "LoaderTime:{:.2f} LR:{:.6f}".format( - num_iter, batch_n_iter, current_step, loss.item(), + num_iter, batch_n_iter, global_step, loss.item(), postnet_loss.item(), decoder_loss.item(), stop_loss.item(), grad_norm, grad_norm_st, avg_text_length, avg_spec_length, step_time, loader_time, current_lr), @@ -216,13 +215,13 @@ def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler, "grad_norm": grad_norm, "grad_norm_st": grad_norm_st, "step_time": step_time} - tb_logger.tb_train_iter_stats(current_step, iter_stats) + tb_logger.tb_train_iter_stats(global_step, iter_stats) - if current_step % c.save_step == 0: + if global_step % c.save_step == 0: if c.checkpoint: # save model save_checkpoint(model, optimizer, optimizer_st, - postnet_loss.item(), OUT_PATH, current_step, + postnet_loss.item(), OUT_PATH, global_step, epoch) # Diagnostic visualizations @@ -235,14 +234,14 @@ def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler, "ground_truth": plot_spectrogram(gt_spec, ap), "alignment": plot_alignment(align_img) } - tb_logger.tb_train_figures(current_step, figures) + tb_logger.tb_train_figures(global_step, figures) # Sample audio if c.model in ["Tacotron", "TacotronGST"]: train_audio = ap.inv_spectrogram(const_spec.T) else: train_audio = ap.inv_mel_spectrogram(const_spec.T) - tb_logger.tb_train_audios(current_step, + tb_logger.tb_train_audios(global_step, {'TrainAudio': train_audio}, c.audio["sample_rate"]) end_time = time.time() @@ -259,7 +258,7 @@ def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler, " | > EPOCH END -- GlobalStep:{} AvgTotalLoss:{:.5f} " "AvgPostnetLoss:{:.5f} AvgDecoderLoss:{:.5f} " "AvgStopLoss:{:.5f} EpochTime:{:.2f} " - "AvgStepTime:{:.2f}".format(current_step, avg_total_loss, + "AvgStepTime:{:.2f}".format(global_step, avg_total_loss, avg_postnet_loss, avg_decoder_loss, avg_stop_loss, epoch_time, avg_step_time, avg_loader_time), @@ -272,13 +271,13 @@ def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler, "loss_decoder": avg_decoder_loss, "stop_loss": avg_stop_loss, "epoch_time": epoch_time} - tb_logger.tb_train_epoch_stats(current_step, epoch_stats) + tb_logger.tb_train_epoch_stats(global_step, epoch_stats) if c.tb_model_param_stats: - tb_logger.tb_model_weights(model, current_step) - return avg_postnet_loss, current_step + tb_logger.tb_model_weights(model, global_step) + return avg_postnet_loss, global_step -def evaluate(model, criterion, criterion_st, ap, current_step, epoch): +def evaluate(model, criterion, criterion_st, ap, global_step, epoch): data_loader = setup_loader(ap, is_val=True) if c.use_speaker_embedding: speaker_mapping = load_speaker_mapping(OUT_PATH) @@ -391,14 +390,14 @@ def evaluate(model, criterion, criterion_st, ap, current_step, epoch): "ground_truth": plot_spectrogram(gt_spec, ap), "alignment": plot_alignment(align_img) } - tb_logger.tb_eval_figures(current_step, eval_figures) + tb_logger.tb_eval_figures(global_step, eval_figures) # Sample audio if c.model in ["Tacotron", "TacotronGST"]: eval_audio = ap.inv_spectrogram(const_spec.T) else: eval_audio = ap.inv_mel_spectrogram(const_spec.T) - tb_logger.tb_eval_audios(current_step, {"ValAudio": eval_audio}, c.audio["sample_rate"]) + tb_logger.tb_eval_audios(global_step, {"ValAudio": eval_audio}, c.audio["sample_rate"]) # compute average losses avg_postnet_loss /= (num_iter + 1) @@ -409,7 +408,7 @@ def evaluate(model, criterion, criterion_st, ap, current_step, epoch): epoch_stats = {"loss_postnet": avg_postnet_loss, "loss_decoder": avg_decoder_loss, "stop_loss": avg_stop_loss} - tb_logger.tb_eval_stats(current_step, epoch_stats) + tb_logger.tb_eval_stats(global_step, epoch_stats) if args.rank == 0 and epoch > c.test_delay_epochs: # test sentences @@ -422,7 +421,7 @@ def evaluate(model, criterion, criterion_st, ap, current_step, epoch): wav, alignment, decoder_output, postnet_output, stop_tokens = synthesis( model, test_sentence, c, use_cuda, ap, speaker_id=speaker_id) - file_path = os.path.join(AUDIO_PATH, str(current_step)) + file_path = os.path.join(AUDIO_PATH, str(global_step)) os.makedirs(file_path, exist_ok=True) file_path = os.path.join(file_path, "TestSentence_{}.wav".format(idx)) @@ -433,8 +432,8 @@ def evaluate(model, criterion, criterion_st, ap, current_step, epoch): except: print(" !! Error creating Test Sentence -", idx) traceback.print_exc() - tb_logger.tb_test_audios(current_step, test_audios, c.audio['sample_rate']) - tb_logger.tb_test_figures(current_step, test_figures) + tb_logger.tb_test_audios(global_step, test_audios, c.audio['sample_rate']) + tb_logger.tb_test_figures(global_step, test_figures) return avg_postnet_loss @@ -532,19 +531,19 @@ def main(args): #pylint: disable=redefined-outer-name if 'best_loss' not in locals(): best_loss = float('inf') - current_step = 0 + global_step = args.restore_step for epoch in range(0, c.epochs): # set gradual training if c.gradual_training is not None: - r, c.batch_size = gradual_training_scheduler(current_step, c) + r, c.batch_size = gradual_training_scheduler(global_step, c) c.r = r model.decoder._set_r(r) print(" > Number of outputs per iteration:", model.decoder.r) - train_loss, current_step = train(model, criterion, criterion_st, + train_loss, global_step = train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler, - ap, epoch) - val_loss = evaluate(model, criterion, criterion_st, ap, current_step, epoch) + ap, global_step, epoch) + val_loss = evaluate(model, criterion, criterion_st, ap, global_step, epoch) print( " | > Training Loss: {:.5f} Validation Loss: {:.5f}".format( train_loss, val_loss), @@ -553,7 +552,7 @@ def main(args): #pylint: disable=redefined-outer-name if c.run_eval: target_loss = val_loss best_loss = save_best_model(model, optimizer, target_loss, best_loss, - OUT_PATH, current_step, epoch) + OUT_PATH, global_step, epoch) if __name__ == '__main__': From 02304a873f074d1e29c6dd9cd5918439c0cf0df9 Mon Sep 17 00:00:00 2001 From: Eren Golge Date: Tue, 13 Aug 2019 12:14:08 +0200 Subject: [PATCH 23/57] compute update --- .compute | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.compute b/.compute index 002f9136..2dbc7bb2 100644 --- a/.compute +++ b/.compute @@ -4,7 +4,7 @@ yes | apt-get install ffmpeg yes | apt-get install espeak yes | apt-get install tmux yes | apt-get install zsh -pip3 install https://download.pytorch.org/whl/cu100/torch-1.1.0-cp37-cp37m-linux_x86_64.whl +# pip3 install https://download.pytorch.org/whl/cu100/torch-1.1.0-cp37-cp37m-linux_x86_64.whl # wget https://www.dropbox.com/s/m8waow6b3ydpf6h/MozillaDataset.tar.gz?dl=0 -O /data/rw/home/mozilla.tar wget https://www.dropbox.com/s/wqn5v3wkktw9lmo/install.sh?dl=0 -O install.sh sudo sh install.sh @@ -13,4 +13,4 @@ python3 setup.py develop python3 distribute.py --config_path config.json --data_path /data/ro/shared/data/keithito/LJSpeech-1.1/ # cp -R ${USER_DIR}/Mozilla_22050 ../tmp/ # python3 distribute.py --config_path config_tacotron_gst.json --data_path ../tmp/Mozilla_22050/ -while true; do sleep 1000000; done +# while true; do sleep 1000000; done From 3cbbd8d6e02afdf722b6f46919658689759c9487 Mon Sep 17 00:00:00 2001 From: Eren Golge Date: Tue, 13 Aug 2019 12:14:21 +0200 Subject: [PATCH 24/57] config update --- config.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config.json b/config.json index ae29c287..ee83f660 100644 --- a/config.json +++ b/config.json @@ -50,7 +50,7 @@ "enable_eos_bos_chars": false, // enable/disable beginning of sentence and end of sentence chars. "stopnet": true, // Train stopnet predicting the end of synthesis. "separate_stopnet": true, // Train stopnet seperately if 'stopnet==true'. It prevents stopnet loss to influence the rest of the model. It causes a better model, but it trains SLOWER. - "tb_model_param_stats": true, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging. + "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging. "batch_size": 32, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'. "eval_batch_size":16, From 64f2b95c31493e7d2df0902083e224a98294c403 Mon Sep 17 00:00:00 2001 From: Eren Golge Date: Tue, 13 Aug 2019 12:14:34 +0200 Subject: [PATCH 25/57] update regarding torch 1.2 --- layers/common_layers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/layers/common_layers.py b/layers/common_layers.py index 2edf0dab..4269728f 100644 --- a/layers/common_layers.py +++ b/layers/common_layers.py @@ -240,7 +240,7 @@ class Attention(nn.Module): attention_hidden_state, processed_inputs) # apply masking if mask is not None: - attention.data.masked_fill_(1 - mask, self._mask_value) + attention.data.masked_fill_(torch.bitwise_not(mask), self._mask_value) # apply windowing - only in eval mode if not self.training and self.windowing: attention = self.apply_windowing(attention, inputs) From 446cd6fa0656c002e1f5f2a94b611ab0770ac9b5 Mon Sep 17 00:00:00 2001 From: Eren Golge Date: Wed, 14 Aug 2019 12:32:09 +0200 Subject: [PATCH 26/57] small logging bug fix --- train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/train.py b/train.py index 565713ed..233c7576 100644 --- a/train.py +++ b/train.py @@ -258,7 +258,7 @@ def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler, " | > EPOCH END -- GlobalStep:{} AvgTotalLoss:{:.5f} " "AvgPostnetLoss:{:.5f} AvgDecoderLoss:{:.5f} " "AvgStopLoss:{:.5f} EpochTime:{:.2f} " - "AvgStepTime:{:.2f}".format(global_step, avg_total_loss, + "AvgStepTime:{:.2f} AvgLoaderTime:{:.2f}".format(global_step, avg_total_loss, avg_postnet_loss, avg_decoder_loss, avg_stop_loss, epoch_time, avg_step_time, avg_loader_time), From 5acd9e82bdf8c4e62eb11e3cd088e79b3ddd2ef8 Mon Sep 17 00:00:00 2001 From: Eren Golge Date: Fri, 16 Aug 2019 13:11:51 +0200 Subject: [PATCH 27/57] save model r value for checkpoints --- utils/generic_utils.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/utils/generic_utils.py b/utils/generic_utils.py index 8a64dbae..1fa956ff 100644 --- a/utils/generic_utils.py +++ b/utils/generic_utils.py @@ -121,7 +121,8 @@ def save_checkpoint(model, optimizer, optimizer_st, model_loss, out_path, 'step': current_step, 'epoch': epoch, 'linear_loss': model_loss, - 'date': datetime.date.today().strftime("%B %d, %Y") + 'date': datetime.date.today().strftime("%B %d, %Y"), + 'r': model.decoder.r } torch.save(state, checkpoint_path) @@ -136,7 +137,8 @@ def save_best_model(model, optimizer, model_loss, best_loss, out_path, 'step': current_step, 'epoch': epoch, 'linear_loss': model_loss, - 'date': datetime.date.today().strftime("%B %d, %Y") + 'date': datetime.date.today().strftime("%B %d, %Y"), + 'r': model.decoder.r } best_loss = model_loss bestmodel_path = 'best_model.pth.tar' From 728b97da3a5307ae7240a14de12cfd4022676606 Mon Sep 17 00:00:00 2001 From: Eren Golge Date: Fri, 16 Aug 2019 14:22:35 +0200 Subject: [PATCH 28/57] formatting for pylint --- config.json | 3 ++- train.py | 14 +++++++------- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/config.json b/config.json index ee83f660..58e9b92b 100644 --- a/config.json +++ b/config.json @@ -78,6 +78,7 @@ "use_phonemes": true, // use phonemes instead of raw characters. It is suggested for better pronounciation. "phoneme_language": "en-us", // depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages "text_cleaner": "phoneme_cleaners", - "use_speaker_embedding": false + "use_speaker_embedding": false, // use speaker embedding to enable multi-speaker learning. + "style_wav_for_test": null // path to style wav file to be used in TacotronGST inference. } diff --git a/train.py b/train.py index e28d7aa2..16660a11 100644 --- a/train.py +++ b/train.py @@ -190,7 +190,7 @@ def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler, "LoaderTime:{:.2f} LR:{:.6f}".format( num_iter, batch_n_iter, global_step, loss.item(), postnet_loss.item(), decoder_loss.item(), stop_loss.item(), - grad_norm, grad_norm_st, avg_text_length, avg_spec_length, step_time, + grad_norm, grad_norm_st, avg_text_length, avg_spec_length, step_time, loader_time, current_lr), flush=True) @@ -259,9 +259,9 @@ def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler, "AvgPostnetLoss:{:.5f} AvgDecoderLoss:{:.5f} " "AvgStopLoss:{:.5f} EpochTime:{:.2f} " "AvgStepTime:{:.2f} AvgLoaderTime:{:.2f}".format(global_step, avg_total_loss, - avg_postnet_loss, avg_decoder_loss, - avg_stop_loss, epoch_time, avg_step_time, - avg_loader_time), + avg_postnet_loss, avg_decoder_loss, + avg_stop_loss, epoch_time, avg_step_time, + avg_loader_time), flush=True) # Plot Epoch Stats @@ -539,12 +539,12 @@ def main(args): #pylint: disable=redefined-outer-name if c.gradual_training is not None: r, c.batch_size = gradual_training_scheduler(global_step, c) c.r = r - model.decoder._set_r(r) + model.decoder.set_r(r) print(" > Number of outputs per iteration:", model.decoder.r) train_loss, global_step = train(model, criterion, criterion_st, - optimizer, optimizer_st, scheduler, - ap, global_step, epoch) + optimizer, optimizer_st, scheduler, + ap, global_step, epoch) val_loss = evaluate(model, criterion, criterion_st, ap, global_step, epoch) print( " | > Training Loss: {:.5f} Validation Loss: {:.5f}".format( From 8fde0ac00ec34197ab1a94539bf835a8c4be6ebc Mon Sep 17 00:00:00 2001 From: Eren Golge Date: Fri, 16 Aug 2019 14:23:26 +0200 Subject: [PATCH 29/57] pylint --- layers/tacotron.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/layers/tacotron.py b/layers/tacotron.py index d8d0e57a..09c7e923 100644 --- a/layers/tacotron.py +++ b/layers/tacotron.py @@ -397,9 +397,10 @@ class Decoder(nn.Module): if self.memory_size > self.r: # memory queue size is larger than number of frames per decoder iter self.memory_input = torch.cat([ - new_memory, self.memory_input[:, :(self.memory_size - self.r) * self.memory_dim].clone() + new_memory, self.memory_input[:, :( + self.memory_size - self.r) * self.memory_dim].clone() ], - dim=-1) + dim=-1) else: # memory queue size smaller than number of frames per decoder iter self.memory_input = new_memory[:, :self.memory_size * self.memory_dim] From 23d9f8a8bc0e58892058c102c2f06987f33643d7 Mon Sep 17 00:00:00 2001 From: Eren Golge Date: Fri, 16 Aug 2019 14:28:11 +0200 Subject: [PATCH 30/57] readme and config.json update --- README.md | 4 ++-- config.json | 12 ++++++------ 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 068c1762..39e507e1 100644 --- a/README.md +++ b/README.md @@ -10,9 +10,9 @@ TTS includes two different model implementations which are based on [Tacotron](h If you are new, you can also find [here](http://www.erogol.com/text-speech-deep-learning-architectures/) a brief post about TTS architectures and their comparisons. ## TTS Performance -

+

-[Details...](https://github.com/mozilla/TTS/issues/186) +[Details...](https://github.com/mozilla/TTS/wiki/Mean-Opinion-Score-Results) ## Requirements and Installation Highly recommended to use [miniconda](https://conda.io/miniconda.html) for easier installation. diff --git a/config.json b/config.json index 58e9b92b..2089c577 100644 --- a/config.json +++ b/config.json @@ -39,13 +39,13 @@ "warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr" "memory_size": -1, // ONLY TACOTRON - size of the memory queue used fro storing last decoder predictions for auto-regression. If < 0, memory queue is disabled and decoder only uses the last prediction frame. "attention_norm": "sigmoid", // softmax or sigmoid. Suggested to use softmax for Tacotron2 and sigmoid for Tacotron. - "prenet_type": "original", // ONLY TACOTRON2 - "original" or "bn". - "prenet_dropout": true, // ONLY TACOTRON2 - enable/disable dropout at prenet. + "prenet_type": "original", // "original" or "bn". + "prenet_dropout": true, // enable/disable dropout at prenet. "windowing": false, // Enables attention windowing. Used only in eval mode. - "use_forward_attn": false, // ONLY TACOTRON2 - if it uses forward attention. In general, it aligns faster. + "use_forward_attn": false, // if it uses forward attention. In general, it aligns faster. "forward_attn_mask": false, - "transition_agent": false, // ONLY TACOTRON2 - enable/disable transition agent of forward attention. - "location_attn": true, // ONLY TACOTRON2 - enable_disable location sensitive attention. It is enabled for TACOTRON by default. + "transition_agent": false, // enable/disable transition agent of forward attention. + "location_attn": true, // enable_disable location sensitive attention. It is enabled for TACOTRON by default. "loss_masking": true, // enable / disable loss masking against the sequence padding. "enable_eos_bos_chars": false, // enable/disable beginning of sentence and end of sentence chars. "stopnet": true, // Train stopnet predicting the end of synthesis. @@ -55,7 +55,7 @@ "batch_size": 32, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'. "eval_batch_size":16, "r": 7, // Number of decoder frames to predict per iteration. Set the initial values if gradual training is enabled. - "gradual_training": [[0, 7, 32], [10000, 5, 32], [50000, 3, 32], [130000, 2, 16], [290000, 1, 8]], // set gradual training steps [first_step, r, batch_size]. If it is null, gradual training is disabled. + "gradual_training": [[0, 7, 32], [10000, 5, 32], [50000, 3, 32], [130000, 2, 16], [290000, 1, 8]], // ONLY TACOTRON - set gradual training steps [first_step, r, batch_size]. If it is null, gradual training is disabled. "wd": 0.000001, // Weight decay weight. "checkpoint": true, // If true, it saves checkpoints per "save_step" "save_step": 10000, // Number of training steps expected to save traning stats and checkpoints. From 5629292bde3fd5bf7c81d242274bc2c8aa07c6e5 Mon Sep 17 00:00:00 2001 From: Eren Golge Date: Fri, 16 Aug 2019 15:08:04 +0200 Subject: [PATCH 31/57] bug fixes --- layers/tacotron.py | 8 ++++---- tests/symbols_tests.py | 4 +++- tests/test_loader.py | 8 +++++--- utils/text/symbols.py | 2 +- 4 files changed, 13 insertions(+), 9 deletions(-) diff --git a/layers/tacotron.py b/layers/tacotron.py index 09c7e923..b5b6e132 100644 --- a/layers/tacotron.py +++ b/layers/tacotron.py @@ -364,13 +364,13 @@ class Decoder(nn.Module): processed_memory = self.prenet(self.memory_input) # Attention RNN self.attention_rnn_hidden = self.attention_rnn( - torch.cat((processed_memory, self.current_context_vec), -1), + torch.cat((processed_memory, self.context_vec), -1), self.attention_rnn_hidden) - self.context_vec = self.attention_layer( + self.context_vec = self.attention( self.attention_rnn_hidden, inputs, self.processed_inputs, mask) # Concat RNN output and attention context vector decoder_input = self.project_to_decoder_in( - torch.cat((self.query, self.context_vec), -1)) + torch.cat((self.attention_rnn_hidden, self.context_vec), -1)) # Pass through the decoder RNNs for idx in range(len(self.decoder_rnns)): @@ -390,7 +390,7 @@ class Decoder(nn.Module): else: stop_token = self.stopnet(stopnet_input) output = output[:, : self.r * self.memory_dim] - return output, stop_token, self.attention_layer.attention_weights + return output, stop_token, self.attention.attention_weights def _update_memory_input(self, new_memory): if self.use_memory_queue: diff --git a/tests/symbols_tests.py b/tests/symbols_tests.py index 68c909c5..9bec0f18 100644 --- a/tests/symbols_tests.py +++ b/tests/symbols_tests.py @@ -1,7 +1,9 @@ import unittest from utils.text import phonemes +from collections import Counter class SymbolsTest(unittest.TestCase): def test_uniqueness(self): - assert sorted(phonemes) == sorted(list(set(phonemes))) + assert sorted(phonemes) == sorted(list(set(phonemes))), " {} vs {} ".format(len(phonemes), len(set(phonemes))) + \ No newline at end of file diff --git a/tests/test_loader.py b/tests/test_loader.py index 92d6f7e2..4051c463 100644 --- a/tests/test_loader.py +++ b/tests/test_loader.py @@ -1,6 +1,7 @@ import os import unittest import shutil +import torch from torch.utils.data import DataLoader from utils.generic_utils import load_config @@ -130,10 +131,11 @@ class TestTTSDataset(unittest.TestCase): # check mel_spec consistency wav = self.ap.load_wav(item_idx[0]) mel = self.ap.melspectrogram(wav) - mel_dl = mel_input[0].cpu().numpy() - assert (abs(mel.T).astype("float32") + mel = torch.FloatTensor(mel) + mel_dl = mel_input[0] + assert (abs(mel.T) - abs(mel_dl[:-1]) - ).sum() == 0 + ).sum() == 0, (abs(mel.T)- abs(mel_dl[:-1])).sum() # check mel-spec correctness mel_spec = mel_input[0].cpu().numpy() diff --git a/utils/text/symbols.py b/utils/text/symbols.py index 9b7a36b4..ee6fd2cf 100644 --- a/utils/text/symbols.py +++ b/utils/text/symbols.py @@ -18,7 +18,7 @@ _vowels = 'iyɨʉɯuɪʏʊeøɘəɵɤoɛœɜɞʌɔæɐaɶɑɒᵻ' _non_pulmonic_consonants = 'ʘɓǀɗǃʄǂɠǁʛ' _pulmonic_consonants = 'pbtdʈɖcɟkɡqɢʔɴŋɲɳnɱmʙrʀⱱɾɽɸβfvθðszʃʒʂʐçʝxɣχʁħʕhɦɬɮʋɹɻjɰlɭʎʟ' _suprasegmentals = 'ˈˌːˑ' -_other_symbols = 'ʍwɥʜʢʡɕʑɺɧ ' +_other_symbols = 'ʍwɥʜʢʡɕʑɺɧ' _diacrilics = 'ɚ˞ɫ' _phonemes = sorted(list(_vowels + _non_pulmonic_consonants + _pulmonic_consonants + _suprasegmentals + _other_symbols + _diacrilics)) From e1b3c41af5cbe21963aaae35cf76b1e43449881c Mon Sep 17 00:00:00 2001 From: Eren Golge Date: Fri, 16 Aug 2019 15:40:58 +0200 Subject: [PATCH 32/57] lossen the assert condition due to a probable bug in Pytorch 1.2 --- tests/test_loader.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/tests/test_loader.py b/tests/test_loader.py index 4051c463..dd23e530 100644 --- a/tests/test_loader.py +++ b/tests/test_loader.py @@ -2,6 +2,7 @@ import os import unittest import shutil import torch +import numpy as np from torch.utils.data import DataLoader from utils.generic_utils import load_config @@ -129,13 +130,16 @@ class TestTTSDataset(unittest.TestCase): item_idx = data[7] # check mel_spec consistency - wav = self.ap.load_wav(item_idx[0]) - mel = self.ap.melspectrogram(wav) - mel = torch.FloatTensor(mel) + wav = np.asarray(self.ap.load_wav(item_idx[0]), dtype=np.float32) + mel = self.ap.melspectrogram(wav).astype('float32') + mel = torch.FloatTensor(mel).contiguous() mel_dl = mel_input[0] - assert (abs(mel.T) + # NOTE: Below needs to check == 0 but due to an unknown reason + # there is a slight difference between two matrices. + # TODO: Check this assert cond more in detail. + assert abs((abs(mel.T) - abs(mel_dl[:-1]) - ).sum() == 0, (abs(mel.T)- abs(mel_dl[:-1])).sum() + ).sum()) < 1e-5, (abs(mel.T)- abs(mel_dl[:-1])).sum() # check mel-spec correctness mel_spec = mel_input[0].cpu().numpy() From c637aa04a2602438d145aa5745ac8f3bf057399b Mon Sep 17 00:00:00 2001 From: Eren Golge Date: Fri, 16 Aug 2019 15:49:12 +0200 Subject: [PATCH 33/57] pylint --- layers/tacotron.py | 3 +-- tests/symbols_tests.py | 3 +-- tests/test_loader.py | 4 ++-- 3 files changed, 4 insertions(+), 6 deletions(-) diff --git a/layers/tacotron.py b/layers/tacotron.py index b5b6e132..329fdb11 100644 --- a/layers/tacotron.py +++ b/layers/tacotron.py @@ -399,8 +399,7 @@ class Decoder(nn.Module): self.memory_input = torch.cat([ new_memory, self.memory_input[:, :( self.memory_size - self.r) * self.memory_dim].clone() - ], - dim=-1) + ], dim=-1) else: # memory queue size smaller than number of frames per decoder iter self.memory_input = new_memory[:, :self.memory_size * self.memory_dim] diff --git a/tests/symbols_tests.py b/tests/symbols_tests.py index 9bec0f18..d35e887e 100644 --- a/tests/symbols_tests.py +++ b/tests/symbols_tests.py @@ -1,9 +1,8 @@ import unittest from utils.text import phonemes -from collections import Counter class SymbolsTest(unittest.TestCase): - def test_uniqueness(self): + def test_uniqueness(self): #pylint: disable=no-self-use assert sorted(phonemes) == sorted(list(set(phonemes))), " {} vs {} ".format(len(phonemes), len(set(phonemes))) \ No newline at end of file diff --git a/tests/test_loader.py b/tests/test_loader.py index dd23e530..9d151820 100644 --- a/tests/test_loader.py +++ b/tests/test_loader.py @@ -138,8 +138,8 @@ class TestTTSDataset(unittest.TestCase): # there is a slight difference between two matrices. # TODO: Check this assert cond more in detail. assert abs((abs(mel.T) - - abs(mel_dl[:-1]) - ).sum()) < 1e-5, (abs(mel.T)- abs(mel_dl[:-1])).sum() + - abs(mel_dl[:-1]) + ).sum()) < 1e-5, (abs(mel.T) - abs(mel_dl[:-1])).sum() # check mel-spec correctness mel_spec = mel_input[0].cpu().numpy() From 72ad58d893fd74ac98bf21269c766a02ccb44bf7 Mon Sep 17 00:00:00 2001 From: Eren Golge Date: Mon, 19 Aug 2019 16:24:28 +0200 Subject: [PATCH 34/57] change the bitwise for masking and small fixes --- datasets/TTSDataset.py | 1 - layers/common_layers.py | 2 +- layers/tacotron.py | 2 +- 3 files changed, 2 insertions(+), 3 deletions(-) diff --git a/datasets/TTSDataset.py b/datasets/TTSDataset.py index 0305da7f..dea6cb8c 100644 --- a/datasets/TTSDataset.py +++ b/datasets/TTSDataset.py @@ -103,7 +103,6 @@ class MyDataset(Dataset): if self.enable_eos_bos: phonemes = pad_with_eos_bos(phonemes) phonemes = np.asarray(phonemes, dtype=np.int32) - return phonemes def load_data(self, idx): diff --git a/layers/common_layers.py b/layers/common_layers.py index 98fc70ae..d5836a9f 100644 --- a/layers/common_layers.py +++ b/layers/common_layers.py @@ -234,7 +234,7 @@ class Attention(nn.Module): query, processed_inputs) # apply masking if mask is not None: - attention.data.masked_fill_(torch.bitwise_not(mask), self._mask_value) + attention.data.masked_fill_(~mask, self._mask_value) # apply windowing - only in eval mode if not self.training and self.windowing: attention = self.apply_windowing(attention, inputs) diff --git a/layers/tacotron.py b/layers/tacotron.py index 329fdb11..788e5230 100644 --- a/layers/tacotron.py +++ b/layers/tacotron.py @@ -315,7 +315,7 @@ class Decoder(nn.Module): # learn init values instead of zero init. self.stopnet = StopNet(256 + memory_dim * self.r_init) - def _set_r(self, new_r): + def set_r(self, new_r): self.r = new_r def _reshape_memory(self, memory): From d99623e28542be2f2b4db9f9bd1aaf45affea337 Mon Sep 17 00:00:00 2001 From: Eren Golge Date: Mon, 19 Aug 2019 16:27:53 +0200 Subject: [PATCH 35/57] bug fixes for logging --- train.py | 7 +++++-- utils/text/__init__.py | 2 +- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/train.py b/train.py index 16660a11..5c45181b 100644 --- a/train.py +++ b/train.py @@ -94,7 +94,10 @@ def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler, avg_step_time = 0 avg_loader_time = 0 print("\n > Epoch {}/{}".format(epoch, c.epochs), flush=True) - batch_n_iter = int(len(data_loader.dataset) / (c.batch_size * num_gpus)) + if use_cuda: + batch_n_iter = int(len(data_loader.dataset) / (c.batch_size * num_gpus)) + else: + batch_n_iter = int(len(data_loader.dataset) / c.batch_size) end_time = time.time() for num_iter, data in enumerate(data_loader): start_time = time.time() @@ -423,7 +426,7 @@ def evaluate(model, criterion, criterion_st, ap, global_step, epoch): model, test_sentence, c, use_cuda, ap, speaker_id=speaker_id, style_wav=style_wav) - file_path = os.path.join(AUDIO_PATH, str(current_step)) + file_path = os.path.join(AUDIO_PATH, str(global_step)) os.makedirs(file_path, exist_ok=True) file_path = os.path.join(file_path, "TestSentence_{}.wav".format(idx)) diff --git a/utils/text/__init__.py b/utils/text/__init__.py index 77cc23a5..226e2e8d 100644 --- a/utils/text/__init__.py +++ b/utils/text/__init__.py @@ -17,7 +17,7 @@ _ID_TO_PHONEMES = {i: s for i, s in enumerate(phonemes)} # Regular expression matching text enclosed in curly braces: _CURLY_RE = re.compile(r'(.*?)\{(.+?)\}(.*)') -# Regular expression matchinf punctuations, ignoring empty space +# Regular expression matching punctuations, ignoring empty space PHONEME_PUNCTUATION_PATTERN = r'['+_phoneme_punctuations+']+' From 5ff8544d6a2b94eb1bf6b6a14279a78b38555990 Mon Sep 17 00:00:00 2001 From: Eren Golge Date: Tue, 20 Aug 2019 13:22:04 +0200 Subject: [PATCH 36/57] force frame_length to be a multiple hop_length --- utils/audio.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/utils/audio.py b/utils/audio.py index d4d9d67f..794520af 100644 --- a/utils/audio.py +++ b/utils/audio.py @@ -113,8 +113,10 @@ class AudioProcessor(object): def _stft_parameters(self, ): """Compute necessary stft parameters with given time values""" n_fft = (self.num_freq - 1) * 2 + factor = self.frame_length_ms / self.frame_shift_ms + assert (factor).is_integer(), " [!] frame_shift_ms should divide frame_length_ms" hop_length = int(self.frame_shift_ms / 1000.0 * self.sample_rate) - win_length = int(self.frame_length_ms / 1000.0 * self.sample_rate) + win_length = int(hop_length * factor) return n_fft, hop_length, win_length def _amp_to_db(self, x): From ff01490e48488fde2d103db63c547e279106c569 Mon Sep 17 00:00:00 2001 From: Eren Golge Date: Wed, 21 Aug 2019 16:09:09 +0200 Subject: [PATCH 37/57] server bug fix --- server/synthesizer.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/server/synthesizer.py b/server/synthesizer.py index bdfd8c6c..f5abdf50 100644 --- a/server/synthesizer.py +++ b/server/synthesizer.py @@ -145,9 +145,11 @@ class Synthesizer(object): print(sen) seq = np.array(self.input_adapter(sen)) - text_hat = sequence_to_phoneme(seq) - print(text_hat) - + + if self.use_phonemes: + text_hat = sequence_to_phoneme(seq) + print(text_hat) + chars_var = torch.from_numpy(seq).unsqueeze(0).long() if self.use_cuda: From 1a1db23df12bf128cbf2a5cbabc10057ae1046f6 Mon Sep 17 00:00:00 2001 From: Eren Golge Date: Thu, 22 Aug 2019 00:34:46 +0200 Subject: [PATCH 38/57] radam --- config.json | 2 +- train.py | 5 +- utils/radam.py | 207 +++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 211 insertions(+), 3 deletions(-) create mode 100644 utils/radam.py diff --git a/config.json b/config.json index 2089c577..4d56c3dc 100644 --- a/config.json +++ b/config.json @@ -71,7 +71,7 @@ "dataset": "ljspeech", // DATASET-RELATED: one of TTS.dataset.preprocessors depending on your target dataset. Use "tts_cache" for pre-computed dataset by extract_features.py "min_seq_len": 6, // DATASET-RELATED: minimum text length to use in training "max_seq_len": 150, // DATASET-RELATED: maximum text length - "output_path": "/media/erogol/data_ssd/Models/libri_tts/", // DATASET-RELATED: output path for all training outputs. + "output_path": "../keep/", // DATASET-RELATED: output path for all training outputs. "num_loader_workers": 4, // number of training data loader processes. Don't set it too big. 4-8 are good values. "num_val_loader_workers": 4, // number of evaluation data loader processes. "phoneme_cache_path": "mozilla_us_phonemes", // phoneme computation is slow, therefore, it caches results in the given folder. diff --git a/train.py b/train.py index 5c45181b..2324e879 100644 --- a/train.py +++ b/train.py @@ -27,6 +27,7 @@ from utils.speakers import load_speaker_mapping, save_speaker_mapping, \ from utils.synthesis import synthesis from utils.text.symbols import phonemes, symbols from utils.visual import plot_alignment, plot_spectrogram +from utils.radam import RAdam from datasets.preprocess import get_preprocessor_by_name torch.backends.cudnn.enabled = True @@ -476,9 +477,9 @@ def main(args): #pylint: disable=redefined-outer-name print(" | > Num output units : {}".format(ap.num_freq), flush=True) - optimizer = optim.Adam(model.parameters(), lr=c.lr, weight_decay=0) + optimizer = RAdam(model.parameters(), lr=c.lr, weight_decay=0) if c.stopnet and c.separate_stopnet: - optimizer_st = optim.Adam( + optimizer_st = RAdam( model.decoder.stopnet.parameters(), lr=c.lr, weight_decay=0) else: optimizer_st = None diff --git a/utils/radam.py b/utils/radam.py new file mode 100644 index 00000000..d3a65dc5 --- /dev/null +++ b/utils/radam.py @@ -0,0 +1,207 @@ +import math +import torch +from torch.optim.optimizer import Optimizer, required + +class RAdam(Optimizer): + + def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0): + defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay) + self.buffer = [[None, None, None] for ind in range(10)] + super(RAdam, self).__init__(params, defaults) + + def __setstate__(self, state): + super(RAdam, self).__setstate__(state) + + def step(self, closure=None): + + loss = None + if closure is not None: + loss = closure() + + for group in self.param_groups: + + for p in group['params']: + if p.grad is None: + continue + grad = p.grad.data.float() + if grad.is_sparse: + raise RuntimeError('RAdam does not support sparse gradients') + + p_data_fp32 = p.data.float() + + state = self.state[p] + + if len(state) == 0: + state['step'] = 0 + state['exp_avg'] = torch.zeros_like(p_data_fp32) + state['exp_avg_sq'] = torch.zeros_like(p_data_fp32) + else: + state['exp_avg'] = state['exp_avg'].type_as(p_data_fp32) + state['exp_avg_sq'] = state['exp_avg_sq'].type_as(p_data_fp32) + + exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] + beta1, beta2 = group['betas'] + + exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) + exp_avg.mul_(beta1).add_(1 - beta1, grad) + + state['step'] += 1 + buffered = self.buffer[int(state['step'] % 10)] + if state['step'] == buffered[0]: + N_sma, step_size = buffered[1], buffered[2] + else: + buffered[0] = state['step'] + beta2_t = beta2 ** state['step'] + N_sma_max = 2 / (1 - beta2) - 1 + N_sma = N_sma_max - 2 * state['step'] * beta2_t / (1 - beta2_t) + buffered[1] = N_sma + + # more conservative since it's an approximated value + if N_sma >= 5: + step_size = group['lr'] * math.sqrt((1 - beta2_t) * (N_sma - 4) / (N_sma_max - 4) * (N_sma - 2) / N_sma * N_sma_max / (N_sma_max - 2)) / (1 - beta1 ** state['step']) + else: + step_size = group['lr'] / (1 - beta1 ** state['step']) + buffered[2] = step_size + + if group['weight_decay'] != 0: + p_data_fp32.add_(-group['weight_decay'] * group['lr'], p_data_fp32) + + # more conservative since it's an approximated value + if N_sma >= 5: + denom = exp_avg_sq.sqrt().add_(group['eps']) + p_data_fp32.addcdiv_(-step_size, exp_avg, denom) + else: + p_data_fp32.add_(-step_size, exp_avg) + + p.data.copy_(p_data_fp32) + + return loss + +class PlainRAdam(Optimizer): + + def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0): + defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay) + + super(PlainRAdam, self).__init__(params, defaults) + + def __setstate__(self, state): + super(PlainRAdam, self).__setstate__(state) + + def step(self, closure=None): + + loss = None + if closure is not None: + loss = closure() + + for group in self.param_groups: + + for p in group['params']: + if p.grad is None: + continue + grad = p.grad.data.float() + if grad.is_sparse: + raise RuntimeError('RAdam does not support sparse gradients') + + p_data_fp32 = p.data.float() + + state = self.state[p] + + if len(state) == 0: + state['step'] = 0 + state['exp_avg'] = torch.zeros_like(p_data_fp32) + state['exp_avg_sq'] = torch.zeros_like(p_data_fp32) + else: + state['exp_avg'] = state['exp_avg'].type_as(p_data_fp32) + state['exp_avg_sq'] = state['exp_avg_sq'].type_as(p_data_fp32) + + exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] + beta1, beta2 = group['betas'] + + exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) + exp_avg.mul_(beta1).add_(1 - beta1, grad) + + state['step'] += 1 + beta2_t = beta2 ** state['step'] + N_sma_max = 2 / (1 - beta2) - 1 + N_sma = N_sma_max - 2 * state['step'] * beta2_t / (1 - beta2_t) + + if group['weight_decay'] != 0: + p_data_fp32.add_(-group['weight_decay'] * group['lr'], p_data_fp32) + + # more conservative since it's an approximated value + if N_sma >= 5: + step_size = group['lr'] * math.sqrt((1 - beta2_t) * (N_sma - 4) / (N_sma_max - 4) * (N_sma - 2) / N_sma * N_sma_max / (N_sma_max - 2)) / (1 - beta1 ** state['step']) + denom = exp_avg_sq.sqrt().add_(group['eps']) + p_data_fp32.addcdiv_(-step_size, exp_avg, denom) + else: + step_size = group['lr'] / (1 - beta1 ** state['step']) + p_data_fp32.add_(-step_size, exp_avg) + + p.data.copy_(p_data_fp32) + + return loss + + +class AdamW(Optimizer): + + def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0, warmup = 0): + defaults = dict(lr=lr, betas=betas, eps=eps, + weight_decay=weight_decay, warmup = warmup) + super(AdamW, self).__init__(params, defaults) + + def __setstate__(self, state): + super(AdamW, self).__setstate__(state) + + def step(self, closure=None): + loss = None + if closure is not None: + loss = closure() + + for group in self.param_groups: + + for p in group['params']: + if p.grad is None: + continue + grad = p.grad.data.float() + if grad.is_sparse: + raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead') + + p_data_fp32 = p.data.float() + + state = self.state[p] + + if len(state) == 0: + state['step'] = 0 + state['exp_avg'] = torch.zeros_like(p_data_fp32) + state['exp_avg_sq'] = torch.zeros_like(p_data_fp32) + else: + state['exp_avg'] = state['exp_avg'].type_as(p_data_fp32) + state['exp_avg_sq'] = state['exp_avg_sq'].type_as(p_data_fp32) + + exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] + beta1, beta2 = group['betas'] + + state['step'] += 1 + + exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) + exp_avg.mul_(beta1).add_(1 - beta1, grad) + + denom = exp_avg_sq.sqrt().add_(group['eps']) + bias_correction1 = 1 - beta1 ** state['step'] + bias_correction2 = 1 - beta2 ** state['step'] + + if group['warmup'] > state['step']: + scheduled_lr = 1e-8 + state['step'] * group['lr'] / group['warmup'] + else: + scheduled_lr = group['lr'] + + step_size = group['lr'] * math.sqrt(bias_correction2) / bias_correction1 + + if group['weight_decay'] != 0: + p_data_fp32.add_(-group['weight_decay'] * scheduled_lr, p_data_fp32) + + p_data_fp32.addcdiv_(-step_size, exp_avg, denom) + + p.data.copy_(p_data_fp32) + + return loss From 549dbba128860ed31ab35514ee0b171dbcd2643c Mon Sep 17 00:00:00 2001 From: Eren Golge Date: Thu, 22 Aug 2019 00:36:17 +0200 Subject: [PATCH 39/57] linter --- server/synthesizer.py | 1 - 1 file changed, 1 deletion(-) diff --git a/server/synthesizer.py b/server/synthesizer.py index f5abdf50..f7edd84f 100644 --- a/server/synthesizer.py +++ b/server/synthesizer.py @@ -145,7 +145,6 @@ class Synthesizer(object): print(sen) seq = np.array(self.input_adapter(sen)) - if self.use_phonemes: text_hat = sequence_to_phoneme(seq) print(text_hat) From 52beca60e13d17ab76a313b6e2a5c15644ac5490 Mon Sep 17 00:00:00 2001 From: Reuben Morais Date: Wed, 21 Aug 2019 18:46:22 +0200 Subject: [PATCH 40/57] Add a TTS namespace to all packages in setup.py --- setup.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/setup.py b/setup.py index b1c4c7ac..7757debe 100644 --- a/setup.py +++ b/setup.py @@ -62,7 +62,8 @@ setup( version=version, url='https://github.com/mozilla/TTS', description='Text to Speech with Deep Learning', - packages=find_packages(), + package_dir={'TTS': '.'}, + packages=['TTS'] + ['TTS.' + pkg for pkg in find_packages()], cmdclass={ 'build_py': build_py, 'develop': develop, @@ -79,14 +80,9 @@ setup( "flask", # "lws", "tqdm", - "phonemizer", "soundfile", ], dependency_links=[ 'http://github.com/bootphon/phonemizer/tarball/master#egg=phonemizer' ], - extras_require={ - "bin": [ - "requests", - ], - }) +) From 0a717faf1cacffe66f3bed41095b5093b199785c Mon Sep 17 00:00:00 2001 From: Eren Golge Date: Thu, 22 Aug 2019 15:28:11 +0200 Subject: [PATCH 41/57] reduce TB load for step stats --- train.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/train.py b/train.py index 5c45181b..71cbb398 100644 --- a/train.py +++ b/train.py @@ -212,13 +212,15 @@ def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler, avg_loader_time += loader_time # Plot Training Iter Stats - iter_stats = {"loss_posnet": postnet_loss.item(), - "loss_decoder": decoder_loss.item(), - "lr": current_lr, - "grad_norm": grad_norm, - "grad_norm_st": grad_norm_st, - "step_time": step_time} - tb_logger.tb_train_iter_stats(global_step, iter_stats) + # reduce TB load + if global_step % 10 == 0: + iter_stats = {"loss_posnet": postnet_loss.item(), + "loss_decoder": decoder_loss.item(), + "lr": current_lr, + "grad_norm": grad_norm, + "grad_norm_st": grad_norm_st, + "step_time": step_time} + tb_logger.tb_train_iter_stats(global_step, iter_stats) if global_step % c.save_step == 0: if c.checkpoint: From 97ffa2b44ed81a7f892b45e263e2029f4e401169 Mon Sep 17 00:00:00 2001 From: Eren Golge Date: Thu, 22 Aug 2019 15:28:46 +0200 Subject: [PATCH 42/57] indent --- train.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/train.py b/train.py index 71cbb398..1f808e49 100644 --- a/train.py +++ b/train.py @@ -215,11 +215,11 @@ def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler, # reduce TB load if global_step % 10 == 0: iter_stats = {"loss_posnet": postnet_loss.item(), - "loss_decoder": decoder_loss.item(), - "lr": current_lr, - "grad_norm": grad_norm, - "grad_norm_st": grad_norm_st, - "step_time": step_time} + "loss_decoder": decoder_loss.item(), + "lr": current_lr, + "grad_norm": grad_norm, + "grad_norm_st": grad_norm_st, + "step_time": step_time} tb_logger.tb_train_iter_stats(global_step, iter_stats) if global_step % c.save_step == 0: From e02fc51fde25ad2a66f630f79a5d6b8a8c9fedc7 Mon Sep 17 00:00:00 2001 From: Eren Golge Date: Fri, 23 Aug 2019 12:28:05 +0200 Subject: [PATCH 43/57] server update for changing r value --- server/synthesizer.py | 50 ++++++++++++++++++------------------------- utils/synthesis.py | 4 ++-- 2 files changed, 23 insertions(+), 31 deletions(-) diff --git a/server/synthesizer.py b/server/synthesizer.py index f7edd84f..a0063a3d 100644 --- a/server/synthesizer.py +++ b/server/synthesizer.py @@ -9,6 +9,7 @@ from utils.audio import AudioProcessor from utils.generic_utils import load_config, setup_model from utils.text import phoneme_to_sequence, phonemes, symbols, text_to_sequence, sequence_to_phoneme from utils.speakers import load_speaker_mapping +from utils.synthesis import * import re alphabets = r"([A-Za-z])" @@ -41,28 +42,25 @@ class Synthesizer(object): self.ap = AudioProcessor(**self.tts_config.audio) if self.use_phonemes: self.input_size = len(phonemes) - self.input_adapter = lambda sen: phoneme_to_sequence(sen, [self.tts_config.text_cleaner], self.tts_config.phoneme_language, self.tts_config.enable_eos_bos_chars) else: self.input_size = len(symbols) - self.input_adapter = lambda sen: text_to_sequence(sen, [self.tts_config.text_cleaner]) # load speakers if self.config.tts_speakers is not None: self.tts_speakers = load_speaker_mapping(os.path.join(model_path, self.config.tts_speakers)) num_speakers = len(self.tts_speakers) else: num_speakers = 0 - self.tts_model = setup_model(self.input_size, num_speakers=num_speakers , c=self.tts_config) + self.tts_model = setup_model(self.input_size, num_speakers=num_speakers, c=self.tts_config) # load model state - if use_cuda: - cp = torch.load(self.model_file) - else: - cp = torch.load(self.model_file, map_location=lambda storage, loc: storage) + cp = torch.load(self.model_file) # load the model self.tts_model.load_state_dict(cp['model']) if use_cuda: self.tts_model.cuda() self.tts_model.eval() self.tts_model.decoder.max_decoder_steps = 3000 + if 'r' in cp: + self.tts_model.decoder.set_r(cp['r']) def load_wavernn(self, lib_path, model_path, model_file, model_config, use_cuda): # TODO: set a function in wavernn code base for model setup and call it here. @@ -136,33 +134,27 @@ class Synthesizer(object): def tts(self, text): wavs = [] sens = self.split_into_sentences(text) + print(sens) if not sens: sens = [text+'.'] for sen in sens: - if len(sen) < 3: - continue - sen = sen.strip() - print(sen) + # preprocess the given text + inputs = text_to_seqvec(text, self.tts_config, self.use_cuda) + # synthesize voice + decoder_output, postnet_output, alignments, stop_tokens = run_model( + self.tts_model, inputs, self.tts_config, False, None, None) + # convert outputs to numpy + postnet_output, decoder_output, alignment = parse_outputs( + postnet_output, decoder_output, alignments) - seq = np.array(self.input_adapter(sen)) - if self.use_phonemes: - text_hat = sequence_to_phoneme(seq) - print(text_hat) - - chars_var = torch.from_numpy(seq).unsqueeze(0).long() + if self.wavernn: + postnet_output = postnet_output[0].data.cpu().numpy() + wav = self.wavernn.generate(torch.FloatTensor(postnet_output.T).unsqueeze(0).cuda(), batched=self.config.is_wavernn_batched, target=11000, overlap=550) + else: + wav = inv_spectrogram(postnet_output, self.ap, self.tts_config) + # trim silence + wav = trim_silence(wav, self.ap) - if self.use_cuda: - chars_var = chars_var.cuda() - decoder_out, postnet_out, alignments, stop_tokens = self.tts_model.inference( - chars_var) - postnet_out = postnet_out[0].data.cpu().numpy() - if self.tts_config.model == "Tacotron": - wav = self.ap.inv_spectrogram(postnet_out.T) - elif self.tts_config.model == "Tacotron2": - if self.wavernn: - wav = self.wavernn.generate(torch.FloatTensor(postnet_out.T).unsqueeze(0).cuda(), batched=self.config.is_wavernn_batched, target=11000, overlap=550) - else: - wav = self.ap.inv_mel_spectrogram(postnet_out.T) wavs += list(wav) wavs += [0] * 10000 diff --git a/utils/synthesis.py b/utils/synthesis.py index 7d7bf604..f657eb4d 100644 --- a/utils/synthesis.py +++ b/utils/synthesis.py @@ -50,7 +50,7 @@ def parse_outputs(postnet_output, decoder_output, alignments): return postnet_output, decoder_output, alignment -def trim_silence(wav): +def trim_silence(wav, ap): return wav[:ap.find_endpoint(wav)] @@ -114,5 +114,5 @@ def synthesis(model, wav = inv_spectrogram(postnet_output, ap, CONFIG) # trim silence if do_trim_silence: - wav = trim_silence(wav) + wav = trim_silence(wav, ap) return wav, alignment, decoder_output, postnet_output, stop_tokens From a757c6240e56b1002fdbdc0f02d52f774cd9f65e Mon Sep 17 00:00:00 2001 From: Eren Golge Date: Fri, 23 Aug 2019 13:07:58 +0200 Subject: [PATCH 44/57] new Benchmark notebook --- notebooks/Benchmark.ipynb | 372 ++++++++++---------------------------- 1 file changed, 96 insertions(+), 276 deletions(-) diff --git a/notebooks/Benchmark.ipynb b/notebooks/Benchmark.ipynb index f06aca80..13e31ed9 100644 --- a/notebooks/Benchmark.ipynb +++ b/notebooks/Benchmark.ipynb @@ -20,9 +20,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "TTS_PATH = \"/home/erogol/projects/\"\n", @@ -33,7 +31,6 @@ "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": true, "scrolled": true }, "outputs": [], @@ -45,6 +42,7 @@ "import io\n", "import torch \n", "import time\n", + "import json\n", "import numpy as np\n", "from collections import OrderedDict\n", "from matplotlib import pylab as plt\n", @@ -72,23 +70,23 @@ "from IPython.display import Audio\n", "\n", "import os\n", - "os.environ['CUDA_VISIBLE_DEVICES']='1'\n", - "os.environ['OMP_NUM_THREADS']='1'\n" + "os.environ['CUDA_VISIBLE_DEVICES']='1'" ] }, { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ - "def tts(model, text, CONFIG, use_cuda, ap, use_gl, speaker_id=None, figures=True):\n", + "def tts(model, text, CONFIG, use_cuda, ap, use_gl, figures=True):\n", " t_1 = time.time()\n", - " waveform, alignment, mel_spec, mel_postnet_spec, stop_tokens = synthesis(model, text, CONFIG, use_cuda, ap, truncated=False, speaker_id=speaker_id, enable_eos_bos_chars=CONFIG.enable_eos_bos_chars)\n", + " waveform, alignment, mel_spec, mel_postnet_spec, stop_tokens = synthesis(model, text, CONFIG, use_cuda, ap, speaker_id, False, CONFIG.enable_eos_bos_chars)\n", " if CONFIG.model == \"Tacotron\" and not use_gl:\n", + " # coorect the normalization differences b/w TTS and the Vocoder.\n", " mel_postnet_spec = ap.out_linear_to_mel(mel_postnet_spec.T).T\n", + " mel_postnet_spec = ap._denormalize(mel_postnet_spec)\n", + " mel_postnet_spec = ap_vocoder._normalize(mel_postnet_spec)\n", " if not use_gl:\n", " waveform = wavernn.generate(torch.FloatTensor(mel_postnet_spec.T).unsqueeze(0).cuda(), batched=batched_wavernn, target=11000, overlap=550)\n", "\n", @@ -106,19 +104,17 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "# Set constants\n", - "ROOT_PATH = '/media/erogol/data_ssd/Data/models/mozilla_models/4845/'\n", - "MODEL_PATH = ROOT_PATH + 'best_model.pth.tar'\n", + "ROOT_PATH = '/media/erogol/data_ssd/Models/libri_tts/5049/'\n", + "MODEL_PATH = ROOT_PATH + '/best_model.pth.tar'\n", "CONFIG_PATH = ROOT_PATH + '/config.json'\n", - "OUT_FOLDER = \"/home/erogol/Dropbox/AudioSamples/benchmark_samples/\"\n", + "OUT_FOLDER = '/home/erogol/Dropbox/AudioSamples/benchmark_samples/'\n", "CONFIG = load_config(CONFIG_PATH)\n", - "VOCODER_MODEL_PATH = \"/media/erogol/data_ssd/Data/models/wavernn/mozilla/mozilla-May24-4763/model_checkpoints/best_model.pth.tar\"\n", - "VOCODER_CONFIG_PATH = \"/media/erogol/data_ssd/Data/models/wavernn/mozilla/mozilla-May24-4763/config.json\"\n", + "VOCODER_MODEL_PATH = \"/media/erogol/data_ssd/Models/wavernn/universal/4910/best_model_16K.pth.tar\"\n", + "VOCODER_CONFIG_PATH = \"/media/erogol/data_ssd/Models/wavernn/universal/4910/config_16K.json\"\n", "VOCODER_CONFIG = load_config(VOCODER_CONFIG_PATH)\n", "use_cuda = False\n", "\n", @@ -126,6 +122,8 @@ "# CONFIG.windowing = False\n", "# CONFIG.prenet_dropout = False\n", "# CONFIG.separate_stopnet = True\n", + "# CONFIG.use_forward_attn = True\n", + "# CONFIG.forward_attn_mask = True\n", "# CONFIG.stopnet = True\n", "\n", "# Set the vocoder\n", @@ -136,17 +134,23 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "# LOAD TTS MODEL\n", "from utils.text.symbols import symbols, phonemes\n", "\n", + "# multi speaker \n", + "if CONFIG.use_speaker_embedding:\n", + " speakers = json.load(open(f\"{ROOT_PATH}/speakers.json\", 'r'))\n", + " speakers_idx_to_id = {v: k for k, v in speakers.items()}\n", + "else:\n", + " speakers = []\n", + " speaker_id = None\n", + "\n", "# load the model\n", "num_chars = len(phonemes) if CONFIG.use_phonemes else len(symbols)\n", - "model = setup_model(num_chars, CONFIG)\n", + "model = setup_model(num_chars, len(speakers), CONFIG)\n", "\n", "# load the audio processor\n", "ap = AudioProcessor(**CONFIG.audio) \n", @@ -163,39 +167,45 @@ "if use_cuda:\n", " model.cuda()\n", "model.eval()\n", - "print(cp['step'])" + "print(cp['step'])\n", + "print(cp['r'])\n", + "\n", + "# set model stepsize \n", + "if 'r' in cp:\n", + " model.decoder.set_r(cp['r'])" ] }, { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "# LOAD WAVERNN\n", "if use_gl == False:\n", " from WaveRNN.models.wavernn import Model\n", + " from WaveRNN.utils.audio import AudioProcessor as AudioProcessorVocoder\n", " bits = 10\n", - "\n", + " ap_vocoder = AudioProcessorVocoder(**VOCODER_CONFIG.audio) \n", " wavernn = Model(\n", " rnn_dims=512,\n", " fc_dims=512,\n", - " mode=\"mold\",\n", - " pad=2,\n", - " upsample_factors=VOCODER_CONFIG.upsample_factors, # set this depending on dataset\n", + " mode=VOCODER_CONFIG.mode,\n", + " mulaw=VOCODER_CONFIG.mulaw,\n", + " pad=VOCODER_CONFIG.pad,\n", + " upsample_factors=VOCODER_CONFIG.upsample_factors,\n", " feat_dims=VOCODER_CONFIG.audio[\"num_mels\"],\n", " compute_dims=128,\n", " res_out_dims=128,\n", " res_blocks=10,\n", - " hop_length=ap.hop_length,\n", - " sample_rate=ap.sample_rate,\n", + " hop_length=ap_vocoder.hop_length,\n", + " sample_rate=ap_vocoder.sample_rate,\n", + " use_upsample_net = True,\n", + " use_aux_net = True\n", " ).cuda()\n", "\n", - "\n", " check = torch.load(VOCODER_MODEL_PATH)\n", - " wavernn.load_state_dict(check['model'])\n", + " wavernn.load_state_dict(check['model'], strict=False)\n", " if use_cuda:\n", " wavernn.cuda()\n", " wavernn.eval();\n", @@ -209,70 +219,75 @@ "### Comparision with https://mycroft.ai/blog/available-voices/" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model.eval()\n", + "model.decoder.max_decoder_steps = 2000\n", + "speaker_id = 500\n", + "sentence = \"Bill got in the habit of asking himself “Is that thought true?” and if he wasn’t absolutely certain it was, he just let it go.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" + ] + }, { "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": true, - "scrolled": false + "scrolled": true }, "outputs": [], "source": [ "model.eval()\n", "model.decoder.max_decoder_steps = 2000\n", - "speaker_id = 0\n", - "sentence = \"Bill got in the habit of asking himself “Is that thought true?” And if he wasn’t absolutely certain it was, he just let it go.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)" + "sentence = \"Seine Fuerenden Berater hatten Donald Trump seit Wochen beschworen, berichteten US-Medien: Lassen Sie das mit den Zoellen bleiben.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": true, "scrolled": true }, "outputs": [], "source": [ - "sentence = \"Be a voice, not an echo.\" # 'echo' is not in training set. \n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)" + "sentence = \"Der Klimawandel bedroht die Gletscher im Himalaya.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" ] }, { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ - "sentence = \"The human voice is the most perfect instrument of all.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)" + "sentence = \"Zwei Unternehmen verlieren einem Medienbericht zufolge ihre Verträge als Maut-Inkasso-Manager.\" # 'echo' is not in training set. \n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" ] }, { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ - "sentence = \"I'm sorry Dave. I'm afraid I can't do that.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)" + "sentence = \"Eine Ausländermaut nach dem Geschmack der CSU wird es nicht geben - das bedauert außerhalb der Partei fast niemand.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": true, "scrolled": true }, "outputs": [], "source": [ - "sentence = \"This cake is great. It's so delicious and moist.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)" + "sentence = \"Angela Merkel ist als Klimakanzlerin gestartet.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" ] }, { @@ -285,61 +300,51 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ - "sentence = \"Generative adversarial network or variational auto-encoder.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)" + "sentence = \"Dann vernachlässigte sie das Thema.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" ] }, { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ - "sentence = \"Scientists at the CERN laboratory say they have discovered a new particle.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)" + "sentence = \"Nun, kurz vor dem Ende, will sie damit noch einmal neu anfangen.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" ] }, { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ - "sentence = \"Here’s a way to measure the acute emotional intelligence that has never gone out of style.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)" + "sentence = \"Nun ist der Spieltempel pleite, und manchen Dorfbewohnern fehlt das Geld zum Essen.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" ] }, { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ - "sentence = \"President Trump met with other leaders at the Group of 20 conference.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)" + "sentence = \"Andrea Nahles will in der Fraktion die Vertrauensfrage stellen.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" ] }, { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ - "sentence = \"The buses aren't the problem, they actually provide a solution.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)" + "sentence=\"Die Erfolge der Grünen bringen eine Reihe Unerfahrener in die Parlamente.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" ] }, { @@ -352,212 +357,27 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "sentence = \"Generative adversarial network or variational auto-encoder.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "sentence = \"Basilar membrane and otolaryngology are not auto-correlations.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "sentence = \" He has read the whole thing.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "sentence = \"He reads books.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true, - "scrolled": false - }, - "outputs": [], - "source": [ - "sentence = \"Thisss isrealy awhsome.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true, - "scrolled": false - }, - "outputs": [], - "source": [ - "sentence = \"This is your internet browser, Firefox.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "sentence = \"This is your internet browser Firefox.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "sentence = \"The quick brown fox jumps over the lazy dog.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "sentence = \"Does the quick brown fox jump over the lazy dog?\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "sentence = \"Eren, how are you?\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "markdown", "metadata": {}, - "source": [ - "### Hard Sentences" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, "outputs": [], "source": [ - "sentence = \"Encouraged, he started with a minute a day.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)" + "sentence=\"Die Luftfahrtbranche arbeitet daran, CO2-neutral zu werden.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" ] }, { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ - "sentence = \"His meditation consisted of “body scanning” which involved focusing his mind and energy on each section of the body from head to toe .\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)" + "sentence=\"Michael Kretschmer versucht seit Monaten, die Bürger zu umgarnen.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" ] }, { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "sentence = \"Recent research at Harvard has shown meditating for as little as 8 weeks can actually increase the grey matter in the parts of the brain responsible for emotional regulation and learning . \"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "sentence = \"If he decided to watch TV he really watched it.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true, - "scrolled": true - }, - "outputs": [], - "source": [ - "sentence = \"Often we try to bring about change through sheer effort and we put all of our energy into a new initiative .\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# for twb dataset\n", - "sentence = \"In our preparation for Easter, God in his providence offers us each year the season of Lent as a sacramental sign of our conversion.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "# !zip benchmark_samples/samples.zip benchmark_samples/*" @@ -566,9 +386,9 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3(mztts)", + "display_name": "Python 3", "language": "python", - "name": "mztts" + "name": "python3" }, "language_info": { "codemirror_mode": { @@ -580,9 +400,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.8" + "version": "3.7.3" } }, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 4 } From b74af15c7738ed640d111d8a36d075c45cba3295 Mon Sep 17 00:00:00 2001 From: Eren Golge Date: Fri, 23 Aug 2019 13:10:59 +0200 Subject: [PATCH 45/57] linter --- server/synthesizer.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/server/synthesizer.py b/server/synthesizer.py index a0063a3d..414a15fe 100644 --- a/server/synthesizer.py +++ b/server/synthesizer.py @@ -7,7 +7,7 @@ import sys from utils.audio import AudioProcessor from utils.generic_utils import load_config, setup_model -from utils.text import phoneme_to_sequence, phonemes, symbols, text_to_sequence, sequence_to_phoneme +from utils.text import phonemes, symbols from utils.speakers import load_speaker_mapping from utils.synthesis import * @@ -139,12 +139,12 @@ class Synthesizer(object): sens = [text+'.'] for sen in sens: # preprocess the given text - inputs = text_to_seqvec(text, self.tts_config, self.use_cuda) + inputs = text_to_seqvec(sen, self.tts_config, self.use_cuda) # synthesize voice - decoder_output, postnet_output, alignments, stop_tokens = run_model( + decoder_output, postnet_output, alignments, _ = run_model( self.tts_model, inputs, self.tts_config, False, None, None) # convert outputs to numpy - postnet_output, decoder_output, alignment = parse_outputs( + postnet_output, decoder_output, _ = parse_outputs( postnet_output, decoder_output, alignments) if self.wavernn: From 1e6f5113d561325efeb420b2e2f7c22f52ccad1b Mon Sep 17 00:00:00 2001 From: Eren Golge Date: Fri, 23 Aug 2019 13:22:40 +0200 Subject: [PATCH 46/57] small bug fix for tacotron2 used in server --- server/conf.json | 10 +++++----- server/synthesizer.py | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/server/conf.json b/server/conf.json index 6341596d..c8861cd1 100644 --- a/server/conf.json +++ b/server/conf.json @@ -1,12 +1,12 @@ { - "tts_path":"/media/erogol/data_ssd/Models/libri_tts/ljspeech-July-22-2019_10+45AM-ee706b5/", // tts model root folder + "tts_path":"/media/erogol/data_ssd/Models/libri_tts/5049/", // tts model root folder "tts_file":"best_model.pth.tar", // tts checkpoint file "tts_config":"config.json", // tts config.json file "tts_speakers": null, // json file listing speaker ids. null if no speaker embedding. - "wavernn_lib_path": "/home/erogol/projects/", // Rootpath to wavernn project folder to be imported. If this is null, model uses GL for speech synthesis. - "wavernn_path":"/media/erogol/data_ssd/Models/wavernn/universal/4910/", // wavernn model root path - "wavernn_file":"best_model_16K.pth.tar", // wavernn checkpoint file name - "wavernn_config":"config_16K.json", // wavernn config file + "wavernn_lib_path": null, // Rootpath to wavernn project folder to be imported. If this is null, model uses GL for speech synthesis. + "wavernn_path":null, // wavernn model root path + "wavernn_file":null, // wavernn checkpoint file name + "wavernn_config": null, // wavernn config file "is_wavernn_batched":true, "port": 5002, "use_cuda": true, diff --git a/server/synthesizer.py b/server/synthesizer.py index 414a15fe..3848968f 100644 --- a/server/synthesizer.py +++ b/server/synthesizer.py @@ -59,7 +59,7 @@ class Synthesizer(object): self.tts_model.cuda() self.tts_model.eval() self.tts_model.decoder.max_decoder_steps = 3000 - if 'r' in cp: + if 'r' in cp and self.tts_config.model in ["Tacotron", "TacotronGST"]: self.tts_model.decoder.set_r(cp['r']) def load_wavernn(self, lib_path, model_path, model_file, model_config, use_cuda): From ea84b2997b3153826d5b00d0ccea2064dc4e589b Mon Sep 17 00:00:00 2001 From: Eren Golge Date: Mon, 26 Aug 2019 15:18:43 +0200 Subject: [PATCH 47/57] more setup.py --- setup.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/setup.py b/setup.py index 7757debe..e9fcc574 100644 --- a/setup.py +++ b/setup.py @@ -62,8 +62,15 @@ setup( version=version, url='https://github.com/mozilla/TTS', description='Text to Speech with Deep Learning', + license='MPL-2.0', package_dir={'TTS': '.'}, packages=['TTS'] + ['TTS.' + pkg for pkg in find_packages()], + project_urls={ + 'Documentation': 'https://github.com/mozilla/TTS/wiki', + 'Tracker': 'https://github.com/mozilla/TTS/issues', + 'Repository': 'https://github.com/mozilla/TTS', + 'Discussions': 'https://discourse.mozilla.org/c/tts', + }, cmdclass={ 'build_py': build_py, 'develop': develop, From d9e56d2b4c4a0eb2c9d69f610c3dc628e52aaec3 Mon Sep 17 00:00:00 2001 From: Reuben Morais Date: Mon, 26 Aug 2019 16:36:46 +0200 Subject: [PATCH 48/57] Use direct link dependency for phonemizer --- setup.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/setup.py b/setup.py index 7757debe..b824b106 100644 --- a/setup.py +++ b/setup.py @@ -81,8 +81,6 @@ setup( # "lws", "tqdm", "soundfile", - ], - dependency_links=[ - 'http://github.com/bootphon/phonemizer/tarball/master#egg=phonemizer' + "phonemizer @ https://github.com/bootphon/phonemizer/tarball/master", ], ) From 2563fb873eaaa54f6539012d1c7e9fad12fb474b Mon Sep 17 00:00:00 2001 From: Eugene Ingerman Date: Sat, 24 Aug 2019 14:17:20 -0700 Subject: [PATCH 49/57] Fixed postnet for GST. --- models/tacotrongst.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/models/tacotrongst.py b/models/tacotrongst.py index 5b372338..c18a5e98 100644 --- a/models/tacotrongst.py +++ b/models/tacotrongst.py @@ -38,9 +38,8 @@ class TacotronGST(nn.Module): forward_attn, trans_agent, forward_attn_mask, location_attn, separate_stopnet) self.postnet = PostCBHG(mel_dim) - self.last_linear = nn.Sequential( - nn.Linear(self.postnet.cbhg.gru_features * 2, linear_dim), - nn.Sigmoid()) + self.last_linear = nn.Linear(self.postnet.cbhg.gru_features * 2, linear_dim) + def forward(self, characters, text_lengths, mel_specs, speaker_ids=None): B = characters.size(0) From 3c5aeb5e22e7938aef41702eab8a1ee683209f75 Mon Sep 17 00:00:00 2001 From: Reuben Morais Date: Thu, 29 Aug 2019 11:49:53 +0200 Subject: [PATCH 50/57] Fix installation by using an explicit symlink --- __init__.py | 0 datasets/TTSDataset.py | 4 ++-- distribute.py | 2 +- layers/losses.py | 2 +- models/tacotron.py | 4 ++-- models/tacotron2.py | 4 ++-- models/tacotrongst.py | 6 +++--- notebooks/Benchmark.ipynb | 2 +- notebooks/ExtractTTSpectrogram.ipynb | 8 ++++---- server/server.py | 2 +- server/synthesizer.py | 10 +++++----- setup.py | 4 ++-- synthesize.py | 8 ++++---- tests/generic_utils_text.py | 4 ++-- tests/symbols_tests.py | 2 +- tests/test_audio.py | 6 +++--- tests/test_demo_server.py | 8 ++++---- tests/test_layers.py | 6 +++--- tests/test_loader.py | 8 ++++---- tests/test_preprocessors.py | 4 ++-- tests/test_tacotron2_model.py | 6 +++--- tests/test_tacotron_model.py | 6 +++--- tests/test_text_processing.py | 2 +- train.py | 30 ++++++++++++++-------------- tts_namespace/README.md | 29 +++++++++++++++++++++++++++ tts_namespace/TTS | 1 + utils/speakers.py | 2 +- utils/text/__init__.py | 4 ++-- utils/visual.py | 2 +- 29 files changed, 103 insertions(+), 73 deletions(-) create mode 100644 __init__.py create mode 100644 tts_namespace/README.md create mode 120000 tts_namespace/TTS diff --git a/__init__.py b/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/datasets/TTSDataset.py b/datasets/TTSDataset.py index dea6cb8c..cbb4bf97 100644 --- a/datasets/TTSDataset.py +++ b/datasets/TTSDataset.py @@ -5,8 +5,8 @@ import torch import random from torch.utils.data import Dataset -from utils.text import text_to_sequence, phoneme_to_sequence, pad_with_eos_bos -from utils.data import prepare_data, prepare_tensor, prepare_stop_target +from TTS.utils.text import text_to_sequence, phoneme_to_sequence, pad_with_eos_bos +from TTS.utils.data import prepare_data, prepare_tensor, prepare_stop_target class MyDataset(Dataset): diff --git a/distribute.py b/distribute.py index 22c27b1c..f65fbe71 100644 --- a/distribute.py +++ b/distribute.py @@ -9,7 +9,7 @@ import torch.distributed as dist from torch.utils.data.sampler import Sampler from torch.autograd import Variable from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors -from utils.generic_utils import load_config, create_experiment_folder +from TTS.utils.generic_utils import load_config, create_experiment_folder class DistributedSampler(Sampler): diff --git a/layers/losses.py b/layers/losses.py index 5a95c0fe..a6bf95d3 100644 --- a/layers/losses.py +++ b/layers/losses.py @@ -1,6 +1,6 @@ from torch import nn from torch.nn import functional -from utils.generic_utils import sequence_mask +from TTS.utils.generic_utils import sequence_mask class L1LossMasked(nn.Module): diff --git a/models/tacotron.py b/models/tacotron.py index bf312db4..69a6fa03 100644 --- a/models/tacotron.py +++ b/models/tacotron.py @@ -1,7 +1,7 @@ # coding: utf-8 from torch import nn -from layers.tacotron import Encoder, Decoder, PostCBHG -from utils.generic_utils import sequence_mask +from TTS.layers.tacotron import Encoder, Decoder, PostCBHG +from TTS.utils.generic_utils import sequence_mask class Tacotron(nn.Module): diff --git a/models/tacotron2.py b/models/tacotron2.py index 05b4c0fd..a91d6e2e 100644 --- a/models/tacotron2.py +++ b/models/tacotron2.py @@ -1,7 +1,7 @@ from math import sqrt from torch import nn -from layers.tacotron2 import Encoder, Decoder, Postnet -from utils.generic_utils import sequence_mask +from TTS.layers.tacotron2 import Encoder, Decoder, Postnet +from TTS.utils.generic_utils import sequence_mask # TODO: match function arguments with tacotron diff --git a/models/tacotrongst.py b/models/tacotrongst.py index c18a5e98..5ea389d9 100644 --- a/models/tacotrongst.py +++ b/models/tacotrongst.py @@ -1,8 +1,8 @@ # coding: utf-8 from torch import nn -from layers.tacotron import Encoder, Decoder, PostCBHG -from layers.gst_layers import GST -from utils.generic_utils import sequence_mask +from TTS.layers.tacotron import Encoder, Decoder, PostCBHG +from TTS.layers.gst_layers import GST +from TTS.utils.generic_utils import sequence_mask class TacotronGST(nn.Module): diff --git a/notebooks/Benchmark.ipynb b/notebooks/Benchmark.ipynb index 13e31ed9..81f81641 100644 --- a/notebooks/Benchmark.ipynb +++ b/notebooks/Benchmark.ipynb @@ -138,7 +138,7 @@ "outputs": [], "source": [ "# LOAD TTS MODEL\n", - "from utils.text.symbols import symbols, phonemes\n", + "from TTS.utils.text.symbols import symbols, phonemes\n", "\n", "# multi speaker \n", "if CONFIG.use_speaker_embedding:\n", diff --git a/notebooks/ExtractTTSpectrogram.ipynb b/notebooks/ExtractTTSpectrogram.ipynb index f044300d..a0d0be60 100644 --- a/notebooks/ExtractTTSpectrogram.ipynb +++ b/notebooks/ExtractTTSpectrogram.ipynb @@ -105,10 +105,10 @@ "metadata": {}, "outputs": [], "source": [ - "from utils.text.symbols import symbols, phonemes\n", - "from utils.generic_utils import sequence_mask\n", - "from layers.losses import L1LossMasked\n", - "from utils.text.symbols import symbols, phonemes\n", + "from TTS.utils.text.symbols import symbols, phonemes\n", + "from TTS.utils.generic_utils import sequence_mask\n", + "from TTS.layers.losses import L1LossMasked\n", + "from TTS.utils.text.symbols import symbols, phonemes\n", "\n", "# load the model\n", "num_chars = len(phonemes) if C.use_phonemes else len(symbols)\n", diff --git a/server/server.py b/server/server.py index 95fa1caf..0244d612 100644 --- a/server/server.py +++ b/server/server.py @@ -1,7 +1,7 @@ #!flask/bin/python import argparse from synthesizer import Synthesizer -from utils.generic_utils import load_config +from TTS.utils.generic_utils import load_config from flask import Flask, request, render_template, send_file parser = argparse.ArgumentParser() diff --git a/server/synthesizer.py b/server/synthesizer.py index 3848968f..00311914 100644 --- a/server/synthesizer.py +++ b/server/synthesizer.py @@ -5,11 +5,11 @@ import numpy as np import torch import sys -from utils.audio import AudioProcessor -from utils.generic_utils import load_config, setup_model -from utils.text import phonemes, symbols -from utils.speakers import load_speaker_mapping -from utils.synthesis import * +from TTS.utils.audio import AudioProcessor +from TTS.utils.generic_utils import load_config, setup_model +from TTS.utils.text import phonemes, symbols +from TTS.utils.speakers import load_speaker_mapping +from TTS.utils.synthesis import * import re alphabets = r"([A-Za-z])" diff --git a/setup.py b/setup.py index b824b106..aae8a015 100644 --- a/setup.py +++ b/setup.py @@ -62,8 +62,8 @@ setup( version=version, url='https://github.com/mozilla/TTS', description='Text to Speech with Deep Learning', - package_dir={'TTS': '.'}, - packages=['TTS'] + ['TTS.' + pkg for pkg in find_packages()], + package_dir={'': 'tts_namespace'}, + packages=find_packages('tts_namespace'), cmdclass={ 'build_py': build_py, 'develop': develop, diff --git a/synthesize.py b/synthesize.py index 33a31c69..23c67c73 100644 --- a/synthesize.py +++ b/synthesize.py @@ -4,10 +4,10 @@ import argparse import torch import string -from utils.synthesis import synthesis -from utils.generic_utils import load_config, setup_model -from utils.text.symbols import symbols, phonemes -from utils.audio import AudioProcessor +from TTS.utils.synthesis import synthesis +from TTS.utils.generic_utils import load_config, setup_model +from TTS.utils.text.symbols import symbols, phonemes +from TTS.utils.audio import AudioProcessor def tts(model, diff --git a/tests/generic_utils_text.py b/tests/generic_utils_text.py index 2ef39c09..228df2df 100644 --- a/tests/generic_utils_text.py +++ b/tests/generic_utils_text.py @@ -1,8 +1,8 @@ import unittest import torch as T -from utils.generic_utils import save_checkpoint, save_best_model -from layers.tacotron import Prenet +from TTS.utils.generic_utils import save_checkpoint, save_best_model +from TTS.layers.tacotron import Prenet OUT_PATH = '/tmp/test.pth.tar' diff --git a/tests/symbols_tests.py b/tests/symbols_tests.py index d35e887e..4c32c7d6 100644 --- a/tests/symbols_tests.py +++ b/tests/symbols_tests.py @@ -1,6 +1,6 @@ import unittest -from utils.text import phonemes +from TTS.utils.text import phonemes class SymbolsTest(unittest.TestCase): def test_uniqueness(self): #pylint: disable=no-self-use diff --git a/tests/test_audio.py b/tests/test_audio.py index b2c4a135..fc5deb48 100644 --- a/tests/test_audio.py +++ b/tests/test_audio.py @@ -1,9 +1,9 @@ import os import unittest -from tests import get_tests_path, get_tests_input_path, get_tests_output_path -from utils.audio import AudioProcessor -from utils.generic_utils import load_config +from TTS.tests import get_tests_path, get_tests_input_path, get_tests_output_path +from TTS.utils.audio import AudioProcessor +from TTS.utils.generic_utils import load_config TESTS_PATH = get_tests_path() OUT_PATH = os.path.join(get_tests_output_path(), "audio_tests") diff --git a/tests/test_demo_server.py b/tests/test_demo_server.py index 0d0a3ac6..80c774e8 100644 --- a/tests/test_demo_server.py +++ b/tests/test_demo_server.py @@ -3,10 +3,10 @@ import unittest import torch as T -from server.synthesizer import Synthesizer -from tests import get_tests_input_path, get_tests_output_path, get_tests_path -from utils.text.symbols import phonemes, symbols -from utils.generic_utils import load_config, save_checkpoint, setup_model +from TTS.server.synthesizer import Synthesizer +from TTS.tests import get_tests_input_path, get_tests_output_path +from TTS.utils.text.symbols import phonemes, symbols +from TTS.utils.generic_utils import load_config, save_checkpoint, setup_model class DemoServerTest(unittest.TestCase): diff --git a/tests/test_layers.py b/tests/test_layers.py index 7d9e0650..cf27e30c 100644 --- a/tests/test_layers.py +++ b/tests/test_layers.py @@ -1,9 +1,9 @@ import unittest import torch as T -from layers.tacotron import Prenet, CBHG, Decoder, Encoder -from layers.losses import L1LossMasked -from utils.generic_utils import sequence_mask +from TTS.layers.tacotron import Prenet, CBHG, Decoder, Encoder +from TTS.layers.losses import L1LossMasked +from TTS.utils.generic_utils import sequence_mask #pylint: disable=unused-variable diff --git a/tests/test_loader.py b/tests/test_loader.py index 9d151820..fe1cefef 100644 --- a/tests/test_loader.py +++ b/tests/test_loader.py @@ -5,10 +5,10 @@ import torch import numpy as np from torch.utils.data import DataLoader -from utils.generic_utils import load_config -from utils.audio import AudioProcessor -from datasets import TTSDataset -from datasets.preprocess import ljspeech +from TTS.utils.generic_utils import load_config +from TTS.utils.audio import AudioProcessor +from TTS.datasets import TTSDataset +from TTS.datasets.preprocess import ljspeech #pylint: disable=unused-variable diff --git a/tests/test_preprocessors.py b/tests/test_preprocessors.py index 6f4b6df1..993ee495 100644 --- a/tests/test_preprocessors.py +++ b/tests/test_preprocessors.py @@ -1,8 +1,8 @@ import unittest import os -from tests import get_tests_input_path +from TTS.tests import get_tests_input_path -from datasets.preprocess import common_voice +from TTS.datasets.preprocess import common_voice class TestPreprocessors(unittest.TestCase): diff --git a/tests/test_tacotron2_model.py b/tests/test_tacotron2_model.py index 9ec2d4dc..a26f1ddf 100644 --- a/tests/test_tacotron2_model.py +++ b/tests/test_tacotron2_model.py @@ -6,9 +6,9 @@ import numpy as np from torch import optim from torch import nn -from utils.generic_utils import load_config -from layers.losses import MSELossMasked -from models.tacotron2 import Tacotron2 +from TTS.utils.generic_utils import load_config +from TTS.layers.losses import MSELossMasked +from TTS.models.tacotron2 import Tacotron2 #pylint: disable=unused-variable diff --git a/tests/test_tacotron_model.py b/tests/test_tacotron_model.py index b44cb58f..acd7af41 100644 --- a/tests/test_tacotron_model.py +++ b/tests/test_tacotron_model.py @@ -5,9 +5,9 @@ import unittest from torch import optim from torch import nn -from utils.generic_utils import load_config -from layers.losses import L1LossMasked -from models.tacotron import Tacotron +from TTS.utils.generic_utils import load_config +from TTS.layers.losses import L1LossMasked +from TTS.models.tacotron import Tacotron #pylint: disable=unused-variable diff --git a/tests/test_text_processing.py b/tests/test_text_processing.py index 62440e47..8f8e6fab 100644 --- a/tests/test_text_processing.py +++ b/tests/test_text_processing.py @@ -1,7 +1,7 @@ import unittest import torch as T -from utils.text import * +from TTS.utils.text import * def test_phoneme_to_sequence(): text = "Recent research at Harvard has shown meditating for as little as 8 weeks can actually increase, the grey matter in the parts of the brain responsible for emotional regulation and learning!" diff --git a/train.py b/train.py index 1f808e49..5b5a4b13 100644 --- a/train.py +++ b/train.py @@ -10,24 +10,24 @@ import torch.nn as nn from torch import optim from torch.utils.data import DataLoader -from datasets.TTSDataset import MyDataset +from TTS.datasets.TTSDataset import MyDataset from distribute import (DistributedSampler, apply_gradient_allreduce, init_distributed, reduce_tensor) -from layers.losses import L1LossMasked, MSELossMasked -from utils.audio import AudioProcessor -from utils.generic_utils import (NoamLR, check_update, count_parameters, - create_experiment_folder, get_git_branch, - load_config, remove_experiment_folder, - save_best_model, save_checkpoint, weight_decay, - set_init_dict, copy_config_file, setup_model, - split_dataset, gradual_training_scheduler) -from utils.logger import Logger -from utils.speakers import load_speaker_mapping, save_speaker_mapping, \ +from TTS.layers.losses import L1LossMasked, MSELossMasked +from TTS.utils.audio import AudioProcessor +from TTS.utils.generic_utils import (NoamLR, check_update, count_parameters, + create_experiment_folder, get_git_branch, + load_config, remove_experiment_folder, + save_best_model, save_checkpoint, weight_decay, + set_init_dict, copy_config_file, setup_model, + split_dataset, gradual_training_scheduler) +from TTS.utils.logger import Logger +from TTS.utils.speakers import load_speaker_mapping, save_speaker_mapping, \ get_speakers -from utils.synthesis import synthesis -from utils.text.symbols import phonemes, symbols -from utils.visual import plot_alignment, plot_spectrogram -from datasets.preprocess import get_preprocessor_by_name +from TTS.utils.synthesis import synthesis +from TTS.utils.text.symbols import phonemes, symbols +from TTS.utils.visual import plot_alignment, plot_spectrogram +from TTS.datasets.preprocess import get_preprocessor_by_name torch.backends.cudnn.enabled = True torch.backends.cudnn.benchmark = False diff --git a/tts_namespace/README.md b/tts_namespace/README.md new file mode 100644 index 00000000..c5b2ddbf --- /dev/null +++ b/tts_namespace/README.md @@ -0,0 +1,29 @@ +This folder contains a symlink called TTS to the parent folder: + + lrwxr-xr-x TTS -> .. + +This is used to appease the distribute/setuptools gods. When the project was +initially set up, the repository folder itself was considered a namespace, and +development was done with `sys.path` hacks. This means if you tried to install +TTS, `setup.py` would see the packages `models`, `utils`, `layers`... instead of + `TTS.models`, `TTS.utils`... + +Installing TTS would then pollute the package namespace with generic names like +those above. In order to make things installable in both install and development +modes (`pip install /path/to/TTS` and `pip install -e /path/to/TTS`), we needed +to add an additional 'TTS' namespace to avoid this pollution. A virtual redirect +using `packages_dir` in `setup.py` is not enough because it breaks the editable +installation, which can only handle the simplest of `package_dir` redirects. + +Our solution is to use a symlink in order to add the extra `TTS` namespace. In +`setup.py`, we only look for packages inside `tts_namespace` (this folder), +which contains a symlink called TTS pointing to the repository root. The final +result is that `setuptools.find_packages` will find `TTS.models`, `TTS.utils`... + +With this hack, `pip install -e` will then add a symlink to the `tts_namespace` +in your `site-packages` folder, which works properly. It's important not to add +anything else in this folder because it will pollute the package namespace when +installing the project. + +This does not work if you check out your project on a filesystem that does not +support symlinks. \ No newline at end of file diff --git a/tts_namespace/TTS b/tts_namespace/TTS new file mode 120000 index 00000000..a96aa0ea --- /dev/null +++ b/tts_namespace/TTS @@ -0,0 +1 @@ +.. \ No newline at end of file diff --git a/utils/speakers.py b/utils/speakers.py index a1c273cf..4b11531b 100644 --- a/utils/speakers.py +++ b/utils/speakers.py @@ -1,7 +1,7 @@ import os import json -from datasets.preprocess import get_preprocessor_by_name +from TTS.datasets.preprocess import get_preprocessor_by_name def make_speakers_json_path(out_path): diff --git a/utils/text/__init__.py b/utils/text/__init__.py index 226e2e8d..1c5b98c3 100644 --- a/utils/text/__init__.py +++ b/utils/text/__init__.py @@ -3,8 +3,8 @@ import re import phonemizer from phonemizer.phonemize import phonemize -from utils.text import cleaners -from utils.text.symbols import symbols, phonemes, _phoneme_punctuations, _bos, \ +from TTS.utils.text import cleaners +from TTS.utils.text.symbols import symbols, phonemes, _phoneme_punctuations, _bos, \ _eos # Mappings from symbol to numeric ID and vice versa: diff --git a/utils/visual.py b/utils/visual.py index 982fa53a..1ee87cfb 100644 --- a/utils/visual.py +++ b/utils/visual.py @@ -2,7 +2,7 @@ import librosa import matplotlib matplotlib.use('Agg') import matplotlib.pyplot as plt -from utils.text import phoneme_to_sequence, sequence_to_phoneme +from TTS.utils.text import phoneme_to_sequence, sequence_to_phoneme def plot_alignment(alignment, info=None): From 28644a717e8bd8fd9fce644b6874a216b6e2d20f Mon Sep 17 00:00:00 2001 From: Reuben Morais Date: Thu, 29 Aug 2019 12:11:31 +0200 Subject: [PATCH 51/57] Fix tests --- .travis/script | 2 ++ tests/inputs/server_config.json | 2 +- utils/generic_utils.py | 2 +- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/.travis/script b/.travis/script index 4aa275be..41a17a4c 100755 --- a/.travis/script +++ b/.travis/script @@ -11,5 +11,7 @@ fi if [[ "$TEST_SUITE" == "unittest" ]]; then # Run tests on all pushes + pushd tts_namespace python -m unittest + popd fi diff --git a/tests/inputs/server_config.json b/tests/inputs/server_config.json index d3220d7d..8ac266bd 100644 --- a/tests/inputs/server_config.json +++ b/tests/inputs/server_config.json @@ -1,5 +1,5 @@ { - "tts_path":"tests/outputs/", // tts model root folder + "tts_path":"TTS/tests/outputs/", // tts model root folder "tts_file":"checkpoint_10.pth.tar", // tts checkpoint file "tts_config":"dummy_model_config.json", // tts config.json file "tts_speakers": null, // json file listing speaker ids. null if no speaker embedding. diff --git a/utils/generic_utils.py b/utils/generic_utils.py index 51155254..1c16834a 100644 --- a/utils/generic_utils.py +++ b/utils/generic_utils.py @@ -250,7 +250,7 @@ def set_init_dict(model_dict, checkpoint, c): def setup_model(num_chars, num_speakers, c): print(" > Using model: {}".format(c.model)) - MyModel = importlib.import_module('models.' + c.model.lower()) + MyModel = importlib.import_module('TTS.models.' + c.model.lower()) MyModel = getattr(MyModel, c.model) if c.model.lower() in ["tacotron", "tacotrongst"]: model = MyModel( From 529348d6dc6fa6ffb49585abf01fbae3b8640d58 Mon Sep 17 00:00:00 2001 From: Eren Golge Date: Fri, 30 Aug 2019 10:29:22 +0200 Subject: [PATCH 52/57] lint fixes --- setup.py | 10 ++--- utils/radam.py | 108 +++++++++++++------------------------------------ 2 files changed, 32 insertions(+), 86 deletions(-) diff --git a/setup.py b/setup.py index 51ee87ae..a8e52739 100644 --- a/setup.py +++ b/setup.py @@ -66,11 +66,11 @@ setup( package_dir={'': 'tts_namespace'}, packages=find_packages('tts_namespace'), project_urls={ - 'Documentation': 'https://github.com/mozilla/TTS/wiki', - 'Tracker': 'https://github.com/mozilla/TTS/issues', - 'Repository': 'https://github.com/mozilla/TTS', - 'Discussions': 'https://discourse.mozilla.org/c/tts', - }, + 'Documentation': 'https://github.com/mozilla/TTS/wiki', + 'Tracker': 'https://github.com/mozilla/TTS/issues', + 'Repository': 'https://github.com/mozilla/TTS', + 'Discussions': 'https://discourse.mozilla.org/c/tts', + }, cmdclass={ 'build_py': build_py, 'develop': develop, diff --git a/utils/radam.py b/utils/radam.py index d3a65dc5..57323541 100644 --- a/utils/radam.py +++ b/utils/radam.py @@ -1,6 +1,7 @@ import math import torch -from torch.optim.optimizer import Optimizer, required +from torch.optim.optimizer import Optimizer + class RAdam(Optimizer): @@ -9,7 +10,7 @@ class RAdam(Optimizer): self.buffer = [[None, None, None] for ind in range(10)] super(RAdam, self).__init__(params, defaults) - def __setstate__(self, state): + def __setstate__(self, state): # pylint: disable= useless-super-delegation super(RAdam, self).__setstate__(state) def step(self, closure=None): @@ -25,19 +26,21 @@ class RAdam(Optimizer): continue grad = p.grad.data.float() if grad.is_sparse: - raise RuntimeError('RAdam does not support sparse gradients') + raise RuntimeError( + 'RAdam does not support sparse gradients') p_data_fp32 = p.data.float() state = self.state[p] - if len(state) == 0: + if not state: state['step'] = 0 state['exp_avg'] = torch.zeros_like(p_data_fp32) state['exp_avg_sq'] = torch.zeros_like(p_data_fp32) else: state['exp_avg'] = state['exp_avg'].type_as(p_data_fp32) - state['exp_avg_sq'] = state['exp_avg_sq'].type_as(p_data_fp32) + state['exp_avg_sq'] = state['exp_avg_sq'].type_as( + p_data_fp32) exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] beta1, beta2 = group['betas'] @@ -53,21 +56,24 @@ class RAdam(Optimizer): buffered[0] = state['step'] beta2_t = beta2 ** state['step'] N_sma_max = 2 / (1 - beta2) - 1 - N_sma = N_sma_max - 2 * state['step'] * beta2_t / (1 - beta2_t) + N_sma = N_sma_max - 2 * \ + state['step'] * beta2_t / (1 - beta2_t) buffered[1] = N_sma # more conservative since it's an approximated value if N_sma >= 5: - step_size = group['lr'] * math.sqrt((1 - beta2_t) * (N_sma - 4) / (N_sma_max - 4) * (N_sma - 2) / N_sma * N_sma_max / (N_sma_max - 2)) / (1 - beta1 ** state['step']) + step_size = group['lr'] * math.sqrt((1 - beta2_t) * (N_sma - 4) / (N_sma_max - 4) * ( + N_sma - 2) / N_sma * N_sma_max / (N_sma_max - 2)) / (1 - beta1 ** state['step']) else: step_size = group['lr'] / (1 - beta1 ** state['step']) buffered[2] = step_size if group['weight_decay'] != 0: - p_data_fp32.add_(-group['weight_decay'] * group['lr'], p_data_fp32) + p_data_fp32.add_(-group['weight_decay'] + * group['lr'], p_data_fp32) # more conservative since it's an approximated value - if N_sma >= 5: + if N_sma >= 5: denom = exp_avg_sq.sqrt().add_(group['eps']) p_data_fp32.addcdiv_(-step_size, exp_avg, denom) else: @@ -77,6 +83,7 @@ class RAdam(Optimizer): return loss + class PlainRAdam(Optimizer): def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0): @@ -84,7 +91,7 @@ class PlainRAdam(Optimizer): super(PlainRAdam, self).__init__(params, defaults) - def __setstate__(self, state): + def __setstate__(self, state): # pylint: disable= useless-super-delegation super(PlainRAdam, self).__setstate__(state) def step(self, closure=None): @@ -100,19 +107,21 @@ class PlainRAdam(Optimizer): continue grad = p.grad.data.float() if grad.is_sparse: - raise RuntimeError('RAdam does not support sparse gradients') + raise RuntimeError( + 'RAdam does not support sparse gradients') p_data_fp32 = p.data.float() state = self.state[p] - if len(state) == 0: + if not state: state['step'] = 0 state['exp_avg'] = torch.zeros_like(p_data_fp32) state['exp_avg_sq'] = torch.zeros_like(p_data_fp32) else: state['exp_avg'] = state['exp_avg'].type_as(p_data_fp32) - state['exp_avg_sq'] = state['exp_avg_sq'].type_as(p_data_fp32) + state['exp_avg_sq'] = state['exp_avg_sq'].type_as( + p_data_fp32) exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] beta1, beta2 = group['betas'] @@ -126,11 +135,13 @@ class PlainRAdam(Optimizer): N_sma = N_sma_max - 2 * state['step'] * beta2_t / (1 - beta2_t) if group['weight_decay'] != 0: - p_data_fp32.add_(-group['weight_decay'] * group['lr'], p_data_fp32) + p_data_fp32.add_(-group['weight_decay'] + * group['lr'], p_data_fp32) # more conservative since it's an approximated value - if N_sma >= 5: - step_size = group['lr'] * math.sqrt((1 - beta2_t) * (N_sma - 4) / (N_sma_max - 4) * (N_sma - 2) / N_sma * N_sma_max / (N_sma_max - 2)) / (1 - beta1 ** state['step']) + if N_sma >= 5: + step_size = group['lr'] * math.sqrt((1 - beta2_t) * (N_sma - 4) / (N_sma_max - 4) * ( + N_sma - 2) / N_sma * N_sma_max / (N_sma_max - 2)) / (1 - beta1 ** state['step']) denom = exp_avg_sq.sqrt().add_(group['eps']) p_data_fp32.addcdiv_(-step_size, exp_avg, denom) else: @@ -140,68 +151,3 @@ class PlainRAdam(Optimizer): p.data.copy_(p_data_fp32) return loss - - -class AdamW(Optimizer): - - def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0, warmup = 0): - defaults = dict(lr=lr, betas=betas, eps=eps, - weight_decay=weight_decay, warmup = warmup) - super(AdamW, self).__init__(params, defaults) - - def __setstate__(self, state): - super(AdamW, self).__setstate__(state) - - def step(self, closure=None): - loss = None - if closure is not None: - loss = closure() - - for group in self.param_groups: - - for p in group['params']: - if p.grad is None: - continue - grad = p.grad.data.float() - if grad.is_sparse: - raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead') - - p_data_fp32 = p.data.float() - - state = self.state[p] - - if len(state) == 0: - state['step'] = 0 - state['exp_avg'] = torch.zeros_like(p_data_fp32) - state['exp_avg_sq'] = torch.zeros_like(p_data_fp32) - else: - state['exp_avg'] = state['exp_avg'].type_as(p_data_fp32) - state['exp_avg_sq'] = state['exp_avg_sq'].type_as(p_data_fp32) - - exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] - beta1, beta2 = group['betas'] - - state['step'] += 1 - - exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) - exp_avg.mul_(beta1).add_(1 - beta1, grad) - - denom = exp_avg_sq.sqrt().add_(group['eps']) - bias_correction1 = 1 - beta1 ** state['step'] - bias_correction2 = 1 - beta2 ** state['step'] - - if group['warmup'] > state['step']: - scheduled_lr = 1e-8 + state['step'] * group['lr'] / group['warmup'] - else: - scheduled_lr = group['lr'] - - step_size = group['lr'] * math.sqrt(bias_correction2) / bias_correction1 - - if group['weight_decay'] != 0: - p_data_fp32.add_(-group['weight_decay'] * scheduled_lr, p_data_fp32) - - p_data_fp32.addcdiv_(-step_size, exp_avg, denom) - - p.data.copy_(p_data_fp32) - - return loss From dc69074a56cfe00ea4b5ce0d7eae9d050a03a122 Mon Sep 17 00:00:00 2001 From: Eren Golge Date: Fri, 30 Aug 2019 10:33:46 +0200 Subject: [PATCH 53/57] add RADAM reference --- utils/radam.py | 1 + 1 file changed, 1 insertion(+) diff --git a/utils/radam.py b/utils/radam.py index 57323541..62ecc695 100644 --- a/utils/radam.py +++ b/utils/radam.py @@ -3,6 +3,7 @@ import torch from torch.optim.optimizer import Optimizer +# adapted from https://github.com/LiyuanLucasLiu/RAdam class RAdam(Optimizer): def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0): From 8ff17dfab1aeab1949d08ee5a74615afd16bbe62 Mon Sep 17 00:00:00 2001 From: Eren Golge Date: Thu, 5 Sep 2019 12:54:45 +0200 Subject: [PATCH 54/57] setup.py update --- .compute | 4 +- notebooks/Benchmark.ipynb | 501 ++++++++++++++++++++++++++++++-------- setup.py | 3 + 3 files changed, 406 insertions(+), 102 deletions(-) diff --git a/.compute b/.compute index 2dbc7bb2..3e009cae 100644 --- a/.compute +++ b/.compute @@ -10,7 +10,7 @@ wget https://www.dropbox.com/s/wqn5v3wkktw9lmo/install.sh?dl=0 -O install.sh sudo sh install.sh python3 setup.py develop # cp -R ${USER_DIR}/GermanData ../tmp/ -python3 distribute.py --config_path config.json --data_path /data/ro/shared/data/keithito/LJSpeech-1.1/ +# python3 distribute.py --config_path config.json --data_path /data/ro/shared/data/keithito/LJSpeech-1.1/ # cp -R ${USER_DIR}/Mozilla_22050 ../tmp/ # python3 distribute.py --config_path config_tacotron_gst.json --data_path ../tmp/Mozilla_22050/ -# while true; do sleep 1000000; done +while true; do sleep 1000000; done diff --git a/notebooks/Benchmark.ipynb b/notebooks/Benchmark.ipynb index 81f81641..4de29af9 100644 --- a/notebooks/Benchmark.ipynb +++ b/notebooks/Benchmark.ipynb @@ -19,7 +19,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -29,11 +29,28 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": { "scrolled": true }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Populating the interactive namespace from numpy and matplotlib\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/erogol/miniconda3/lib/python3.7/site-packages/IPython/core/magics/pylab.py:160: UserWarning: pylab import has clobbered these variables: ['plt']\n", + "`%matplotlib` prevents importing * from pylab and numpy\n", + " \"\\n`%matplotlib` prevents importing * from pylab and numpy\"\n" + ] + } + ], "source": [ "%load_ext autoreload\n", "%autoreload 2\n", @@ -42,7 +59,6 @@ "import io\n", "import torch \n", "import time\n", - "import json\n", "import numpy as np\n", "from collections import OrderedDict\n", "from matplotlib import pylab as plt\n", @@ -70,23 +86,21 @@ "from IPython.display import Audio\n", "\n", "import os\n", - "os.environ['CUDA_VISIBLE_DEVICES']='1'" + "os.environ['CUDA_VISIBLE_DEVICES']='1'\n", + "os.environ['OMP_NUM_THREADS']='1'\n" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ - "def tts(model, text, CONFIG, use_cuda, ap, use_gl, figures=True):\n", + "def tts(model, text, CONFIG, use_cuda, ap, use_gl, speaker_id=None, figures=True):\n", " t_1 = time.time()\n", - " waveform, alignment, mel_spec, mel_postnet_spec, stop_tokens = synthesis(model, text, CONFIG, use_cuda, ap, speaker_id, False, CONFIG.enable_eos_bos_chars)\n", + " waveform, alignment, mel_spec, mel_postnet_spec, stop_tokens = synthesis(model, text, CONFIG, use_cuda, ap, truncated=False, speaker_id=speaker_id, enable_eos_bos_chars=CONFIG.enable_eos_bos_chars)\n", " if CONFIG.model == \"Tacotron\" and not use_gl:\n", - " # coorect the normalization differences b/w TTS and the Vocoder.\n", " mel_postnet_spec = ap.out_linear_to_mel(mel_postnet_spec.T).T\n", - " mel_postnet_spec = ap._denormalize(mel_postnet_spec)\n", - " mel_postnet_spec = ap_vocoder._normalize(mel_postnet_spec)\n", " if not use_gl:\n", " waveform = wavernn.generate(torch.FloatTensor(mel_postnet_spec.T).unsqueeze(0).cuda(), batched=batched_wavernn, target=11000, overlap=550)\n", "\n", @@ -103,18 +117,31 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "metadata": {}, - "outputs": [], + "outputs": [ + { + "ename": "FileNotFoundError", + "evalue": "[Errno 2] No such file or directory: '/media/erogol/data_ssd/Data/models/wavernn/mozilla/mozilla-May24-4763/config.json'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[0mVOCODER_MODEL_PATH\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\"/media/erogol/data_ssd/Data/models/wavernn/mozilla/mozilla-May24-4763/model_checkpoints/best_model.pth.tar\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 8\u001b[0m \u001b[0mVOCODER_CONFIG_PATH\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\"/media/erogol/data_ssd/Data/models/wavernn/mozilla/mozilla-May24-4763/config.json\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 9\u001b[0;31m \u001b[0mVOCODER_CONFIG\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mload_config\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mVOCODER_CONFIG_PATH\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 10\u001b[0m \u001b[0muse_cuda\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mFalse\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 11\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/projects/TTS/tts_namespace/TTS/utils/generic_utils.py\u001b[0m in \u001b[0;36mload_config\u001b[0;34m(config_path)\u001b[0m\n\u001b[1;32m 20\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mload_config\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mconfig_path\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 21\u001b[0m \u001b[0mconfig\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mAttrDict\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 22\u001b[0;31m \u001b[0;32mwith\u001b[0m \u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mconfig_path\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"r\"\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 23\u001b[0m \u001b[0minput_str\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 24\u001b[0m \u001b[0minput_str\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mre\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msub\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34mr'\\\\\\n'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m''\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minput_str\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: '/media/erogol/data_ssd/Data/models/wavernn/mozilla/mozilla-May24-4763/config.json'" + ] + } + ], "source": [ "# Set constants\n", "ROOT_PATH = '/media/erogol/data_ssd/Models/libri_tts/5049/'\n", - "MODEL_PATH = ROOT_PATH + '/best_model.pth.tar'\n", + "MODEL_PATH = ROOT_PATH + 'best_model.pth.tar'\n", "CONFIG_PATH = ROOT_PATH + '/config.json'\n", - "OUT_FOLDER = '/home/erogol/Dropbox/AudioSamples/benchmark_samples/'\n", + "OUT_FOLDER = \"/home/erogol/Dropbox/AudioSamples/benchmark_samples/\"\n", "CONFIG = load_config(CONFIG_PATH)\n", - "VOCODER_MODEL_PATH = \"/media/erogol/data_ssd/Models/wavernn/universal/4910/best_model_16K.pth.tar\"\n", - "VOCODER_CONFIG_PATH = \"/media/erogol/data_ssd/Models/wavernn/universal/4910/config_16K.json\"\n", + "VOCODER_MODEL_PATH = \"/media/erogol/data_ssd/Data/models/wavernn/mozilla/mozilla-May24-4763/model_checkpoints/best_model.pth.tar\"\n", + "VOCODER_CONFIG_PATH = \"/media/erogol/data_ssd/Data/models/wavernn/mozilla/mozilla-May24-4763/config.json\"\n", "VOCODER_CONFIG = load_config(VOCODER_CONFIG_PATH)\n", "use_cuda = False\n", "\n", @@ -122,8 +149,6 @@ "# CONFIG.windowing = False\n", "# CONFIG.prenet_dropout = False\n", "# CONFIG.separate_stopnet = True\n", - "# CONFIG.use_forward_attn = True\n", - "# CONFIG.forward_attn_mask = True\n", "# CONFIG.stopnet = True\n", "\n", "# Set the vocoder\n", @@ -138,19 +163,11 @@ "outputs": [], "source": [ "# LOAD TTS MODEL\n", - "from TTS.utils.text.symbols import symbols, phonemes\n", - "\n", - "# multi speaker \n", - "if CONFIG.use_speaker_embedding:\n", - " speakers = json.load(open(f\"{ROOT_PATH}/speakers.json\", 'r'))\n", - " speakers_idx_to_id = {v: k for k, v in speakers.items()}\n", - "else:\n", - " speakers = []\n", - " speaker_id = None\n", + "from utils.text.symbols import symbols, phonemes\n", "\n", "# load the model\n", "num_chars = len(phonemes) if CONFIG.use_phonemes else len(symbols)\n", - "model = setup_model(num_chars, len(speakers), CONFIG)\n", + "model = setup_model(num_chars, CONFIG)\n", "\n", "# load the audio processor\n", "ap = AudioProcessor(**CONFIG.audio) \n", @@ -167,12 +184,7 @@ "if use_cuda:\n", " model.cuda()\n", "model.eval()\n", - "print(cp['step'])\n", - "print(cp['r'])\n", - "\n", - "# set model stepsize \n", - "if 'r' in cp:\n", - " model.decoder.set_r(cp['r'])" + "print(cp['step'])" ] }, { @@ -184,28 +196,25 @@ "# LOAD WAVERNN\n", "if use_gl == False:\n", " from WaveRNN.models.wavernn import Model\n", - " from WaveRNN.utils.audio import AudioProcessor as AudioProcessorVocoder\n", " bits = 10\n", - " ap_vocoder = AudioProcessorVocoder(**VOCODER_CONFIG.audio) \n", + "\n", " wavernn = Model(\n", " rnn_dims=512,\n", " fc_dims=512,\n", - " mode=VOCODER_CONFIG.mode,\n", - " mulaw=VOCODER_CONFIG.mulaw,\n", - " pad=VOCODER_CONFIG.pad,\n", - " upsample_factors=VOCODER_CONFIG.upsample_factors,\n", + " mode=\"mold\",\n", + " pad=2,\n", + " upsample_factors=VOCODER_CONFIG.upsample_factors, # set this depending on dataset\n", " feat_dims=VOCODER_CONFIG.audio[\"num_mels\"],\n", " compute_dims=128,\n", " res_out_dims=128,\n", " res_blocks=10,\n", - " hop_length=ap_vocoder.hop_length,\n", - " sample_rate=ap_vocoder.sample_rate,\n", - " use_upsample_net = True,\n", - " use_aux_net = True\n", + " hop_length=ap.hop_length,\n", + " sample_rate=ap.sample_rate,\n", " ).cuda()\n", "\n", + "\n", " check = torch.load(VOCODER_MODEL_PATH)\n", - " wavernn.load_state_dict(check['model'], strict=False)\n", + " wavernn.load_state_dict(check['model'])\n", " if use_cuda:\n", " wavernn.cuda()\n", " wavernn.eval();\n", @@ -221,73 +230,111 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": {}, - "outputs": [], + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'model' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mmodel\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0meval\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0mmodel\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdecoder\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmax_decoder_steps\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m2000\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0mspeaker_id\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0msentence\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\"Bill got in the habit of asking himself “Is that thought true?” And if he wasn’t absolutely certain it was, he just let it go.\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0malign\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mspec\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstop_tokens\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mwav\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtts\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmodel\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msentence\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mCONFIG\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0muse_cuda\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0map\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mspeaker_id\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mspeaker_id\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0muse_gl\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0muse_gl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfigures\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mNameError\u001b[0m: name 'model' is not defined" + ] + } + ], "source": [ "model.eval()\n", "model.decoder.max_decoder_steps = 2000\n", - "speaker_id = 500\n", - "sentence = \"Bill got in the habit of asking himself “Is that thought true?” and if he wasn’t absolutely certain it was, he just let it go.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" + "speaker_id = 0\n", + "sentence = \"Bill got in the habit of asking himself “Is that thought true?” And if he wasn’t absolutely certain it was, he just let it go.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": { "scrolled": true }, - "outputs": [], + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'model' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0msentence\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\"Be a voice, not an echo.\"\u001b[0m \u001b[0;31m# 'echo' is not in training set.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0malign\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mspec\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstop_tokens\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mwav\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtts\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmodel\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msentence\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mCONFIG\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0muse_cuda\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0map\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mspeaker_id\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mspeaker_id\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0muse_gl\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0muse_gl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfigures\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;31mNameError\u001b[0m: name 'model' is not defined" + ] + } + ], "source": [ - "model.eval()\n", - "model.decoder.max_decoder_steps = 2000\n", - "sentence = \"Seine Fuerenden Berater hatten Donald Trump seit Wochen beschworen, berichteten US-Medien: Lassen Sie das mit den Zoellen bleiben.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" + "sentence = \"Be a voice, not an echo.\" # 'echo' is not in training set. \n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)" ] }, { "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "sentence = \"Der Klimawandel bedroht die Gletscher im Himalaya.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, + "execution_count": 7, "metadata": {}, - "outputs": [], + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'model' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0msentence\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\"The human voice is the most perfect instrument of all.\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0malign\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mspec\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstop_tokens\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mwav\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtts\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmodel\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msentence\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mCONFIG\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0muse_cuda\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0map\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mspeaker_id\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mspeaker_id\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0muse_gl\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0muse_gl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfigures\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;31mNameError\u001b[0m: name 'model' is not defined" + ] + } + ], "source": [ - "sentence = \"Zwei Unternehmen verlieren einem Medienbericht zufolge ihre Verträge als Maut-Inkasso-Manager.\" # 'echo' is not in training set. \n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" + "sentence = \"The human voice is the most perfect instrument of all.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "metadata": {}, - "outputs": [], + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'model' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0msentence\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\"I'm sorry Dave. I'm afraid I can't do that.\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0malign\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mspec\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstop_tokens\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mwav\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtts\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmodel\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msentence\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mCONFIG\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0muse_cuda\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0map\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mspeaker_id\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mspeaker_id\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0muse_gl\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0muse_gl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfigures\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;31mNameError\u001b[0m: name 'model' is not defined" + ] + } + ], "source": [ - "sentence = \"Eine Ausländermaut nach dem Geschmack der CSU wird es nicht geben - das bedauert außerhalb der Partei fast niemand.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" + "sentence = \"I'm sorry Dave. I'm afraid I can't do that.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { + "collapsed": true, + "jupyter": { + "outputs_hidden": true + }, "scrolled": true }, "outputs": [], "source": [ - "sentence = \"Angela Merkel ist als Klimakanzlerin gestartet.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" + "sentence = \"This cake is great. It's so delicious and moist.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)" ] }, { @@ -300,51 +347,76 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true, + "jupyter": { + "outputs_hidden": true + } + }, "outputs": [], "source": [ - "sentence = \"Dann vernachlässigte sie das Thema.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" + "sentence = \"Generative adversarial network or variational auto-encoder.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)" ] }, { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true, + "jupyter": { + "outputs_hidden": true + } + }, "outputs": [], "source": [ - "sentence = \"Nun, kurz vor dem Ende, will sie damit noch einmal neu anfangen.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" + "sentence = \"Scientists at the CERN laboratory say they have discovered a new particle.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)" ] }, { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true, + "jupyter": { + "outputs_hidden": true + } + }, "outputs": [], "source": [ - "sentence = \"Nun ist der Spieltempel pleite, und manchen Dorfbewohnern fehlt das Geld zum Essen.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" + "sentence = \"Here’s a way to measure the acute emotional intelligence that has never gone out of style.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)" ] }, { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true, + "jupyter": { + "outputs_hidden": true + } + }, "outputs": [], "source": [ - "sentence = \"Andrea Nahles will in der Fraktion die Vertrauensfrage stellen.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" + "sentence = \"President Trump met with other leaders at the Group of 20 conference.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)" ] }, { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true, + "jupyter": { + "outputs_hidden": true + } + }, "outputs": [], "source": [ - "sentence=\"Die Erfolge der Grünen bringen eine Reihe Unerfahrener in die Parlamente.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" + "sentence = \"The buses aren't the problem, they actually provide a solution.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)" ] }, { @@ -357,11 +429,136 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true, + "jupyter": { + "outputs_hidden": true + } + }, "outputs": [], "source": [ - "sentence=\"Die Luftfahrtbranche arbeitet daran, CO2-neutral zu werden.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" + "sentence = \"Generative adversarial network or variational auto-encoder.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "jupyter": { + "outputs_hidden": true + } + }, + "outputs": [], + "source": [ + "sentence = \"Basilar membrane and otolaryngology are not auto-correlations.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "jupyter": { + "outputs_hidden": true + } + }, + "outputs": [], + "source": [ + "sentence = \" He has read the whole thing.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "jupyter": { + "outputs_hidden": true + } + }, + "outputs": [], + "source": [ + "sentence = \"He reads books.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "jupyter": { + "outputs_hidden": true + } + }, + "outputs": [], + "source": [ + "sentence = \"Thisss isrealy awhsome.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "jupyter": { + "outputs_hidden": true + } + }, + "outputs": [], + "source": [ + "sentence = \"This is your internet browser, Firefox.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "jupyter": { + "outputs_hidden": true + } + }, + "outputs": [], + "source": [ + "sentence = \"This is your internet browser Firefox.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "jupyter": { + "outputs_hidden": true + } + }, + "outputs": [], + "source": [ + "sentence = \"The quick brown fox jumps over the lazy dog.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "jupyter": { + "outputs_hidden": true + } + }, + "outputs": [], + "source": [ + "sentence = \"Does the quick brown fox jump over the lazy dog?\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)" ] }, { @@ -370,14 +567,118 @@ "metadata": {}, "outputs": [], "source": [ - "sentence=\"Michael Kretschmer versucht seit Monaten, die Bürger zu umgarnen.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" + "sentence = \"Eren, how are you?\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Hard Sentences" ] }, { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true, + "jupyter": { + "outputs_hidden": true + } + }, + "outputs": [], + "source": [ + "sentence = \"Encouraged, he started with a minute a day.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "jupyter": { + "outputs_hidden": true + } + }, + "outputs": [], + "source": [ + "sentence = \"His meditation consisted of “body scanning” which involved focusing his mind and energy on each section of the body from head to toe .\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "jupyter": { + "outputs_hidden": true + } + }, + "outputs": [], + "source": [ + "sentence = \"Recent research at Harvard has shown meditating for as little as 8 weeks can actually increase the grey matter in the parts of the brain responsible for emotional regulation and learning . \"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "jupyter": { + "outputs_hidden": true + } + }, + "outputs": [], + "source": [ + "sentence = \"If he decided to watch TV he really watched it.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "jupyter": { + "outputs_hidden": true + }, + "scrolled": true + }, + "outputs": [], + "source": [ + "sentence = \"Often we try to bring about change through sheer effort and we put all of our energy into a new initiative .\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "jupyter": { + "outputs_hidden": true + } + }, + "outputs": [], + "source": [ + "# for twb dataset\n", + "sentence = \"In our preparation for Easter, God in his providence offers us each year the season of Lent as a sacramental sign of our conversion.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "jupyter": { + "outputs_hidden": true + } + }, "outputs": [], "source": [ "# !zip benchmark_samples/samples.zip benchmark_samples/*" diff --git a/setup.py b/setup.py index a8e52739..f6916741 100644 --- a/setup.py +++ b/setup.py @@ -90,4 +90,7 @@ setup( "soundfile", "phonemizer @ https://github.com/bootphon/phonemizer/tarball/master", ], + dependency_links=[ + "http://github.com/bootphon/phonemizer/tarball/master#egg=phonemizer-1.0.1" + ] ) From 0bb8d780e8e5c1bb12b15073af20190f2d7c029c Mon Sep 17 00:00:00 2001 From: Eren Golge Date: Thu, 5 Sep 2019 16:48:36 +0200 Subject: [PATCH 55/57] visual.py update --- utils/visual.py | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/utils/visual.py b/utils/visual.py index 1ee87cfb..825caf52 100644 --- a/utils/visual.py +++ b/utils/visual.py @@ -1,3 +1,4 @@ +import torch import librosa import matplotlib matplotlib.use('Agg') @@ -5,10 +6,14 @@ import matplotlib.pyplot as plt from TTS.utils.text import phoneme_to_sequence, sequence_to_phoneme -def plot_alignment(alignment, info=None): - fig, ax = plt.subplots(figsize=(16, 10)) +def plot_alignment(alignment, info=None, fig_size=(16, 10), title=None): + if isinstance(alignment, torch.Tensor): + alignment_ = alignment.detach().cpu().numpy().squeeze() + else: + alignment_ = alignment + fig, ax = plt.subplots(figsize=fig_size) im = ax.imshow( - alignment.T, aspect='auto', origin='lower', interpolation='none') + alignment_.T, aspect='auto', origin='lower', interpolation='none') fig.colorbar(im, ax=ax) xlabel = 'Decoder timestep' if info is not None: @@ -17,12 +22,18 @@ def plot_alignment(alignment, info=None): plt.ylabel('Encoder timestep') # plt.yticks(range(len(text)), list(text)) plt.tight_layout() + if title is not None: + plt.title(title) return fig -def plot_spectrogram(linear_output, audio): - spectrogram = audio._denormalize(linear_output) - fig = plt.figure(figsize=(16, 10)) +def plot_spectrogram(linear_output, audio, fig_size=(16, 10)): + if isinstance(linear_output, torch.Tensor): + linear_output_ = linear_output.detach().cpu().numpy().squeeze() + else: + linear_output_ = linear_output + spectrogram = audio._denormalize(linear_output_) + fig = plt.figure(figsize=fig_size) plt.imshow(spectrogram.T, aspect="auto", origin="lower") plt.colorbar() plt.tight_layout() From 16d1f62afa6031e652abf51ffaffd4f5042a729a Mon Sep 17 00:00:00 2001 From: Eren Golge Date: Thu, 5 Sep 2019 16:57:24 +0200 Subject: [PATCH 56/57] remove files --- config_kusal.json | 41 ---------------------- config_libritts.json | 82 -------------------------------------------- server/__init__.py | 0 3 files changed, 123 deletions(-) delete mode 100644 config_kusal.json delete mode 100644 config_libritts.json create mode 100644 server/__init__.py diff --git a/config_kusal.json b/config_kusal.json deleted file mode 100644 index 696171f0..00000000 --- a/config_kusal.json +++ /dev/null @@ -1,41 +0,0 @@ -{ - "model_name": "TTS-larger-kusal", - "audio_processor": "audio", - "num_mels": 80, - "num_freq": 1025, - "sample_rate": 22000, - "frame_length_ms": 50, - "frame_shift_ms": 12.5, - "preemphasis": 0.97, - "min_mel_freq": 125, - "max_mel_freq": 7600, - "min_level_db": -100, - "ref_level_db": 20, - "embedding_size": 256, - "text_cleaner": "english_cleaners", - - "epochs": 1000, - "lr": 0.002, - "lr_decay": 0.5, - "decay_step": 100000, - "warmup_steps": 4000, - "batch_size": 32, - "eval_batch_size":-1, - "r": 5, - - "griffin_lim_iters": 60, - "power": 1.5, - - "num_loader_workers": 8, - - "checkpoint": true, - "save_step": 25000, - "print_step": 10, - "run_eval": false, - "data_path": "/snakepit/shared/data/mycroft/kusal/", - "meta_file_train": "prompts.txt", - "meta_file_val": null, - "dataset": "Kusal", - "min_seq_len": 0, - "output_path": "../keep/" -} \ No newline at end of file diff --git a/config_libritts.json b/config_libritts.json deleted file mode 100644 index 658b9835..00000000 --- a/config_libritts.json +++ /dev/null @@ -1,82 +0,0 @@ -{ - "run_name": "libritts-360", - "run_description": "LibriTTS 360 gradual traning with memory queue.", - - "audio":{ - // Audio processing parameters - "num_mels": 80, // size of the mel spec frame. - "num_freq": 1025, // number of stft frequency levels. Size of the linear spectogram frame. - "sample_rate": 16000, // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled. - "frame_length_ms": 50, // stft window length in ms. - "frame_shift_ms": 12.5, // stft window hop-lengh in ms. - "preemphasis": 0.98, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis. - "min_level_db": -100, // normalization range - "ref_level_db": 20, // reference level db, theoretically 20db is the sound of air. - "power": 1.5, // value to sharpen wav signals after GL algorithm. - "griffin_lim_iters": 60,// #griffin-lim iterations. 30-60 is a good range. Larger the value, slower the generation. - // Normalization parameters - "signal_norm": true, // normalize the spec values in range [0, 1] - "symmetric_norm": false, // move normalization to range [-1, 1] - "max_norm": 1, // scale normalization to range [-max_norm, max_norm] or [0, max_norm] - "clip_norm": true, // clip normalized values into the range. - "mel_fmin": 0.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!! - "mel_fmax": 8000.0, // maximum freq level for mel-spec. Tune for dataset!! - "do_trim_silence": true // enable trimming of slience of audio as you load it. LJspeech (false), TWEB (false), Nancy (true) - }, - - "distributed":{ - "backend": "nccl", - "url": "tcp:\/\/localhost:54321" - }, - - "reinit_layers": [], - - "model": "Tacotron", // one of the model in models/ - "grad_clip": 1, // upper limit for gradients for clipping. - "epochs": 1000, // total number of epochs to train. - "lr": 0.0001, // Initial learning rate. If Noam decay is active, maximum learning rate. - "lr_decay": false, // if true, Noam learning rate decaying is applied through training. - "warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr" - "memory_size": 7, // ONLY TACOTRON - memory queue size used to queue network predictions to feed autoregressive connection. Useful if r < 5. - "attention_norm": "sigmoid", // softmax or sigmoid. Suggested to use softmax for Tacotron2 and sigmoid for Tacotron. - "prenet_type": "original", // "original" or "bn". - "prenet_dropout": true, // enable/disable dropout at prenet. - "windowing": false, // Enables attention windowing. Used only in eval mode. - "use_forward_attn": false, // enable/disable forward attention. In general, it aligns faster. - "forward_attn_mask": false, - "transition_agent": false, // enable/disable transition agent of forward attention. - "location_attn": true, // enable_disable location sensitive attention. - "loss_masking": true, // enable / disable loss masking against the sequence padding. - "enable_eos_bos_chars": false, // enable/disable beginning of sentence and end of sentence chars. - "stopnet": true, // Train stopnet predicting the end of synthesis. - "separate_stopnet": true, // Train stopnet seperately if 'stopnet==true'. It prevents stopnet loss to influence the rest of the model. It causes a better model, but it trains SLOWER. - "tb_model_param_stats": true, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging. - - "batch_size": 32, // Batch size for training. Lower values than 32 might cause hard to learn attention. - "eval_batch_size":16, - "r": 7, // Number of frames to predict for step. - "wd": 0.000001, // Weight decay weight. - "checkpoint": true, // If true, it saves checkpoints per "save_step" - "save_step": 1000, // Number of training steps expected to save traning stats and checkpoints. - "print_step": 10, // Number of steps to log traning on console. - "batch_group_size": 0, //Number of batches to shuffle after bucketing. - - "run_eval": true, - "test_delay_epochs": 5, //Until attention is aligned, testing only wastes computation time. - "test_sentences_file": null, // set a file to load sentences to be used for testing. If it is null then we use default english sentences. - "data_path": "/home/erogol/Data/Libri-TTS/train-clean-360/", // DATASET-RELATED: can overwritten from command argument - "meta_file_train": null, // DATASET-RELATED: metafile for training dataloader. - "meta_file_val": null, // DATASET-RELATED: metafile for evaluation dataloader. - "dataset": "libri_tts", // DATASET-RELATED: one of TTS.dataset.preprocessors depending on your target dataset. Use "tts_cache" for pre-computed dataset by extract_features.py - "min_seq_len": 6, // DATASET-RELATED: minimum text length to use in training - "max_seq_len": 150, // DATASET-RELATED: maximum text length - "output_path": "/media/erogol/data_ssd/Models/libri_tts/", // DATASET-RELATED: output path for all training outputs. - "num_loader_workers": 12, // number of training data loader processes. Don't set it too big. 4-8 are good values. - "num_val_loader_workers": 4, // number of evaluation data loader processes. - "phoneme_cache_path": "mozilla_us_phonemes", // phoneme computation is slow, therefore, it caches results in the given folder. - "use_phonemes": true, // use phonemes instead of raw characters. It is suggested for better pronounciation. - "phoneme_language": "en-us", // depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages - "text_cleaner": "phoneme_cleaners", - "use_speaker_embedding": true -} - diff --git a/server/__init__.py b/server/__init__.py new file mode 100644 index 00000000..e69de29b From d1828c957321d03a6eafa023d0be974448995597 Mon Sep 17 00:00:00 2001 From: Eren Golge Date: Tue, 10 Sep 2019 12:09:58 +0200 Subject: [PATCH 57/57] fix server tests and pylint --- tests/test_demo_server.py | 1 + utils/visual.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/test_demo_server.py b/tests/test_demo_server.py index 80c774e8..5eb3c01c 100644 --- a/tests/test_demo_server.py +++ b/tests/test_demo_server.py @@ -20,5 +20,6 @@ class DemoServerTest(unittest.TestCase): def test_in_out(self): self._create_random_model() config = load_config(os.path.join(get_tests_input_path(), 'server_config.json')) + config['tts_path'] = get_tests_output_path() synthesizer = Synthesizer(config) synthesizer.tts("Better this test works!!") diff --git a/utils/visual.py b/utils/visual.py index 825caf52..ab513666 100644 --- a/utils/visual.py +++ b/utils/visual.py @@ -32,7 +32,7 @@ def plot_spectrogram(linear_output, audio, fig_size=(16, 10)): linear_output_ = linear_output.detach().cpu().numpy().squeeze() else: linear_output_ = linear_output - spectrogram = audio._denormalize(linear_output_) + spectrogram = audio._denormalize(linear_output_) # pylint: disable=protected-access fig = plt.figure(figsize=fig_size) plt.imshow(spectrogram.T, aspect="auto", origin="lower") plt.colorbar()