From 57cbb3ab0c3792e055d55b1de47a9145ad793ba7 Mon Sep 17 00:00:00 2001 From: erogol Date: Mon, 9 Mar 2020 21:02:44 +0100 Subject: [PATCH 001/104] config update for guided attention and normalization --- config.json | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/config.json b/config.json index a113836f..96b6576a 100644 --- a/config.json +++ b/config.json @@ -7,11 +7,11 @@ "audio":{ // Audio processing parameters "num_mels": 80, // size of the mel spec frame. - "num_freq": 1025, // number of stft frequency levels. Size of the linear spectogram frame. + "num_freq": 513, // number of stft frequency levels. Size of the linear spectogram frame. "sample_rate": 22050, // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled. "win_length": 1024, // stft window length in ms. "hop_length": 256, // stft window hop-lengh in ms. - "preemphasis": 0.98, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis. + "preemphasis": 0.0, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis. "min_level_db": -100, // normalization range "ref_level_db": 20, // reference level db, theoretically 20db is the sound of air. "power": 1.5, // value to sharpen wav signals after GL algorithm. @@ -19,7 +19,7 @@ // Normalization parameters "signal_norm": true, // normalize the spec values in range [0, 1] "symmetric_norm": true, // move normalization to range [-1, 1] - "max_norm": 4.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm] + "max_norm": 1.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm] "clip_norm": true, // clip normalized values into the range. "mel_fmin": 0.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!! "mel_fmax": 8000.0, // maximum freq level for mel-spec. Tune for dataset!! @@ -41,6 +41,7 @@ "r": 7, // Number of decoder frames to predict per iteration. Set the initial values if gradual training is enabled. "gradual_training": [[0, 7, 64], [1, 5, 64], [50000, 3, 32], [130000, 2, 32], [290000, 1, 32]], //set gradual training steps [first_step, r, batch_size]. If it is null, gradual training is disabled. For Tacotron, you might need to reduce the 'batch_size' as you proceeed. "loss_masking": true, // enable / disable loss masking against the sequence padding. + "ga_alpha": 10.0, // weight for guided attention loss. If > 0, guided attention is enabled. // VALIDATION "run_eval": true, @@ -69,7 +70,7 @@ "use_forward_attn": false, // if it uses forward attention. In general, it aligns faster. "forward_attn_mask": false, // Additional masking forcing monotonicity only in eval mode. "transition_agent": false, // enable/disable transition agent of forward attention. - "location_attn": false, // enable_disable location sensitive attention. It is enabled for TACOTRON by default. + "location_attn": true, // enable_disable location sensitive attention. It is enabled for TACOTRON by default. "bidirectional_decoder": false, // use https://arxiv.org/abs/1907.09006. Use it, if attention does not work well with your dataset. // STOPNET @@ -89,13 +90,13 @@ "num_val_loader_workers": 4, // number of evaluation data loader processes. "batch_group_size": 0, //Number of batches to shuffle after bucketing. 
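    // Two notes on the values changed in this patch: "num_freq": 513 implies an FFT
    // size of n_fft = (num_freq - 1) * 2 = 1024, matching "win_length": 1024; and
    // "ga_alpha": 10.0 weights the guided attention loss introduced in PATCH 002 below,
    // whose soft-diagonal mask is W[t, n] = 1 - exp(-(n/N - t/T)^2 / (2 * sigma^2))
    // with sigma = 0.4 by default, added to the total loss as loss += ga_alpha * ga_loss.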
"min_seq_len": 6, // DATASET-RELATED: minimum text length to use in training - "max_seq_len": 150, // DATASET-RELATED: maximum text length + "max_seq_len": 153, // DATASET-RELATED: maximum text length // PATHS "output_path": "/home/erogol/Models/", // PHONEMES - "phoneme_cache_path": "mozilla_us_phonemes_2_1", // phoneme computation is slow, therefore, it caches results in the given folder. + "phoneme_cache_path": "mozilla_us_phonemes_3", // phoneme computation is slow, therefore, it caches results in the given folder. "use_phonemes": true, // use phonemes instead of raw characters. It is suggested for better pronounciation. "phoneme_language": "en-us", // depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages From d83b58e35d0018a6acef3ad739af69986bb1c9d9 Mon Sep 17 00:00:00 2001 From: erogol Date: Mon, 9 Mar 2020 21:03:18 +0100 Subject: [PATCH 002/104] TTS Loss aggregated all loss functuons --- layers/losses.py | 109 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 109 insertions(+) diff --git a/layers/losses.py b/layers/losses.py index 7e5671b2..2cbe6f2a 100644 --- a/layers/losses.py +++ b/layers/losses.py @@ -125,3 +125,112 @@ class BCELossMasked(nn.Module): x * mask, target * mask, pos_weight=self.pos_weight, reduction='sum') loss = loss / mask.sum() return loss + + +class GuidedAttentionLoss(torch.nn.Module): + def __init__(self, sigma=0.4): + super(GuidedAttentionLoss, self).__init__() + self.sigma = sigma + + def _make_ga_masks(self, ilens, olens): + B = len(ilens) + max_ilen = max(ilens) + max_olen = max(olens) + ga_masks = torch.zeros((B, max_olen, max_ilen)) + for idx, (ilen, olen) in enumerate(zip(ilens, olens)): + ga_masks[idx, :olen, :ilen] = self._make_ga_mask(ilen, olen, self.sigma) + return ga_masks + + def forward(self, att_ws, ilens, olens): + ga_masks = self._make_ga_masks(ilens, olens).to(att_ws.device) + seq_masks = self._make_masks(ilens, olens).to(att_ws.device) + losses = ga_masks * att_ws + loss = torch.mean(losses.masked_select(seq_masks)) + return loss + + @staticmethod + def _make_ga_mask(ilen, olen, sigma): + grid_x, grid_y = torch.meshgrid(torch.arange(olen), torch.arange(ilen)) + grid_x, grid_y = grid_x.float(), grid_y.float() + return 1.0 - torch.exp(-(grid_y / ilen - grid_x / olen) ** 2 / (2 * (sigma ** 2))) + + @staticmethod + def _make_masks(ilens, olens): + in_masks = sequence_mask(ilens) + out_masks = sequence_mask(olens) + return out_masks.unsqueeze(-1) & in_masks.unsqueeze(-2) + + +class TacotronLoss(torch.nn.Module): + def __init__(self, c, stopnet_pos_weight=10, ga_sigma=0.4): + super(TacotronLoss, self).__init__() + self.stopnet_pos_weight = stopnet_pos_weight + self.ga_alpha = c.ga_alpha + self.config = c + # postnet decoder loss + if c.loss_masking: + self.criterion = L1LossMasked(c.seq_len_norm) if c.model in [ + "Tacotron" + ] else MSELossMasked(c.seq_len_norm) + else: + self.criterion = nn.L1Loss() if c.model in ["Tacotron" + ] else nn.MSELoss() + # guided attention loss + if c.ga_alpha > 0: + self.criterion_ga = GuidedAttentionLoss(sigma=ga_sigma) + # stopnet loss + self.criterion_st = BCELossMasked(pos_weight=torch.tensor(stopnet_pos_weight)) if c.stopnet else None + + def forward(self, postnet_output, decoder_output, mel_input, linear_input, + stopnet_output, stopnet_target, output_lens, decoder_b_output, + alignments, alignment_lens, input_lens): + + return_dict = {} + # decoder and postnet losses + if self.config.loss_masking: + decoder_loss = self.criterion(decoder_output, 
mel_input, + output_lens) + if self.config.model in ["Tacotron", "TacotronGST"]: + postnet_loss = self.criterion(postnet_output, linear_input, + output_lens) + else: + postnet_loss = self.criterion(postnet_output, mel_input, + output_lens) + else: + decoder_loss = self.criterion(decoder_output, mel_input) + if self.config.model in ["Tacotron", "TacotronGST"]: + postnet_loss = self.criterion(postnet_output, linear_input) + else: + postnet_loss = self.criterion(postnet_output, mel_input) + loss = decoder_loss + postnet_loss + return_dict['decoder_loss'] = decoder_loss + return_dict['postnet_loss'] = postnet_loss + + # stopnet loss + stop_loss = self.criterion_st( + stopnet_output, stopnet_target, + output_lens) if self.config.stopnet else torch.zeros(1) + if not self.config.separate_stopnet and self.config.stopnet: + loss += stop_loss + return_dict['stopnet_loss'] = stop_loss + + # backward decoder loss (if enabled) + if self.config.bidirectional_decoder: + if self.config.loss_masking: + decoder_b_loss = self.criterion(torch.flip(decoder_b_output, dims=(1, )), mel_input, output_lens) + else: + decoder_b_loss = self.criterion(torch.flip(decoder_b_output, dims=(1, )), mel_input) + decoder_c_loss = torch.nn.functional.l1_loss(torch.flip(decoder_b_output, dims=(1, )), decoder_output) + loss += decoder_b_loss + decoder_c_loss + return_dict['decoder_b_loss'] = decoder_b_loss + return_dict['decoder_c_loss'] = decoder_c_loss + + # guided attention loss (if enabled) + if self.config.ga_alpha > 0: + ga_loss = self.criterion_ga(alignments, input_lens, alignment_lens) + loss += ga_loss * self.ga_alpha + return_dict['ga_loss'] = ga_loss * self.ga_alpha + + return_dict['loss'] = loss + return return_dict + From 796a59d0ccdbea791e2a72040904870ee81c7085 Mon Sep 17 00:00:00 2001 From: erogol Date: Mon, 9 Mar 2020 21:03:40 +0100 Subject: [PATCH 003/104] Rename in-mel-spec --- tests/test_audio.py | 2 +- train.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_audio.py b/tests/test_audio.py index fc5deb48..7f884d37 100644 --- a/tests/test_audio.py +++ b/tests/test_audio.py @@ -33,7 +33,7 @@ class TestAudio(unittest.TestCase): self.ap.clip_norm = clip_norm wav = self.ap.load_wav(WAV_FILE) mel = self.ap.melspectrogram(wav) - wav_ = self.ap.inv_mel_spectrogram(mel) + wav_ = self.ap.inv_melspectrogram(mel) file_name = "/audio_test-melspec_max_norm_{}-signal_norm_{}-symmetric_{}-clip_norm_{}.wav"\ .format(max_norm, signal_norm, symmetric_norm, clip_norm) print(" | > Creating wav file at : ", file_name) diff --git a/train.py b/train.py index 1397b310..f0e3b68a 100644 --- a/train.py +++ b/train.py @@ -13,7 +13,7 @@ from torch.utils.data import DataLoader from TTS.datasets.TTSDataset import MyDataset from distribute import (DistributedSampler, apply_gradient_allreduce, init_distributed, reduce_tensor) -from TTS.layers.losses import L1LossMasked, MSELossMasked, BCELossMasked +from TTS.layers.losses import TacotronLoss from TTS.utils.audio import AudioProcessor from TTS.utils.generic_utils import ( NoamLR, check_update, count_parameters, create_experiment_folder, From 032bf312c6ee5c443061d4e4453d5e48e936c958 Mon Sep 17 00:00:00 2001 From: erogol Date: Mon, 9 Mar 2020 21:04:13 +0100 Subject: [PATCH 004/104] update train.py for guided attention --- train.py | 195 +++++++++++++++++++++---------------------------- utils/audio.py | 2 +- 2 files changed, 86 insertions(+), 111 deletions(-) diff --git a/train.py b/train.py index f0e3b68a..cf073956 100644 --- a/train.py +++ b/train.py @@ -114,7 
+114,7 @@ def format_data(data): return text_input, text_lengths, mel_input, mel_lengths, linear_input, stop_targets, speaker_ids, avg_text_length, avg_spec_length -def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler, +def train(model, criterion, optimizer, optimizer_st, scheduler, ap, global_step, epoch): data_loader = setup_loader(ap, model.decoder.r, is_val=False, verbose=(epoch == 0)) @@ -132,6 +132,8 @@ def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler, if c.bidirectional_decoder: train_values['avg_decoder_b_loss'] = 0 # decoder backward loss train_values['avg_decoder_c_loss'] = 0 # decoder consistency loss + if c.ga_alpha > 0: + train_values['avg_ga_loss'] = 0 # guidede attention loss keep_avg = KeepAverage() keep_avg.add_values(train_values) print("\n > Epoch {}/{}".format(epoch, c.epochs), flush=True) @@ -164,39 +166,27 @@ def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler, else: decoder_output, postnet_output, alignments, stop_tokens = model( text_input, text_lengths, mel_input, speaker_ids=speaker_ids) + decoder_backward_output = None - # loss computation - stop_loss = criterion_st(stop_tokens, - stop_targets, mel_lengths) if c.stopnet else torch.zeros(1) - if c.loss_masking: - decoder_loss = criterion(decoder_output, mel_input, mel_lengths) - if c.model in ["Tacotron", "TacotronGST"]: - postnet_loss = criterion(postnet_output, linear_input, - mel_lengths) - else: - postnet_loss = criterion(postnet_output, mel_input, - mel_lengths) + # set the alignment lengths wrt reduction factor for guided attention + if mel_lengths.max() % model.decoder.r != 0: + alignment_lengths = (mel_lengths + (model.decoder.r - (mel_lengths.max() % model.decoder.r))) // model.decoder.r else: - decoder_loss = criterion(decoder_output, mel_input) - if c.model in ["Tacotron", "TacotronGST"]: - postnet_loss = criterion(postnet_output, linear_input) - else: - postnet_loss = criterion(postnet_output, mel_input) - loss = decoder_loss + postnet_loss - if not c.separate_stopnet and c.stopnet: - loss += stop_loss + alignment_lengths = mel_lengths // model.decoder.r - # backward decoder + # compute loss + loss_dict = criterion(postnet_output, decoder_output, mel_input, + linear_input, stop_tokens, stop_targets, + mel_lengths, decoder_backward_output, + alignments, alignment_lengths, text_lengths) if c.bidirectional_decoder: - if c.loss_masking: - decoder_backward_loss = criterion(torch.flip(decoder_backward_output, dims=(1, )), mel_input, mel_lengths) - else: - decoder_backward_loss = criterion(torch.flip(decoder_backward_output, dims=(1, )), mel_input) - decoder_c_loss = torch.nn.functional.l1_loss(torch.flip(decoder_backward_output, dims=(1, )), decoder_output) - loss += decoder_backward_loss + decoder_c_loss - keep_avg.update_values({'avg_decoder_b_loss': decoder_backward_loss.item(), 'avg_decoder_c_loss': decoder_c_loss.item()}) + keep_avg.update_values({'avg_decoder_b_loss': loss_dict['decoder_backward_loss'].item(), + 'avg_decoder_c_loss': loss_dict['decoder_c_loss'].item()}) + if c.ga_alpha > 0: + keep_avg.update_values({'avg_ga_loss': loss_dict['ga_loss'].item()}) - loss.backward() + # backward pass + loss_dict['loss'].backward() optimizer, current_lr = adam_weight_decay(optimizer) grad_norm, grad_flag = check_update(model, c.grad_clip, ignore_stopnet=True) optimizer.step() @@ -207,7 +197,7 @@ def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler, # backpass and check the grad norm for stop loss if 
c.separate_stopnet: - stop_loss.backward() + loss_dict['stopnet_loss'].backward() optimizer_st, _ = adam_weight_decay(optimizer_st) grad_norm_st, _ = check_update(model.decoder.stopnet, 1.0) optimizer_st.step() @@ -220,36 +210,31 @@ def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler, if global_step % c.print_step == 0: print( " | > Step:{}/{} GlobalStep:{} PostnetLoss:{:.5f} " - "DecoderLoss:{:.5f} StopLoss:{:.5f} AlignScore:{:.4f} GradNorm:{:.5f} " + "DecoderLoss:{:.5f} StopLoss:{:.5f} GALoss:{:.5f} GradNorm:{:.5f} " "GradNormST:{:.5f} AvgTextLen:{:.1f} AvgSpecLen:{:.1f} StepTime:{:.2f} " "LoaderTime:{:.2f} LR:{:.6f}".format( - num_iter, batch_n_iter, global_step, postnet_loss.item(), - decoder_loss.item(), stop_loss.item(), align_score, - grad_norm, grad_norm_st, avg_text_length, avg_spec_length, - step_time, loader_time, current_lr), + num_iter, batch_n_iter, global_step, loss_dict['postnet_loss'].item(), + loss_dict['decoder_loss'].item(), loss_dict['stopnet_loss'].item(), + loss_dict['ga_loss'].item(), grad_norm, grad_norm_st, avg_text_length, + avg_spec_length, step_time, loader_time, current_lr), flush=True) # aggregate losses from processes if num_gpus > 1: - postnet_loss = reduce_tensor(postnet_loss.data, num_gpus) - decoder_loss = reduce_tensor(decoder_loss.data, num_gpus) - loss = reduce_tensor(loss.data, num_gpus) - stop_loss = reduce_tensor(stop_loss.data, - num_gpus) if c.stopnet else stop_loss + loss_dict['postnet_loss'] = reduce_tensor(loss_dict['postnet_loss'].data, num_gpus) + loss_dict['decoder_loss'] = reduce_tensor(loss_dict['decoder_loss'].data, num_gpus) + loss_dict['loss'] = reduce_tensor(loss_dict['loss'] .data, num_gpus) + loss_dict['stopnet_loss'] = reduce_tensor(loss_dict['stopnet_loss'].data, + num_gpus) if c.stopnet else loss_dict['stopnet_loss'] if args.rank == 0: update_train_values = { - 'avg_postnet_loss': - float(postnet_loss.item()), - 'avg_decoder_loss': - float(decoder_loss.item()), - 'avg_stop_loss': - stop_loss - if isinstance(stop_loss, float) else float(stop_loss.item()), - 'avg_step_time': - step_time, - 'avg_loader_time': - loader_time + 'avg_postnet_loss': float(loss_dict['postnet_loss'].item()), + 'avg_decoder_loss': float(loss_dict['decoder_loss'].item()), + 'avg_stop_loss': loss_dict['stopnet_loss'].item() + if isinstance(loss_dict['stopnet_loss'], float) else float(loss_dict['stopnet_loss'].item()), + 'avg_step_time': step_time, + 'avg_loader_time': loader_time } keep_avg.update_values(update_train_values) @@ -257,8 +242,8 @@ def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler, # reduce TB load if global_step % 10 == 0: iter_stats = { - "loss_posnet": postnet_loss.item(), - "loss_decoder": decoder_loss.item(), + "loss_posnet": loss_dict['postnet_loss'].item(), + "loss_decoder": loss_dict['decoder_loss'].item(), "lr": current_lr, "grad_norm": grad_norm, "grad_norm_st": grad_norm_st, @@ -270,7 +255,7 @@ def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler, if c.checkpoint: # save model save_checkpoint(model, optimizer, optimizer_st, - postnet_loss.item(), OUT_PATH, global_step, + loss_dict['postnet_loss'].item(), OUT_PATH, global_step, epoch) # Diagnostic visualizations @@ -295,7 +280,7 @@ def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler, if c.model in ["Tacotron", "TacotronGST"]: train_audio = ap.inv_spectrogram(const_spec.T) else: - train_audio = ap.inv_mel_spectrogram(const_spec.T) + train_audio = ap.inv_melspectrogram(const_spec.T) 
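                # inv_melspectrogram (renamed from inv_mel_spectrogram in PATCH 003)
                # denormalizes the spec, converts dB back to amplitude, maps mel to
                # linear with the pseudo-inverse mel basis (inv_mel_basis below stands
                # for np.linalg.pinv of the mel filterbank, as built in PATCH 005), then
                # reconstructs phase with Griffin-Lim. A rough standalone equivalent
                # using librosa, given an un-normalized dB mel `mel_db` and this
                # config's audio values:
                #   S = librosa.db_to_amplitude(mel_db + 20)           # undo ref_level_db
                #   lin = np.maximum(1e-10, np.dot(inv_mel_basis, S))  # mel -> linear
                #   wav = librosa.griffinlim(lin ** 1.5, n_iter=60,    # power / GL iters
                #                            hop_length=256, win_length=1024)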
tb_logger.tb_train_audios(global_step, {'TrainAudio': train_audio}, c.audio["sample_rate"]) @@ -304,11 +289,11 @@ def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler, # print epoch stats print(" | > EPOCH END -- GlobalStep:{} " "AvgPostnetLoss:{:.5f} AvgDecoderLoss:{:.5f} " - "AvgStopLoss:{:.5f} AvgAlignScore:{:3f} EpochTime:{:.2f} " + "AvgStopLoss:{:.5f} AvgGALoss:{:3f} EpochTime:{:.2f} " "AvgStepTime:{:.2f} AvgLoaderTime:{:.2f}".format( global_step, keep_avg['avg_postnet_loss'], keep_avg['avg_decoder_loss'], keep_avg['avg_stop_loss'], - keep_avg['avg_align_score'], epoch_time, + keep_avg['avg_ga_loss'], epoch_time, keep_avg['avg_step_time'], keep_avg['avg_loader_time']), flush=True) # Plot Epoch Stats @@ -321,6 +306,8 @@ def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler, "alignment_score": keep_avg['avg_align_score'], "epoch_time": epoch_time } + if c.ga_alpha > 0: + epoch_stats['guided_attention_loss'] = keep_avg['avg_ga_loss'] tb_logger.tb_train_epoch_stats(global_step, epoch_stats) if c.tb_model_param_stats: tb_logger.tb_model_weights(model, global_step) @@ -328,7 +315,7 @@ def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler, @torch.no_grad() -def evaluate(model, criterion, criterion_st, ap, global_step, epoch): +def evaluate(model, criterion, ap, global_step, epoch): data_loader = setup_loader(ap, model.decoder.r, is_val=True) if c.use_speaker_embedding: speaker_mapping = load_speaker_mapping(OUT_PATH) @@ -343,6 +330,8 @@ def evaluate(model, criterion, criterion_st, ap, global_step, epoch): if c.bidirectional_decoder: eval_values_dict['avg_decoder_b_loss'] = 0 # decoder backward loss eval_values_dict['avg_decoder_c_loss'] = 0 # decoder consistency loss + if c.ga_alpha > 0: + eval_values_dict['avg_ga_loss'] = 0 # guidede attention loss keep_avg = KeepAverage() keep_avg.add_values(eval_values_dict) print("\n > Validation") @@ -362,37 +351,26 @@ def evaluate(model, criterion, criterion_st, ap, global_step, epoch): else: decoder_output, postnet_output, alignments, stop_tokens = model( text_input, text_lengths, mel_input, speaker_ids=speaker_ids) + decoder_backward_output = None - # loss computation - stop_loss = criterion_st( - stop_tokens, stop_targets, mel_lengths) if c.stopnet else torch.zeros(1) - if c.loss_masking: - decoder_loss = criterion(decoder_output, mel_input, - mel_lengths) - if c.model in ["Tacotron", "TacotronGST"]: - postnet_loss = criterion(postnet_output, linear_input, - mel_lengths) - else: - postnet_loss = criterion(postnet_output, mel_input, - mel_lengths) + # set the alignment lengths wrt reduction factor for guided attention + if mel_lengths.max() % model.decoder.r != 0: + alignment_lengths = (mel_lengths + (model.decoder.r - (mel_lengths.max() % model.decoder.r))) // model.decoder.r else: - decoder_loss = criterion(decoder_output, mel_input) - if c.model in ["Tacotron", "TacotronGST"]: - postnet_loss = criterion(postnet_output, linear_input) - else: - postnet_loss = criterion(postnet_output, mel_input) - loss = decoder_loss + postnet_loss + stop_loss + alignment_lengths = mel_lengths // model.decoder.r - # backward decoder loss + # compute loss + loss_dict = criterion(postnet_output, decoder_output, mel_input, + linear_input, stop_tokens, stop_targets, + mel_lengths, decoder_backward_output, + alignments, alignment_lengths, text_lengths) if c.bidirectional_decoder: - if c.loss_masking: - decoder_backward_loss = criterion(torch.flip(decoder_backward_output, dims=(1, )), mel_input, 
mel_lengths) - else: - decoder_backward_loss = criterion(torch.flip(decoder_backward_output, dims=(1, )), mel_input) - decoder_c_loss = torch.nn.functional.l1_loss(torch.flip(decoder_backward_output, dims=(1, )), decoder_output) - loss += decoder_backward_loss + decoder_c_loss - keep_avg.update_values({'avg_decoder_b_loss': decoder_backward_loss.item(), 'avg_decoder_c_loss': decoder_c_loss.item()}) + keep_avg.update_values({'avg_decoder_b_loss': loss_dict['decoder_backward_loss'].item(), + 'avg_decoder_c_loss': loss_dict['decoder_c_loss'].item()}) + if c.ga_alpha > 0: + keep_avg.update_values({'avg_ga_loss': loss_dict['ga_loss'].item()}) + # step time step_time = time.time() - start_time epoch_time += step_time @@ -409,23 +387,27 @@ def evaluate(model, criterion, criterion_st, ap, global_step, epoch): keep_avg.update_values({ 'avg_postnet_loss': - float(postnet_loss.item()), + float(loss_dict['postnet_loss'].item()), 'avg_decoder_loss': - float(decoder_loss.item()), + float(loss_dict['decoder_loss'].item()), 'avg_stop_loss': - float(stop_loss.item()), + float(loss_dict['stopnet_loss'].item()), }) if num_iter % c.print_step == 0: print( " | > TotalLoss: {:.5f} PostnetLoss: {:.5f} - {:.5f} DecoderLoss:{:.5f} - {:.5f} " - "StopLoss: {:.5f} - {:.5f} AlignScore: {:.4f} : {:.4f}" - .format(loss.item(), postnet_loss.item(), + "StopLoss: {:.5f} - {:.5f} GALoss: {:.5f} - {:.5f} AlignScore: {:.4f} - {:.4f}" + .format(loss_dict['loss'].item(), + loss_dict['postnet_loss'].item(), keep_avg['avg_postnet_loss'], - decoder_loss.item(), - keep_avg['avg_decoder_loss'], stop_loss.item(), - keep_avg['avg_stop_loss'], align_score, - keep_avg['avg_align_score']), + loss_dict['decoder_loss'].item(), + keep_avg['avg_decoder_loss'], + loss_dict['stopnet_loss'].item(), + keep_avg['avg_stop_loss'], + loss_dict['ga_loss'].item(), + keep_avg['avg_ga_loss'], + align_score, keep_avg['avg_align_score']), flush=True) if args.rank == 0: @@ -447,7 +429,7 @@ def evaluate(model, criterion, criterion_st, ap, global_step, epoch): if c.model in ["Tacotron", "TacotronGST"]: eval_audio = ap.inv_spectrogram(const_spec.T) else: - eval_audio = ap.inv_mel_spectrogram(const_spec.T) + eval_audio = ap.inv_melspectrogram(const_spec.T) tb_logger.tb_eval_audios(global_step, {"ValAudio": eval_audio}, c.audio["sample_rate"]) @@ -456,13 +438,15 @@ def evaluate(model, criterion, criterion_st, ap, global_step, epoch): "loss_postnet": keep_avg['avg_postnet_loss'], "loss_decoder": keep_avg['avg_decoder_loss'], "stop_loss": keep_avg['avg_stop_loss'], - "alignment_score": keep_avg['avg_align_score'] + "alignment_score": keep_avg['avg_align_score'], } if c.bidirectional_decoder: epoch_stats['loss_decoder_backward'] = keep_avg['avg_decoder_b_loss'] align_b_img = alignments_backward[idx].data.cpu().numpy() eval_figures['alignment_backward'] = plot_alignment(align_b_img) + if c.ga_alpha > 0: + epoch_stats['guided_attention_loss'] = keep_avg['avg_ga_loss'] tb_logger.tb_eval_stats(global_step, epoch_stats) tb_logger.tb_eval_figures(global_step, eval_figures) @@ -486,7 +470,7 @@ def evaluate(model, criterion, criterion_st, ap, global_step, epoch): style_wav = c.get("style_wav_for_test") for idx, test_sentence in enumerate(test_sentences): try: - wav, alignment, decoder_output, postnet_output, stop_tokens = synthesis( + wav, alignment, decoder_output, postnet_output, stop_tokens, _ = synthesis( model, test_sentence, c, @@ -565,14 +549,8 @@ def main(args): # pylint: disable=redefined-outer-name else: optimizer_st = None - if c.loss_masking: - criterion = 
L1LossMasked(c.seq_len_norm) if c.model in ["Tacotron", "TacotronGST" - ] else MSELossMasked(c.seq_len_norm) - else: - criterion = nn.L1Loss() if c.model in ["Tacotron", "TacotronGST" - ] else nn.MSELoss() - criterion_st = BCELossMasked( - pos_weight=torch.tensor(10)) if c.stopnet else None + # setup criterion + criterion = TacotronLoss(c, stopnet_pos_weight=10.0, ga_sigma=0.4) if args.restore_path: checkpoint = torch.load(args.restore_path, map_location='cpu') @@ -600,8 +578,6 @@ def main(args): # pylint: disable=redefined-outer-name if use_cuda: model.cuda() criterion.cuda() - if criterion_st: - criterion_st.cuda() # DISTRUBUTED if num_gpus > 1: @@ -631,11 +607,10 @@ def main(args): # pylint: disable=redefined-outer-name model.decoder_backward.set_r(r) print(" > Number of outputs per iteration:", model.decoder.r) - train_loss, global_step = train(model, criterion, criterion_st, - optimizer, optimizer_st, scheduler, ap, + train_loss, global_step = train(model, criterion, optimizer, + optimizer_st, scheduler, ap, global_step, epoch) - val_loss = evaluate(model, criterion, criterion_st, ap, global_step, - epoch) + val_loss = evaluate(model, criterion, ap, global_step, epoch) print(" | > Training Loss: {:.5f} Validation Loss: {:.5f}".format( train_loss, val_loss), flush=True) diff --git a/utils/audio.py b/utils/audio.py index 771e6a43..31c39eb2 100644 --- a/utils/audio.py +++ b/utils/audio.py @@ -31,7 +31,7 @@ class AudioProcessor(object): **_): print(" > Setting up Audio Processor...") - + # setup class attributed self.sample_rate = sample_rate self.num_mels = num_mels self.min_level_db = min_level_db or 0 From a2e900ef3bffceac4bf6d7d92e4a23ca0ee8c5ed Mon Sep 17 00:00:00 2001 From: erogol Date: Mon, 9 Mar 2020 21:05:10 +0100 Subject: [PATCH 005/104] formatting audio.py --- utils/audio.py | 89 ++++++++++++++++++++++++++++++-------------------- 1 file changed, 54 insertions(+), 35 deletions(-) diff --git a/utils/audio.py b/utils/audio.py index 31c39eb2..0f86b3dd 100644 --- a/utils/audio.py +++ b/utils/audio.py @@ -51,6 +51,7 @@ class AudioProcessor(object): self.do_trim_silence = do_trim_silence self.trim_db = trim_db self.sound_norm = sound_norm + # setup stft parameters if hop_length is None: self.n_fft, self.hop_length, self.win_length = self._stft_parameters() else: @@ -61,19 +62,11 @@ class AudioProcessor(object): members = vars(self) for key, value in members.items(): print(" | > {}:{}".format(key, value)) + # create spectrogram utils + self.mel_basis = self._build_mel_basis() + self.inv_mel_basis = np.linalg.pinv(self._build_mel_basis()) - def save_wav(self, wav, path): - wav_norm = wav * (32767 / max(0.01, np.max(np.abs(wav)))) - scipy.io.wavfile.write(path, self.sample_rate, wav_norm.astype(np.int16)) - - def _linear_to_mel(self, spectrogram): - _mel_basis = self._build_mel_basis() - return np.dot(_mel_basis, spectrogram) - - def _mel_to_linear(self, mel_spec): - inv_mel_basis = np.linalg.pinv(self._build_mel_basis()) - return np.maximum(1e-10, np.dot(inv_mel_basis, mel_spec)) - + ### setting up the parameters ### def _build_mel_basis(self, ): if self.mel_fmax is not None: assert self.mel_fmax <= self.sample_rate // 2 @@ -84,6 +77,16 @@ class AudioProcessor(object): fmin=self.mel_fmin, fmax=self.mel_fmax) + def _stft_parameters(self, ): + """Compute necessary stft parameters with given time values""" + n_fft = (self.num_freq - 1) * 2 + factor = self.frame_length_ms / self.frame_shift_ms + assert (factor).is_integer(), " [!] 
frame_shift_ms should divide frame_length_ms" + hop_length = int(self.frame_shift_ms / 1000.0 * self.sample_rate) + win_length = int(hop_length * factor) + return n_fft, hop_length, win_length + + ### normalization ### def _normalize(self, S): """Put values in [0, self.max_norm] or [-self.max_norm, self.max_norm]""" #pylint: disable=no-else-return @@ -121,23 +124,15 @@ class AudioProcessor(object): else: return S - def _stft_parameters(self, ): - """Compute necessary stft parameters with given time values""" - n_fft = (self.num_freq - 1) * 2 - factor = self.frame_length_ms / self.frame_shift_ms - assert (factor).is_integer(), " [!] frame_shift_ms should divide frame_length_ms" - hop_length = int(self.frame_shift_ms / 1000.0 * self.sample_rate) - win_length = int(hop_length * factor) - return n_fft, hop_length, win_length - + ### DB and AMP conversion ### def _amp_to_db(self, x): min_level = np.exp(self.min_level_db / 20 * np.log(10)) return 20 * np.log10(np.maximum(min_level, x)) - @staticmethod - def _db_to_amp(x): + def _db_to_amp(self, x): return np.power(10.0, x * 0.05) + ### Preemphasis ### def apply_preemphasis(self, x): if self.preemphasis == 0: raise RuntimeError(" [!] Preemphasis is set 0.0.") @@ -148,6 +143,13 @@ class AudioProcessor(object): raise RuntimeError(" [!] Preemphasis is set 0.0.") return scipy.signal.lfilter([1], [1, -self.preemphasis], x) + ### SPECTROGRAMs ### + def _linear_to_mel(self, spectrogram): + return np.dot(self.mel_basis, spectrogram) + + def _mel_to_linear(self, mel_spec): + return np.maximum(1e-10, np.dot(self.inv_mel_basis, mel_spec)) + def spectrogram(self, y): if self.preemphasis != 0: D = self._stft(self.apply_preemphasis(y)) @@ -167,14 +169,14 @@ class AudioProcessor(object): def inv_spectrogram(self, spectrogram): """Converts spectrogram to waveform using librosa""" S = self._denormalize(spectrogram) - S = self._db_to_amp(S + self.ref_level_db) # Convert back to linear + S = self._db_to_amp(S + self.ref_level_db) # Reconstruct phase if self.preemphasis != 0: return self.apply_inv_preemphasis(self._griffin_lim(S**self.power)) return self._griffin_lim(S**self.power) - def inv_mel_spectrogram(self, mel_spectrogram): - '''Converts mel spectrogram to waveform using librosa''' + def inv_melspectrogram(self, mel_spectrogram): + '''Converts melspectrogram to waveform using librosa''' D = self._denormalize(mel_spectrogram) S = self._db_to_amp(D + self.ref_level_db) S = self._mel_to_linear(S) # Convert back to linear @@ -190,15 +192,7 @@ class AudioProcessor(object): mel = self._normalize(S) return mel - def _griffin_lim(self, S): - angles = np.exp(2j * np.pi * np.random.rand(*S.shape)) - S_complex = np.abs(S).astype(np.complex) - y = self._istft(S_complex * angles) - for _ in range(self.griffin_lim_iters): - angles = np.exp(1j * np.angle(self._stft(y))) - y = self._istft(S_complex * angles) - return y - + ### STFT and ISTFT ### def _stft(self, y): return librosa.stft( y=y, @@ -212,6 +206,16 @@ class AudioProcessor(object): return librosa.istft( y, hop_length=self.hop_length, win_length=self.win_length) + def _griffin_lim(self, S): + angles = np.exp(2j * np.pi * np.random.rand(*S.shape)) + S_complex = np.abs(S).astype(np.complex) + y = self._istft(S_complex * angles) + for _ in range(self.griffin_lim_iters): + angles = np.exp(1j * np.angle(self._stft(y))) + y = self._istft(S_complex * angles) + return y + + ### Audio Processing ### def find_endpoint(self, wav, threshold_db=-40, min_silence_sec=0.8): window_length = int(self.sample_rate * min_silence_sec) 
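        # the rest of this method (unchanged context outside this hunk) slides these
        # windows over the wav and returns the first offset whose peak falls below the
        # linear threshold _db_to_amp(threshold_db) = 10 ** (threshold_db * 0.05),
        # i.e. 0.01 at the default -40 dB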
hop_length = int(window_length / 4) @@ -228,6 +232,21 @@ class AudioProcessor(object): return librosa.effects.trim( wav, top_db=self.trim_db, frame_length=self.win_length, hop_length=self.hop_length)[0] + def sound_norm(self, x): + return x / abs(x).max() * 0.9 + + ### save and load ### + def load_wav(self, filename, sr=None): + if sr is None: + x, sr = sf.read(filename) + else: + x, sr = librosa.load(filename, sr=sr) + return x + + def save_wav(self, wav, path): + wav_norm = wav * (32767 / max(0.01, np.max(np.abs(wav)))) + scipy.io.wavfile.write(path, self.sample_rate, wav_norm.astype(np.int16)) + @staticmethod def mulaw_encode(wav, qc): mu = 2 ** qc - 1 From e97bb45abae249b28004f635cd95f38a20de9f28 Mon Sep 17 00:00:00 2001 From: erogol Date: Tue, 10 Mar 2020 11:06:25 +0100 Subject: [PATCH 006/104] bug fixes and fixing unit tests --- tests/test_loader.py | 5 +++-- train.py | 2 +- utils/synthesis.py | 2 +- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/tests/test_loader.py b/tests/test_loader.py index d8727895..98e0bbce 100644 --- a/tests/test_loader.py +++ b/tests/test_loader.py @@ -141,7 +141,7 @@ class TestTTSDataset(unittest.TestCase): # check mel-spec correctness mel_spec = mel_input[0].cpu().numpy() - wav = self.ap.inv_mel_spectrogram(mel_spec.T) + wav = self.ap.inv_melspectrogram(mel_spec.T) self.ap.save_wav(wav, OUTPATH + '/mel_inv_dataloader.wav') shutil.copy(item_idx[0], OUTPATH + '/mel_target_dataloader.wav') @@ -199,7 +199,8 @@ class TestTTSDataset(unittest.TestCase): # check the second itme in the batch assert linear_input[1 - idx, -1].sum() == 0 assert mel_input[1 - idx, -1].sum() == 0 - assert stop_target[1 - idx, -1] == 1 + assert stop_target[1, mel_lengths[1]-1] == 1 + assert stop_target[1, mel_lengths[1]:].sum() == 0 assert len(mel_lengths.shape) == 1 # check batch zero-frame conditions (zero-frame disabled) diff --git a/train.py b/train.py index cf073956..f4ea6e70 100644 --- a/train.py +++ b/train.py @@ -470,7 +470,7 @@ def evaluate(model, criterion, ap, global_step, epoch): style_wav = c.get("style_wav_for_test") for idx, test_sentence in enumerate(test_sentences): try: - wav, alignment, decoder_output, postnet_output, stop_tokens, _ = synthesis( + wav, alignment, decoder_output, postnet_output, stop_tokens = synthesis( model, test_sentence, c, diff --git a/utils/synthesis.py b/utils/synthesis.py index b4512dc6..75778805 100644 --- a/utils/synthesis.py +++ b/utils/synthesis.py @@ -58,7 +58,7 @@ def inv_spectrogram(postnet_output, ap, CONFIG): if CONFIG.model in ["Tacotron", "TacotronGST"]: wav = ap.inv_spectrogram(postnet_output.T) else: - wav = ap.inv_mel_spectrogram(postnet_output.T) + wav = ap.inv_melspectrogram(postnet_output.T) return wav From 669a2e1d7320d5f11e726af5a90a269ab7f442b9 Mon Sep 17 00:00:00 2001 From: erogol Date: Tue, 10 Mar 2020 11:30:13 +0100 Subject: [PATCH 007/104] linter fixes --- layers/losses.py | 9 +++++---- train.py | 24 ++++++++++++------------ utils/audio.py | 31 +++++++++++++------------------ 3 files changed, 30 insertions(+), 34 deletions(-) diff --git a/layers/losses.py b/layers/losses.py index 2cbe6f2a..608e247d 100644 --- a/layers/losses.py +++ b/layers/losses.py @@ -126,7 +126,7 @@ class BCELossMasked(nn.Module): loss = loss / mask.sum() return loss - + class GuidedAttentionLoss(torch.nn.Module): def __init__(self, sigma=0.4): super(GuidedAttentionLoss, self).__init__() @@ -179,6 +179,7 @@ class TacotronLoss(torch.nn.Module): if c.ga_alpha > 0: self.criterion_ga = GuidedAttentionLoss(sigma=ga_sigma) # stopnet loss + # 
pylint: disable=not-callable self.criterion_st = BCELossMasked(pos_weight=torch.tensor(stopnet_pos_weight)) if c.stopnet else None def forward(self, postnet_output, decoder_output, mel_input, linear_input, @@ -213,7 +214,7 @@ class TacotronLoss(torch.nn.Module): if not self.config.separate_stopnet and self.config.stopnet: loss += stop_loss return_dict['stopnet_loss'] = stop_loss - + # backward decoder loss (if enabled) if self.config.bidirectional_decoder: if self.config.loss_masking: @@ -224,13 +225,13 @@ class TacotronLoss(torch.nn.Module): loss += decoder_b_loss + decoder_c_loss return_dict['decoder_b_loss'] = decoder_b_loss return_dict['decoder_c_loss'] = decoder_c_loss - + # guided attention loss (if enabled) if self.config.ga_alpha > 0: ga_loss = self.criterion_ga(alignments, input_lens, alignment_lens) loss += ga_loss * self.ga_alpha return_dict['ga_loss'] = ga_loss * self.ga_alpha - + return_dict['loss'] = loss return return_dict diff --git a/train.py b/train.py index f4ea6e70..9e7d2aed 100644 --- a/train.py +++ b/train.py @@ -214,7 +214,7 @@ def train(model, criterion, optimizer, optimizer_st, scheduler, "GradNormST:{:.5f} AvgTextLen:{:.1f} AvgSpecLen:{:.1f} StepTime:{:.2f} " "LoaderTime:{:.2f} LR:{:.6f}".format( num_iter, batch_n_iter, global_step, loss_dict['postnet_loss'].item(), - loss_dict['decoder_loss'].item(), loss_dict['stopnet_loss'].item(), + loss_dict['decoder_loss'].item(), loss_dict['stopnet_loss'].item(), loss_dict['ga_loss'].item(), grad_norm, grad_norm_st, avg_text_length, avg_spec_length, step_time, loader_time, current_lr), flush=True) @@ -232,7 +232,7 @@ def train(model, criterion, optimizer, optimizer_st, scheduler, 'avg_postnet_loss': float(loss_dict['postnet_loss'].item()), 'avg_decoder_loss': float(loss_dict['decoder_loss'].item()), 'avg_stop_loss': loss_dict['stopnet_loss'].item() - if isinstance(loss_dict['stopnet_loss'], float) else float(loss_dict['stopnet_loss'].item()), + if isinstance(loss_dict['stopnet_loss'], float) else float(loss_dict['stopnet_loss'].item()), 'avg_step_time': step_time, 'avg_loader_time': loader_time } @@ -399,16 +399,16 @@ def evaluate(model, criterion, ap, global_step, epoch): " | > TotalLoss: {:.5f} PostnetLoss: {:.5f} - {:.5f} DecoderLoss:{:.5f} - {:.5f} " "StopLoss: {:.5f} - {:.5f} GALoss: {:.5f} - {:.5f} AlignScore: {:.4f} - {:.4f}" .format(loss_dict['loss'].item(), - loss_dict['postnet_loss'].item(), - keep_avg['avg_postnet_loss'], - loss_dict['decoder_loss'].item(), - keep_avg['avg_decoder_loss'], - loss_dict['stopnet_loss'].item(), - keep_avg['avg_stop_loss'], - loss_dict['ga_loss'].item(), - keep_avg['avg_ga_loss'], - align_score, keep_avg['avg_align_score']), - flush=True) + loss_dict['postnet_loss'].item(), + keep_avg['avg_postnet_loss'], + loss_dict['decoder_loss'].item(), + keep_avg['avg_decoder_loss'], + loss_dict['stopnet_loss'].item(), + keep_avg['avg_stop_loss'], + loss_dict['ga_loss'].item(), + keep_avg['avg_ga_loss'], + align_score, keep_avg['avg_align_score']), + flush=Tr ue) if args.rank == 0: # Diagnostic visualizations diff --git a/utils/audio.py b/utils/audio.py index 0f86b3dd..3a91b28c 100644 --- a/utils/audio.py +++ b/utils/audio.py @@ -50,7 +50,7 @@ class AudioProcessor(object): self.clip_norm = clip_norm self.do_trim_silence = do_trim_silence self.trim_db = trim_db - self.sound_norm = sound_norm + self.do_sound_norm = sound_norm # setup stft parameters if hop_length is None: self.n_fft, self.hop_length, self.win_length = self._stft_parameters() @@ -232,17 +232,26 @@ class AudioProcessor(object): 
return librosa.effects.trim( wav, top_db=self.trim_db, frame_length=self.win_length, hop_length=self.hop_length)[0] - def sound_norm(self, x): + @staticmethod + def sound_norm(x): return x / abs(x).max() * 0.9 - ### save and load ### + ### save and load ### def load_wav(self, filename, sr=None): if sr is None: x, sr = sf.read(filename) else: x, sr = librosa.load(filename, sr=sr) + if self.do_trim_silence: + try: + x = self.trim_silence(x) + except ValueError: + print(f' [!] File cannot be trimmed for silence - {filename}') + assert self.sample_rate == sr, "%s vs %s"%(self.sample_rate, sr) + if self.do_sound_norm: + x = self.sound_norm(x) return x - + def save_wav(self, wav, path): wav_norm = wav * (32767 / max(0.01, np.max(np.abs(wav)))) scipy.io.wavfile.write(path, self.sample_rate, wav_norm.astype(np.int16)) @@ -263,20 +272,6 @@ class AudioProcessor(object): x = np.sign(wav) / mu * ((1 + mu) ** np.abs(wav) - 1) return x - def load_wav(self, filename, sr=None): - if sr is None: - x, sr = sf.read(filename) - else: - x, sr = librosa.load(filename, sr=sr) - if self.do_trim_silence: - try: - x = self.trim_silence(x) - except ValueError: - print(f' [!] File cannot be trimmed for silence - {filename}') - assert self.sample_rate == sr, "%s vs %s"%(self.sample_rate, sr) - if self.sound_norm: - x = x / abs(x).max() * 0.9 - return x @staticmethod def encode_16bits(x): From 201f04d3b3787d6f813f729962ad3c8be881a9fe Mon Sep 17 00:00:00 2001 From: erogol Date: Tue, 10 Mar 2020 13:53:04 +0100 Subject: [PATCH 008/104] dropout graves attention heads to decorrelate and prevent overpowering of a single head --- layers/common_layers.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/layers/common_layers.py b/layers/common_layers.py index 592f017c..8b7ed125 100644 --- a/layers/common_layers.py +++ b/layers/common_layers.py @@ -164,6 +164,9 @@ class GravesAttention(nn.Module): b_t = gbk_t[:, 1, :] k_t = gbk_t[:, 2, :] + # dropout to decorrelate attention heads + g_t = torch.nn.functional.dropout(g_t, p=0.5, training=self.training) + # attention GMM parameters sig_t = torch.nn.functional.softplus(b_t) + self.eps From 912cfb206848701630ea98565d363384606e4a66 Mon Sep 17 00:00:00 2001 From: erogol Date: Tue, 10 Mar 2020 13:58:31 +0100 Subject: [PATCH 009/104] add template config and remove de sentences --- config_template.json | 134 +++++++++++++++++++++++++++++++++++++++++++ de_sentences.txt | 4 -- 2 files changed, 134 insertions(+), 4 deletions(-) create mode 100644 config_template.json delete mode 100644 de_sentences.txt diff --git a/config_template.json b/config_template.json new file mode 100644 index 00000000..e525ec31 --- /dev/null +++ b/config_template.json @@ -0,0 +1,134 @@ +{ + "model": "Tacotron2", // one of the model in models/ + "run_name": "ljspeech-stft_params", + "run_description": "tacotron2 cosntant stf parameters", + + // AUDIO PARAMETERS + "audio":{ + // Audio processing parameters + "num_mels": 80, // size of the mel spec frame. + "num_freq": 513, // number of stft frequency levels. Size of the linear spectogram frame. + "sample_rate": 22050, // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled. + "win_length": 1024, // stft window length in ms. + "hop_length": 256, // stft window hop-lengh in ms. + "preemphasis": 0.0, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis. + "frame_length_ms": null, // stft window length in ms.If null, 'win_length' is used. + "frame_shift_ms": null, // stft window hop-lengh in ms. 
If null, 'hop_length' is used. + "min_level_db": -100, // normalization range + "ref_level_db": 20, // reference level db, theoretically 20db is the sound of air. + "power": 1.5, // value to sharpen wav signals after GL algorithm. + "griffin_lim_iters": 60,// #griffin-lim iterations. 30-60 is a good range. Larger the value, slower the generation. + // Normalization parameters + "signal_norm": true, // normalize the spec values in range [0, 1] + "symmetric_norm": true, // move normalization to range [-1, 1] + "max_norm": 1.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm] + "clip_norm": true, // clip normalized values into the range. + "mel_fmin": 0.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!! + "mel_fmax": 8000.0, // maximum freq level for mel-spec. Tune for dataset!! + "do_trim_silence": true, // enable trimming of slience of audio as you load it. LJspeech (false), TWEB (false), Nancy (true) + "trim_db": 60 // threshold for timming silence. Set this according to your dataset. + }, + + // VOCABULARY PARAMETERS + // if custom character set is not defined, + // default set in symbols.py is used + "characters":{ + "pad": "_", + "eos": "~", + "bos": "^", + "characters": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!'(),-.:;? ", + "punctuations":"!'(),-.:;? ", + "phonemes":"iyɨʉɯuɪʏʊeøɘəɵɤoɛœɜɞʌɔæɐaɶɑɒᵻʘɓǀɗǃʄǂɠǁʛpbtdʈɖcɟkɡqɢʔɴŋɲɳnɱmʙrʀⱱɾɽɸβfvθðszʃʒʂʐçʝxɣχʁħʕhɦɬɮʋɹɻjɰlɭʎʟˈˌːˑʍwɥʜʢʡɕʑɺɧɚ˞ɫ" + }, + + // DISTRIBUTED TRAINING + "distributed":{ + "backend": "nccl", + "url": "tcp:\/\/localhost:54321" + }, + + "reinit_layers": [], // give a list of layer names to restore from the given checkpoint. If not defined, it reloads all heuristically matching layers. + + // TRAINING + "batch_size": 32, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'. + "eval_batch_size":16, + "r": 7, // Number of decoder frames to predict per iteration. Set the initial values if gradual training is enabled. + "gradual_training": [[0, 7, 64], [1, 5, 64], [50000, 3, 32], [130000, 2, 32], [290000, 1, 32]], //set gradual training steps [first_step, r, batch_size]. If it is null, gradual training is disabled. For Tacotron, you might need to reduce the 'batch_size' as you proceeed. + "loss_masking": true, // enable / disable loss masking against the sequence padding. + "ga_alpha": 10.0, // weight for guided attention loss. If > 0, guided attention is enabled. + + // VALIDATION + "run_eval": true, + "test_delay_epochs": 10, //Until attention is aligned, testing only wastes computation time. + "test_sentences_file": null, // set a file to load sentences to be used for testing. If it is null then we use default english sentences. + + // OPTIMIZER + "noam_schedule": false, // use noam warmup and lr schedule. + "grad_clip": 1.0, // upper limit for gradients for clipping. + "epochs": 1000, // total number of epochs to train. + "lr": 0.0001, // Initial learning rate. If Noam decay is active, maximum learning rate. + "wd": 0.000001, // Weight decay weight. + "warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr" + "seq_len_norm": false, // Normalize eash sample loss with its length to alleviate imbalanced datasets. Use it if your dataset is small or has skewed distribution of sequence lengths. + + // TACOTRON PRENET + "memory_size": -1, // ONLY TACOTRON - size of the memory queue used fro storing last decoder predictions for auto-regression. 
If < 0, memory queue is disabled and decoder only uses the last prediction frame. + "prenet_type": "original", // "original" or "bn". + "prenet_dropout": true, // enable/disable dropout at prenet. + + // ATTENTION + "attention_type": "original", // 'original' or 'graves' + "attention_heads": 4, // number of attention heads (only for 'graves') + "attention_norm": "sigmoid", // softmax or sigmoid. Suggested to use softmax for Tacotron2 and sigmoid for Tacotron. + "windowing": false, // Enables attention windowing. Used only in eval mode. + "use_forward_attn": false, // if it uses forward attention. In general, it aligns faster. + "forward_attn_mask": false, // Additional masking forcing monotonicity only in eval mode. + "transition_agent": false, // enable/disable transition agent of forward attention. + "location_attn": true, // enable_disable location sensitive attention. It is enabled for TACOTRON by default. + "bidirectional_decoder": false, // use https://arxiv.org/abs/1907.09006. Use it, if attention does not work well with your dataset. + + // STOPNET + "stopnet": true, // Train stopnet predicting the end of synthesis. + "separate_stopnet": true, // Train stopnet seperately if 'stopnet==true'. It prevents stopnet loss to influence the rest of the model. It causes a better model, but it trains SLOWER. + + // TENSORBOARD and LOGGING + "print_step": 25, // Number of steps to log traning on console. + "save_step": 10000, // Number of training steps expected to save traninpg stats and checkpoints. + "checkpoint": true, // If true, it saves checkpoints per "save_step" + "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging. + + // DATA LOADING + "text_cleaner": "phoneme_cleaners", + "enable_eos_bos_chars": false, // enable/disable beginning of sentence and end of sentence chars. + "num_loader_workers": 4, // number of training data loader processes. Don't set it too big. 4-8 are good values. + "num_val_loader_workers": 4, // number of evaluation data loader processes. + "batch_group_size": 0, //Number of batches to shuffle after bucketing. + "min_seq_len": 6, // DATASET-RELATED: minimum text length to use in training + "max_seq_len": 153, // DATASET-RELATED: maximum text length + + // PATHS + "output_path": "/data4/rw/home/Trainings/", + + // PHONEMES + "phoneme_cache_path": "mozilla_us_phonemes_3", // phoneme computation is slow, therefore, it caches results in the given folder. + "use_phonemes": true, // use phonemes instead of raw characters. It is suggested for better pronounciation. + "phoneme_language": "en-us", // depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages + + // MULTI-SPEAKER and GST + "use_speaker_embedding": false, // use speaker embedding to enable multi-speaker learning. + "style_wav_for_test": null, // path to style wav file to be used in TacotronGST inference. + "use_gst": false, // TACOTRON ONLY: use global style tokens + + // DATASETS + "datasets": // List of datasets. They all merged and they get different speaker_ids. + [ + { + "name": "ljspeech", + "path": "/root/LJSpeech-1.1/", + "meta_file_train": "metadata.csv", + "meta_file_val": null + } + ] + +} + diff --git a/de_sentences.txt b/de_sentences.txt deleted file mode 100644 index 7c7651d8..00000000 --- a/de_sentences.txt +++ /dev/null @@ -1,4 +0,0 @@ -Herzlieb, fragte er noch einmal, ist Papa wohl? -Eine große Ueberraschung. 
-Dann gab ihm sein kleines zärtliches Herz plötzlich ein, beide Aermchen um den Hals der Mutter zu schlingen und sie wieder und wieder zu küssen und seine weiche. -als ob sie ihn nie mehr von sich lassen wollte, und weinte bitterlich. \ No newline at end of file From cbcdec83da59453644a7f518d6c9a7c8dd5c8b3f Mon Sep 17 00:00:00 2001 From: erogol Date: Tue, 10 Mar 2020 14:00:12 +0100 Subject: [PATCH 010/104] remove redundant files --- best_model_config.json | 31 ------------------------------- debug_config.json | 26 -------------------------- test_cluster.py | 1 - 3 files changed, 58 deletions(-) delete mode 100644 best_model_config.json delete mode 100644 debug_config.json delete mode 100644 test_cluster.py diff --git a/best_model_config.json b/best_model_config.json deleted file mode 100644 index c62e88cb..00000000 --- a/best_model_config.json +++ /dev/null @@ -1,31 +0,0 @@ -{ - "model_name": "best-model", - "num_mels": 80, - "num_freq": 1025, - "sample_rate": 20000, - "frame_length_ms": 50, - "frame_shift_ms": 12.5, - "preemphasis": 0.97, - "min_level_db": -100, - "ref_level_db": 20, - "embedding_size": 256, - "text_cleaner": "english_cleaners", - - "epochs": 1000, - "lr": 0.002, - "warmup_steps": 4000, - "batch_size": 32, - "eval_batch_size":32, - "r": 5, - - "griffin_lim_iters": 60, - "power": 1.5, - - "num_loader_workers": 8, - - "checkpoint": true, - "save_step": 376, - "data_path": "/run/shm/erogol/LJSpeech-1.0", - "min_seq_len": 0, - "output_path": "/data/shared/erogol_models/" -} diff --git a/debug_config.json b/debug_config.json deleted file mode 100644 index 51f08ce8..00000000 --- a/debug_config.json +++ /dev/null @@ -1,26 +0,0 @@ -{ - "num_mels": 80, - "num_freq": 1024, - "sample_rate": 20000, - "frame_length_ms": 50.0, - "frame_shift_ms": 12.5, - "preemphasis": 0.97, - "min_level_db": -100, - "ref_level_db": 20, - "hidden_size": 128, - "embedding_size": 256, - "text_cleaner": "english_cleaners", - "epochs": 200, - "lr": 0.01, - "lr_patience": 2, - "lr_decay": 0.5, - "batch_size": 32, - "griffinf_lim_iters": 60, - "power": 1.5, - "r": 5, - "num_loader_workers": 16, - "save_step": 1, - "data_path": "/data/shared/KeithIto/LJSpeech-1.0", - "output_path": "result", - "log_dir": "/home/erogol/projects/TTS/logs/" -} diff --git a/test_cluster.py b/test_cluster.py deleted file mode 100644 index daeeedc3..00000000 --- a/test_cluster.py +++ /dev/null @@ -1 +0,0 @@ -print("Python is running!!") From 3472a41255f02a9ac367e617f94183bc1811f623 Mon Sep 17 00:00:00 2001 From: erogol Date: Tue, 10 Mar 2020 18:17:35 +0100 Subject: [PATCH 011/104] make it optional to load linear specs in dataloader and fix tests respectively --- datasets/TTSDataset.py | 18 ++++++++++++------ tests/test_loader.py | 1 + train.py | 1 + 3 files changed, 14 insertions(+), 6 deletions(-) diff --git a/datasets/TTSDataset.py b/datasets/TTSDataset.py index d3a6f486..ae75f3cf 100644 --- a/datasets/TTSDataset.py +++ b/datasets/TTSDataset.py @@ -13,6 +13,7 @@ class MyDataset(Dataset): def __init__(self, outputs_per_step, text_cleaner, + compute_linear_spec, ap, meta_data, tp=None, @@ -28,6 +29,7 @@ class MyDataset(Dataset): Args: outputs_per_step (int): number of time frames predicted per step. text_cleaner (str): text cleaner used for the dataset. + compute_linear_spec (bool): compute linear spectrogram if True. ap (TTS.utils.AudioProcessor): audio processor object. meta_data (list): list of dataset instances. 
batch_group_size (int): (0) range of batch randomization after sorting @@ -47,6 +49,7 @@ class MyDataset(Dataset): self.outputs_per_step = outputs_per_step self.sample_rate = ap.sample_rate self.cleaners = text_cleaner + self.compute_linear_spec = compute_linear_spec self.min_seq_len = min_seq_len self.max_seq_len = max_seq_len self.ap = ap @@ -193,7 +196,6 @@ class MyDataset(Dataset): # compute features mel = [self.ap.melspectrogram(w).astype('float32') for w in wav] - linear = [self.ap.spectrogram(w).astype('float32') for w in wav] mel_lengths = [m.shape[1] for m in mel] @@ -208,25 +210,29 @@ class MyDataset(Dataset): # PAD sequences with longest instance in the batch text = prepare_data(text).astype(np.int32) - wav = prepare_data(wav) # PAD features with longest instance - linear = prepare_tensor(linear, self.outputs_per_step) mel = prepare_tensor(mel, self.outputs_per_step) - assert mel.shape[2] == linear.shape[2] # B x D x T --> B x T x D - linear = linear.transpose(0, 2, 1) mel = mel.transpose(0, 2, 1) # convert things to pytorch text_lenghts = torch.LongTensor(text_lenghts) text = torch.LongTensor(text) - linear = torch.FloatTensor(linear).contiguous() mel = torch.FloatTensor(mel).contiguous() mel_lengths = torch.LongTensor(mel_lengths) stop_targets = torch.FloatTensor(stop_targets) + # compute linear spectrogram + if self.compute_linear_spec: + linear = [self.ap.spectrogram(w).astype('float32') for w in wav] + linear = prepare_tensor(linear, self.outputs_per_step) + linear = linear.transpose(0, 2, 1) + assert mel.shape[1] == linear.shape[1] + linear = torch.FloatTensor(linear).contiguous() + else: + linear = None return text, text_lenghts, speaker_name, linear, mel, mel_lengths, \ stop_targets, item_idxs diff --git a/tests/test_loader.py b/tests/test_loader.py index f2bec24c..447c7b38 100644 --- a/tests/test_loader.py +++ b/tests/test_loader.py @@ -36,6 +36,7 @@ class TestTTSDataset(unittest.TestCase): dataset = TTSDataset.MyDataset( r, c.text_cleaner, + compute_linear_spec=True, ap=self.ap, meta_data=items, tp=c.characters if 'characters' in c.keys() else None, diff --git a/train.py b/train.py index b3a0589b..15c65f64 100644 --- a/train.py +++ b/train.py @@ -47,6 +47,7 @@ def setup_loader(ap, r, is_val=False, verbose=False): dataset = MyDataset( r, c.text_cleaner, + compute_linear_spec=True if c.model.lower() is 'tacotron' else False meta_data=meta_data_eval if is_val else meta_data_train, ap=ap, tp=c.characters if 'characters' in c.keys() else None, From 2a15e391669f9073ba10ef7ff20bb54ec5246977 Mon Sep 17 00:00:00 2001 From: erogol Date: Tue, 10 Mar 2020 22:38:51 +0100 Subject: [PATCH 012/104] bug fix and run desc in tensorboard --- config.json | 4 ++-- train.py | 7 +++++-- utils/logger.py | 3 +++ 3 files changed, 10 insertions(+), 4 deletions(-) diff --git a/config.json b/config.json index e525ec31..efc96c9e 100644 --- a/config.json +++ b/config.json @@ -1,7 +1,7 @@ { "model": "Tacotron2", // one of the model in models/ - "run_name": "ljspeech-stft_params", - "run_description": "tacotron2 cosntant stf parameters", + "run_name": "ljspeech", + "run_description": "tacotron2 with guided attention and -1 1 normalization and no preemphasis", // AUDIO PARAMETERS "audio":{ diff --git a/train.py b/train.py index 15c65f64..ea6d391c 100644 --- a/train.py +++ b/train.py @@ -47,7 +47,7 @@ def setup_loader(ap, r, is_val=False, verbose=False): dataset = MyDataset( r, c.text_cleaner, - compute_linear_spec=True if c.model.lower() is 'tacotron' else False + compute_linear_spec=True if 
c.model.lower() is 'tacotron' else False, meta_data=meta_data_eval if is_val else meta_data_train, ap=ap, tp=c.characters if 'characters' in c.keys() else None, @@ -410,7 +410,7 @@ def evaluate(model, criterion, ap, global_step, epoch): loss_dict['ga_loss'].item(), keep_avg['avg_ga_loss'], align_score, keep_avg['avg_align_score']), - flush=Tr ue) + flush=True) if args.rank == 0: # Diagnostic visualizations @@ -696,6 +696,9 @@ if __name__ == '__main__': LOG_DIR = OUT_PATH tb_logger = Logger(LOG_DIR) + # write model desc to tensorboard + tb_logger.tb_add_text('model-description', c['run_description'], 0) + try: main(args) except KeyboardInterrupt: diff --git a/utils/logger.py b/utils/logger.py index 51a10422..e5faeda4 100644 --- a/utils/logger.py +++ b/utils/logger.py @@ -75,3 +75,6 @@ class Logger(object): def tb_test_figures(self, step, figures): self.dict_to_tb_figure("TestFigures", figures, step) + + def tb_add_text(self, title, text, step): + self.writer.add_text(title, text, step) From c5540d80cc4e8918a93833d389c2a83ddf98afca Mon Sep 17 00:00:00 2001 From: erogol Date: Thu, 12 Mar 2020 03:27:24 +0100 Subject: [PATCH 013/104] bug fix --- train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/train.py b/train.py index b3a0589b..d2509c72 100644 --- a/train.py +++ b/train.py @@ -409,7 +409,7 @@ def evaluate(model, criterion, ap, global_step, epoch): loss_dict['ga_loss'].item(), keep_avg['avg_ga_loss'], align_score, keep_avg['avg_align_score']), - flush=Tr ue) + flush=True) if args.rank == 0: # Diagnostic visualizations From 069c8e43151f332cde6c7a9ce795e8238a7bb97e Mon Sep 17 00:00:00 2001 From: erogol Date: Tue, 17 Mar 2020 12:43:38 +0100 Subject: [PATCH 014/104] update compute_statistics.py --- compute_statistics.py | 79 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 79 insertions(+) create mode 100755 compute_statistics.py diff --git a/compute_statistics.py b/compute_statistics.py new file mode 100755 index 00000000..bbedf7af --- /dev/null +++ b/compute_statistics.py @@ -0,0 +1,79 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +import os +import argparse + +import numpy as np +from tqdm import tqdm + +from TTS.datasets.preprocess import load_meta_data +from TTS.utils.generic_utils import load_config +from TTS.utils.audio import AudioProcessor + +def main(): + """Run preprocessing process.""" + parser = argparse.ArgumentParser( + description="Compute mean and variance of spectrogtram features.") + parser.add_argument("--config_path", type=str, required=True, + help="TTS config file path.") + parser.add_argument("--out_path", default=None, type=str, + help="directory to save the output file.") + args = parser.parse_args() + + # load config + CONFIG = load_config(args.config_path) + CONFIG.audio['signal_norm'] = False # do not apply earlier normalization + CONFIG.audio['stats_path'] = None # discard pre-defined stats + + # load audio processor + ap = AudioProcessor(**CONFIG.audio) + + # load the meta data of target dataset + dataset_items = load_meta_data(CONFIG.datasets)[0] # take only train data + print(f" > There are {len(dataset_items)} files.") + + mel_sum = 0 + mel_square_sum = 0 + linear_sum = 0 + linear_square_sum = 0 + N = 0 + for item in tqdm(dataset_items): + # compute features + wav = ap.load_wav(item[1]) + linear = ap.spectrogram(wav) + mel = ap.melspectrogram(wav) + + # compute stats + N += mel.shape[1] + mel_sum += mel.sum(1) + linear_sum += linear.sum(1) + mel_square_sum += (mel ** 2).sum(axis=1) + linear_square_sum += (linear ** 
2).sum(axis=1) + + mel_mean = mel_sum / N + mel_scale = np.sqrt(mel_square_sum / N - mel_mean ** 2) + linear_mean = linear_sum / N + linear_scale = np.sqrt(linear_square_sum / N - linear_mean ** 2) + + output_file_path = os.path.join(args.out_path, "scale_stats.npy") + stats = {} + stats['mel_mean'] = mel_mean + stats['mel_std'] = mel_scale + stats['linear_mean'] = linear_mean + stats['linear_std'] = linear_scale + + # set default config values for mean-var scaling + CONFIG.audio['stats_path'] = output_file_path + CONFIG.audio['signal_norm'] = True + # remove redundant values + del CONFIG.audio['max_norm'] + del CONFIG.audio['min_level_db'] + del CONFIG.audio['symmetric_norm'] + del CONFIG.audio['clip_norm'] + stats['audio_config'] = CONFIG.audio + np.save(output_file_path, stats, allow_pickle=True) + + +if __name__ == "__main__": + main() From 0ee1dd54a377e2062fd98141410f26a75ddcc213 Mon Sep 17 00:00:00 2001 From: erogol Date: Tue, 17 Mar 2020 12:44:18 +0100 Subject: [PATCH 015/104] config update for mean-var scaling --- config.json | 60 +++++++++++++++++++++++++++++++---------------------- 1 file changed, 35 insertions(+), 25 deletions(-) diff --git a/config.json b/config.json index efc96c9e..1b497646 100644 --- a/config.json +++ b/config.json @@ -1,45 +1,55 @@ { - "model": "Tacotron2", // one of the model in models/ + "model": "Tacotron2", "run_name": "ljspeech", "run_description": "tacotron2 with guided attention and -1 1 normalization and no preemphasis", // AUDIO PARAMETERS "audio":{ + // stft parameters + "num_freq": 513, // number of stft frequency levels. Size of the linear spectogram frame. + "win_length": 1024, // stft window length in ms. + "hop_length": 256, // stft window hop-lengh in ms. + "frame_length_ms": null, // stft window length in ms.If null, 'win_length' is used. + "frame_shift_ms": null, // stft window hop-lengh in ms. If null, 'hop_length' is used. + // Audio processing parameters - "num_mels": 80, // size of the mel spec frame. - "num_freq": 513, // number of stft frequency levels. Size of the linear spectogram frame. "sample_rate": 22050, // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled. - "win_length": 1024, // stft window length in ms. - "hop_length": 256, // stft window hop-lengh in ms. - "preemphasis": 0.0, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis. - "frame_length_ms": null, // stft window length in ms.If null, 'win_length' is used. - "frame_shift_ms": null, // stft window hop-lengh in ms. If null, 'hop_length' is used. - "min_level_db": -100, // normalization range + "preemphasis": 0.0, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis. "ref_level_db": 20, // reference level db, theoretically 20db is the sound of air. + + // Silence trimming + "do_trim_silence": true,// enable trimming of slience of audio as you load it. LJspeech (false), TWEB (false), Nancy (true) + "trim_db": 60, // threshold for timming silence. Set this according to your dataset. + + // Griffin-Lim "power": 1.5, // value to sharpen wav signals after GL algorithm. "griffin_lim_iters": 60,// #griffin-lim iterations. 30-60 is a good range. Larger the value, slower the generation. + + // MelSpectrogram parameters + "num_mels": 80, // size of the mel spec frame. + "mel_fmin": 0.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!! + "mel_fmax": 8000.0, // maximum freq level for mel-spec. Tune for dataset!! 
+ // Normalization parameters - "signal_norm": true, // normalize the spec values in range [0, 1] + "signal_norm": true, // normalize spec values. Mean-Var normalization if 'stats_path' is defined otherwise range normalization defined by the other params. + "min_level_db": -100, // lower bound for normalization "symmetric_norm": true, // move normalization to range [-1, 1] - "max_norm": 1.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm] + "max_norm": 1.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm] "clip_norm": true, // clip normalized values into the range. - "mel_fmin": 0.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!! - "mel_fmax": 8000.0, // maximum freq level for mel-spec. Tune for dataset!! - "do_trim_silence": true, // enable trimming of slience of audio as you load it. LJspeech (false), TWEB (false), Nancy (true) - "trim_db": 60 // threshold for timming silence. Set this according to your dataset. + "stats_path": "/home/erogol/Data/LJSpeech-1.1/scale_stats.npy" // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based notmalization is used and other normalization params are ignored }, // VOCABULARY PARAMETERS // if custom character set is not defined, // default set in symbols.py is used - "characters":{ - "pad": "_", - "eos": "~", - "bos": "^", - "characters": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!'(),-.:;? ", - "punctuations":"!'(),-.:;? ", - "phonemes":"iyɨʉɯuɪʏʊeøɘəɵɤoɛœɜɞʌɔæɐaɶɑɒᵻʘɓǀɗǃʄǂɠǁʛpbtdʈɖcɟkɡqɢʔɴŋɲɳnɱmʙrʀⱱɾɽɸβfvθðszʃʒʂʐçʝxɣχʁħʕhɦɬɮʋɹɻjɰlɭʎʟˈˌːˑʍwɥʜʢʡɕʑɺɧɚ˞ɫ" - }, + // "characters":{ + // "pad": "_", + // "eos": "~", + // "bos": "^", + // "characters": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!'(),-.:;? ", + // "punctuations":"!'(),-.:;? ", + // "phonemes":"iyɨʉɯuɪʏʊeøɘəɵɤoɛœɜɞʌɔæɐaɶɑɒᵻʘɓǀɗǃʄǂɠǁʛpbtdʈɖcɟkɡqɢʔɴŋɲɳnɱmʙrʀⱱɾɽɸβfvθðszʃʒʂʐçʝxɣχʁħʕhɦɬɮʋɹɻjɰlɭʎʟˈˌːˑʍwɥʜʢʡɕʑɺɧɚ˞ɫ" + // }, // DISTRIBUTED TRAINING "distributed":{ @@ -107,7 +117,7 @@ "max_seq_len": 153, // DATASET-RELATED: maximum text length // PATHS - "output_path": "/data4/rw/home/Trainings/", + "output_path": "/home/erogol/Models/LJSpeech/", // PHONEMES "phoneme_cache_path": "mozilla_us_phonemes_3", // phoneme computation is slow, therefore, it caches results in the given folder. 
@@ -124,7 +134,7 @@ [ { "name": "ljspeech", - "path": "/root/LJSpeech-1.1/", + "path": "/home/erogol/Data/LJSpeech-1.1/", "meta_file_train": "metadata.csv", "meta_file_val": null } From 141797b6ae44aa714adc70974c40204e0d4fa861 Mon Sep 17 00:00:00 2001 From: erogol Date: Tue, 17 Mar 2020 13:23:25 +0100 Subject: [PATCH 016/104] write model description to tensorboard --- train.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/train.py b/train.py index ea6d391c..62800c7b 100644 --- a/train.py +++ b/train.py @@ -692,12 +692,11 @@ if __name__ == '__main__': os.chmod(AUDIO_PATH, 0o775) os.chmod(OUT_PATH, 0o775) - if args.rank == 0: LOG_DIR = OUT_PATH tb_logger = Logger(LOG_DIR) - # write model desc to tensorboard - tb_logger.tb_add_text('model-description', c['run_description'], 0) + # write model desc to tensorboard + tb_logger.tb_add_text('model-description', c['run_description'], 0) try: main(args) From acccac72f5ffa8aa199dc49858fef7cfb64003c2 Mon Sep 17 00:00:00 2001 From: erogol Date: Tue, 17 Mar 2020 13:24:30 +0100 Subject: [PATCH 017/104] update test attention notebooks --- notebooks/TestAttention.ipynb | 43 +++++++---------------------------- 1 file changed, 8 insertions(+), 35 deletions(-) diff --git a/notebooks/TestAttention.ipynb b/notebooks/TestAttention.ipynb index 9d3e5e75..b350b070 100644 --- a/notebooks/TestAttention.ipynb +++ b/notebooks/TestAttention.ipynb @@ -40,19 +40,12 @@ "import IPython\n", "from IPython.display import Audio\n", "\n", - "os.environ['CUDA_VISIBLE_DEVICES']='2'" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ + "os.environ['CUDA_VISIBLE_DEVICES']='1'\n", + "\n", "def tts(model, text, CONFIG, use_cuda, ap):\n", " t_1 = time.time()\n", " # run the model\n", - " waveform, alignment, mel_spec, mel_postnet_spec, stop_tokens = synthesis(model, text, CONFIG, use_cuda, ap, speaker_id, False, CONFIG.enable_eos_bos_chars)\n", + " waveform, alignment, mel_spec, mel_postnet_spec, stop_tokens = synthesis(model, text, CONFIG, use_cuda, ap, speaker_id, None, False, CONFIG.enable_eos_bos_chars, True)\n", " if CONFIG.model == \"Tacotron\" and not use_gl:\n", " mel_postnet_spec = ap.out_linear_to_mel(mel_postnet_spec.T).T\n", " # plotting\n", @@ -66,18 +59,11 @@ " file_name = text[:200].replace(\" \", \"_\").replace(\".\",\"\") + \".wav\"\n", " out_path = os.path.join(OUT_FOLDER, file_name)\n", " ap.save_wav(waveform, out_path)\n", - " return attn_score" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ + " return attn_score\n", + "\n", "# Set constants\n", - "ROOT_PATH = '/data/rw/pit/keep/ljspeech-December-11-2019_04+32PM-ca49ae8/'\n", - "MODEL_PATH = ROOT_PATH + '/checkpoint_410000.pth.tar'\n", + "ROOT_PATH = '/home/erogol/Models/LJSpeech/ljspeech-March-17-2020_01+16AM-871588c/'\n", + "MODEL_PATH = ROOT_PATH + '/best_model.pth.tar'\n", "CONFIG_PATH = ROOT_PATH + '/config.json'\n", "OUT_FOLDER = './hard_sentences/'\n", "CONFIG = load_config(CONFIG_PATH)\n", @@ -148,26 +134,13 @@ "outputs": [], "source": [ "model.decoder.max_decoder_steps=3000\n", - "model.decoder.prenet.train()\n", "attn_scores = []\n", "with open(SENTENCES_PATH, 'r') as f:\n", " for text in f:\n", - " try:\n", - " attn_score = tts(model, text, CONFIG, use_cuda, ap)\n", - " except ValueError:\n", - " attn_score = 0\n", + " attn_score = tts(model, text, CONFIG, use_cuda, ap)\n", " attn_scores.append(attn_score)" ] }, - { - "cell_type": "code", - 
"execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "np.mean(attn_scores)" - ] - }, { "cell_type": "code", "execution_count": null, From d1e9f8dff1845c3871f59c91bd0cfc98ab8f4b6d Mon Sep 17 00:00:00 2001 From: erogol Date: Tue, 17 Mar 2020 13:26:46 +0100 Subject: [PATCH 018/104] testing mean-var scalingand updating test config --- tests/test_audio.py | 21 +++++++++++++++++++++ tests/test_config.json | 12 +++++++----- 2 files changed, 28 insertions(+), 5 deletions(-) diff --git a/tests/test_audio.py b/tests/test_audio.py index 7f884d37..f006e63e 100644 --- a/tests/test_audio.py +++ b/tests/test_audio.py @@ -140,3 +140,24 @@ class TestAudio(unittest.TestCase): assert x_norm.min() < 0, x_norm.min() x_ = self.ap._denormalize(x_norm) assert (x - x_).sum() < 1e-3 + + def test_scaler(self): + scaler_stats_path = os.path.join(get_tests_input_path(), 'scale_stats.npy') + conf.audio['stats_path'] = scaler_stats_path + conf.audio['preemphasis'] = 0.0 + conf.audio['do_trim_silence'] = True + conf.audio['signal_norm'] = True + + ap = AudioProcessor(**conf.audio) + mel_mean, mel_std, linear_mean, linear_std, _ = ap.load_stats(scaler_stats_path) + ap.setup_scaler(mel_mean, mel_std, linear_mean, linear_std) + + self.ap.signal_norm = False + self.ap.preemphasis = 0.0 + + # test scaler forward and backward transforms + wav = self.ap.load_wav(WAV_FILE) + mel_reference = self.ap.melspectrogram(wav) + mel_norm = ap.melspectrogram(wav) + mel_denorm = ap._denormalize(mel_norm) + assert abs(mel_reference - mel_denorm).max() < 1e-4 \ No newline at end of file diff --git a/tests/test_config.json b/tests/test_config.json index 6d63e6ab..e9cd48cf 100644 --- a/tests/test_config.json +++ b/tests/test_config.json @@ -2,10 +2,12 @@ "audio":{ "audio_processor": "audio", // to use dictate different audio processors, if available. "num_mels": 80, // size of the mel spec frame. - "num_freq": 1025, // number of stft frequency levels. Size of the linear spectogram frame. + "num_freq": 513, // number of stft frequency levels. Size of the linear spectogram frame. "sample_rate": 22050, // wav sample-rate. If different than the original data, it is resampled. - "frame_length_ms": 50, // stft window length in ms. - "frame_shift_ms": 12.5, // stft window hop-lengh in ms. + "frame_length_ms": null, // stft window length in ms. + "frame_shift_ms": null, // stft window hop-lengh in ms. + "hop_length": 256, + "win_length": 1024, "preemphasis": 0.97, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis. "min_level_db": -100, // normalization range "ref_level_db": 20, // reference level db, theoretically 20db is the sound of air. @@ -15,8 +17,8 @@ "symmetric_norm": true, // move normalization to range [-1, 1] "clip_norm": true, // clip normalized values into the range. "max_norm": 4, // scale normalization to range [-max_norm, max_norm] or [0, max_norm] - "mel_fmin": 95, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!! - "mel_fmax": 7600, // maximum freq level for mel-spec. Tune for dataset!! + "mel_fmin": 0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!! + "mel_fmax": 8000, // maximum freq level for mel-spec. Tune for dataset!! 
"do_trim_silence": false }, From 92ebec01b150bf018555c1b0dc9763ee24781d35 Mon Sep 17 00:00:00 2001 From: erogol Date: Tue, 17 Mar 2020 13:27:25 +0100 Subject: [PATCH 019/104] changes of audio.py for mean-vat scaling --- utils/audio.py | 63 ++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 58 insertions(+), 5 deletions(-) diff --git a/utils/audio.py b/utils/audio.py index 3a91b28c..b7499bd2 100644 --- a/utils/audio.py +++ b/utils/audio.py @@ -4,6 +4,8 @@ import numpy as np import scipy.io import scipy.signal +from TTS.utils.data import StandardScaler + class AudioProcessor(object): def __init__(self, @@ -28,6 +30,7 @@ class AudioProcessor(object): do_trim_silence=False, trim_db=60, sound_norm=False, + stats_path=None, **_): print(" > Setting up Audio Processor...") @@ -51,6 +54,7 @@ class AudioProcessor(object): self.do_trim_silence = do_trim_silence self.trim_db = trim_db self.do_sound_norm = sound_norm + self.stats_path = stats_path # setup stft parameters if hop_length is None: self.n_fft, self.hop_length, self.win_length = self._stft_parameters() @@ -65,6 +69,14 @@ class AudioProcessor(object): # create spectrogram utils self.mel_basis = self._build_mel_basis() self.inv_mel_basis = np.linalg.pinv(self._build_mel_basis()) + # setup scaler + if stats_path: + mel_mean, mel_std, linear_mean, linear_std, _ = self.load_stats(stats_path) + self.setup_scaler(mel_mean, mel_std, linear_mean,linear_std) + self.signal_norm = True + self.max_norm = None + self.clip_norm = None + self.symmetric_norm = None ### setting up the parameters ### def _build_mel_basis(self, ): @@ -85,12 +97,22 @@ class AudioProcessor(object): hop_length = int(self.frame_shift_ms / 1000.0 * self.sample_rate) win_length = int(hop_length * factor) return n_fft, hop_length, win_length - + ### normalization ### def _normalize(self, S): """Put values in [0, self.max_norm] or [-self.max_norm, self.max_norm]""" #pylint: disable=no-else-return + S = S.copy() if self.signal_norm: + # mean-var scaling + if hasattr(self, 'mel_scaler'): + if S.shape[0] == self.num_mels: + return self.mel_scaler.transform(S.T).T + elif S.shape[0] == self.n_fft / 2: + return self.linear_scaler.transform(S.T).T + else: + raise RuntimeError(' [!] Mean-Var stats does not match the given feature dimensions.') + # range normalization S_norm = ((S - self.min_level_db) / - self.min_level_db) if self.symmetric_norm: S_norm = ((2 * self.max_norm) * S_norm) - self.max_norm @@ -108,8 +130,16 @@ class AudioProcessor(object): def _denormalize(self, S): """denormalize values""" #pylint: disable=no-else-return - S_denorm = S + S_denorm = S.copy() if self.signal_norm: + # mean-var scaling + if hasattr(self, 'mel_scaler'): + if S_denorm.shape[0] == self.num_mels: + return self.mel_scaler.inverse_transform(S_denorm.T).T + elif S_denorm.shape[0] == self.n_fft / 2: + return self.linear_scaler.inverse_transform(S_denorm.T).T + else: + raise RuntimeError(' [!] 
Mean-Var stats does not match the given feature dimensions.') if self.symmetric_norm: if self.clip_norm: S_denorm = np.clip(S_denorm, -self.max_norm, self.max_norm) @@ -122,12 +152,35 @@ class AudioProcessor(object): self.max_norm) + self.min_level_db return S_denorm else: - return S + return S_denorm + + ### Mean-STD scaling ### + def load_stats(self, stats_path): + stats = np.load(stats_path, allow_pickle=True).item() + mel_mean = stats['mel_mean'] + mel_std = stats['mel_std'] + linear_mean = stats['linear_mean'] + linear_std = stats['linear_std'] + stats_config = stats['audio_config'] + # check all audio parameters used for computing stats + skip_parameters = ['griffin_lim_iters', 'stats_path'] + for key in stats_config.keys(): + if key in skip_parameters: + continue + assert stats_config[key] == self.__dict__[ + key], f" [!] Audio param {key} does not match the value used for computing mean-var stats. {stats_config[key]} vs {self.__dict__[key]}" + return mel_mean, mel_std, linear_mean, linear_std, stats_config + + # pylint: disable=attribute-defined-outside-init + def setup_scaler(self, mel_mean, mel_std, linear_mean, linear_std): + self.mel_scaler = StandardScaler() + self.mel_scaler.set_stats(mel_mean, mel_std) + self.linear_scaler = StandardScaler() + self.linear_scaler.set_stats(linear_mean, linear_std) ### DB and AMP conversion ### def _amp_to_db(self, x): - min_level = np.exp(self.min_level_db / 20 * np.log(10)) - return 20 * np.log10(np.maximum(min_level, x)) + return 20 * np.log10(np.maximum(1e-5, x)) def _db_to_amp(self, x): return np.power(10.0, x * 0.05) From d7cf34ca34ff55a95b63d657470cba1611eea1b7 Mon Sep 17 00:00:00 2001 From: erogol Date: Tue, 17 Mar 2020 13:27:53 +0100 Subject: [PATCH 020/104] StandardScaler added --- utils/data.py | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/utils/data.py b/utils/data.py index f2d7538a..a83325cb 100644 --- a/utils/data.py +++ b/utils/data.py @@ -50,3 +50,28 @@ def pad_per_step(inputs, pad_len): inputs, [[0, 0], [0, 0], [0, pad_len]], mode='constant', constant_values=0.0) + + +# pylint: disable=attribute-defined-outside-init +class StandardScaler(): + + def set_stats(self, mean, scale): + self.mean_ = mean + self.scale_ = scale + + def reset_stats(self): + delattr(self, 'mean_') + delattr(self, 'scale_') + + def transform(self, X): + X = np.asarray(X) + X -= self.mean_ + X /= self.scale_ + return X + + def inverse_transform(self, X): + X = np.asarray(X) + X *= self.scale_ + X += self.mean_ + return X + From 3bbeb43f5770adf1fb310c24bba2fd017b3a79c7 Mon Sep 17 00:00:00 2001 From: erogol Date: Tue, 17 Mar 2020 13:28:15 +0100 Subject: [PATCH 021/104] visualization updates wrt mean-var scaling --- utils/visual.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/utils/visual.py b/utils/visual.py index 1cb9ac5d..b0db7b04 100644 --- a/utils/visual.py +++ b/utils/visual.py @@ -32,22 +32,22 @@ def plot_spectrogram(linear_output, audio, fig_size=(16, 10)): linear_output_ = linear_output.detach().cpu().numpy().squeeze() else: linear_output_ = linear_output - spectrogram = audio._denormalize(linear_output_) # pylint: disable=protected-access + spectrogram = audio._denormalize(linear_output_.T) # pylint: disable=protected-access fig = plt.figure(figsize=fig_size) - plt.imshow(spectrogram.T, aspect="auto", origin="lower") + plt.imshow(spectrogram, aspect="auto", origin="lower") plt.colorbar() plt.tight_layout() return fig -def visualize(alignment, spectrogram_postnet, stop_tokens, text, 
hop_length, CONFIG, spectrogram=None, output_path=None): +def visualize(alignment, spectrogram_postnet, stop_tokens, text, hop_length, CONFIG, spectrogram=None, output_path=None, figsize=[8, 24]): if spectrogram is not None: num_plot = 4 else: num_plot = 3 label_fontsize = 16 - fig = plt.figure(figsize=(8, 24)) + fig = plt.figure(figsize=figsize) plt.subplot(num_plot, 1, 1) plt.imshow(alignment.T, aspect="auto", origin="lower", interpolation=None)
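Taken together, patches 014 through 021 establish the mean-variance normalization workflow: compute per-bin statistics once with compute_statistics.py (per its argparse definition, something like python compute_statistics.py --config_path config.json --out_path /your/dataset/dir/, which writes scale_stats.npy into out_path), then point the config's stats_path at that file. A minimal sketch of the intended consumer side, reusing imports that appear elsewhere in this series; the file paths are placeholders:

    from TTS.utils.audio import AudioProcessor
    from TTS.utils.generic_utils import load_config

    C = load_config('config.json')  # hypothetical local config file
    C.audio['stats_path'] = '/path/to/scale_stats.npy'  # output of compute_statistics.py
    # AudioProcessor.__init__ calls load_stats() and setup_scaler() when stats_path is set;
    # range-normalization params (max_norm, clip_norm, symmetric_norm) are then ignored
    ap = AudioProcessor(**C.audio)
    mel = ap.melspectrogram(ap.load_wav('/path/to/sample.wav'))  # mean-var normalized output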
From fa795347a9c2fa269429e6c4266ddaff733c3a77 Mon Sep 17 00:00:00 2001 From: erogol Date: Tue, 17 Mar 2020 14:47:59 +0100 Subject: [PATCH 029/104] turkish cleaner and data preprocessor --- datasets/preprocess.py | 18 ++++++++++++++++++ utils/text/cleaners.py | 8 ++++++++ 2 files changed, 26 insertions(+)
diff --git a/datasets/preprocess.py b/datasets/preprocess.py index 029922d3..ce876edc 100644 --- a/datasets/preprocess.py +++ b/datasets/preprocess.py @@ -187,3 +187,21 @@ def libri_tts(root_path, meta_files=None): for item in items: assert os.path.exists(item[1]), f" [!] wav file is not exist - {item[1]}" return items + + +def custom_turkish(root_path, meta_file): + txt_file = os.path.join(root_path, meta_file) + items = [] + speaker_name = "turkish-female" + skipped_files = [] + with open(txt_file, 'r', encoding='utf-8') as ttf: + for line in ttf: + cols = line.split('|') + wav_file = os.path.join(root_path, 'wavs', cols[0].strip() + '.wav') + if not os.path.exists(wav_file): + skipped_files.append(wav_file) + continue + text = cols[1].strip() + items.append([text, wav_file, speaker_name]) + print(f" [!] {len(skipped_files)} files skipped. They are not exist...") + return items
diff --git a/utils/text/cleaners.py b/utils/text/cleaners.py index e6b611b4..92c2d934 100644 --- a/utils/text/cleaners.py +++ b/utils/text/cleaners.py @@ -91,6 +91,14 @@ def transliteration_cleaners(text): return text +# TODO: elaborate it +def basic_turkish_cleaners(text): + '''Pipeline for Turkish text''' + text = text.replace("I", "ı") + text = lowercase(text) + text = collapse_whitespace(text) + + def english_cleaners(text): '''Pipeline for English text, including number and abbreviation expansion.''' text = convert_to_ascii(text)
From b9df54adcd496fc2f618f5f3c0d3d25519bdfcfb Mon Sep 17 00:00:00 2001 From: erogol Date: Tue, 17 Mar 2020 15:05:13 +0100 Subject: [PATCH 030/104] bug fix and check cleaner config field by comparing with the list of available functions --- utils/generic_utils.py | 5 +++-- utils/text/cleaners.py | 1 + 2 files changed, 4 insertions(+), 2 deletions(-)
diff --git a/utils/generic_utils.py b/utils/generic_utils.py index f6c38530..9af0bb70 100644 --- a/utils/generic_utils.py +++ b/utils/generic_utils.py @@ -496,7 +496,8 @@ def check_config(c): _check_argument('tb_model_param_stats', c, restricted=True, val_type=bool) # dataloading - _check_argument('text_cleaner', c, restricted=True, val_type=str, enum_list=['english_cleaners', 'phoneme_cleaners', 'transliteration_cleaners', 'basic_cleaners']) + from TTS.utils.text import cleaners + _check_argument('text_cleaner', c, restricted=True, val_type=str, enum_list=dir(cleaners)) _check_argument('enable_eos_bos_chars', c, restricted=True, val_type=bool) _check_argument('num_loader_workers', c, restricted=True, val_type=int, min_val=0) _check_argument('num_val_loader_workers', c, restricted=True, val_type=int, min_val=0) @@ -518,4 +519,4 @@ _check_argument('name', dataset_entry, restricted=True, val_type=str) _check_argument('path', dataset_entry, restricted=True, val_type=str)
_check_argument('meta_file_train', dataset_entry, restricted=True, val_type=str) - _check_argument('meta_file_val', dataset_entry, restricted=True, val_type=str) \ No newline at end of file + _check_argument('meta_file_val', dataset_entry, restricted=True, val_type=str) diff --git a/utils/text/cleaners.py b/utils/text/cleaners.py index 92c2d934..35da8aef 100644 --- a/utils/text/cleaners.py +++ b/utils/text/cleaners.py @@ -97,6 +97,7 @@ def basic_turkish_cleaners(text): text = text.replace("I", "ı") text = lowercase(text) text = collapse_whitespace(text) + return text def english_cleaners(text): From 5223678a4bd295088b20d25953f701eb365ad71b Mon Sep 17 00:00:00 2001 From: erogol Date: Tue, 17 Mar 2020 18:22:55 +0100 Subject: [PATCH 031/104] update audio tests more verbose --- tests/test_audio.py | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/tests/test_audio.py b/tests/test_audio.py index f006e63e..9cc849b5 100644 --- a/tests/test_audio.py +++ b/tests/test_audio.py @@ -56,6 +56,7 @@ class TestAudio(unittest.TestCase): """Check normalization and denormalization for range values and consistency """ print(" > Testing normalization and denormalization.") wav = self.ap.load_wav(WAV_FILE) + wav = self.ap.sound_norm(wav) # normalize audio to get abetter normalization range below. self.ap.signal_norm = False x = self.ap.melspectrogram(wav) x_old = x @@ -65,7 +66,7 @@ class TestAudio(unittest.TestCase): self.ap.clip_norm = False self.ap.max_norm = 4.0 x_norm = self.ap._normalize(x) - print(x_norm.max(), " -- ", x_norm.min()) + print(f" > MaxNorm: {self.ap.max_norm}, ClipNorm:{self.ap.clip_norm}, SymmetricNorm:{self.ap.symmetric_norm}, SignalNorm:{self.ap.signal_norm} Range-> {x_norm.max()} -- {x_norm.min()}") assert (x_old - x).sum() == 0 # check value range assert x_norm.max() <= self.ap.max_norm + 1, x_norm.max() @@ -79,7 +80,9 @@ class TestAudio(unittest.TestCase): self.ap.clip_norm = True self.ap.max_norm = 4.0 x_norm = self.ap._normalize(x) - print(x_norm.max(), " -- ", x_norm.min()) + print(f" > MaxNorm: {self.ap.max_norm}, ClipNorm:{self.ap.clip_norm}, SymmetricNorm:{self.ap.symmetric_norm}, SignalNorm:{self.ap.signal_norm} Range-> {x_norm.max()} -- {x_norm.min()}") + + assert (x_old - x).sum() == 0 # check value range assert x_norm.max() <= self.ap.max_norm, x_norm.max() @@ -93,7 +96,9 @@ class TestAudio(unittest.TestCase): self.ap.clip_norm = False self.ap.max_norm = 4.0 x_norm = self.ap._normalize(x) - print(x_norm.max(), " -- ", x_norm.min()) + print(f" > MaxNorm: {self.ap.max_norm}, ClipNorm:{self.ap.clip_norm}, SymmetricNorm:{self.ap.symmetric_norm}, SignalNorm:{self.ap.signal_norm} Range-> {x_norm.max()} -- {x_norm.min()}") + + assert (x_old - x).sum() == 0 # check value range assert x_norm.max() <= self.ap.max_norm + 1, x_norm.max() @@ -108,7 +113,9 @@ class TestAudio(unittest.TestCase): self.ap.clip_norm = True self.ap.max_norm = 4.0 x_norm = self.ap._normalize(x) - print(x_norm.max(), " -- ", x_norm.min()) + print(f" > MaxNorm: {self.ap.max_norm}, ClipNorm:{self.ap.clip_norm}, SymmetricNorm:{self.ap.symmetric_norm}, SignalNorm:{self.ap.signal_norm} Range-> {x_norm.max()} -- {x_norm.min()}") + + assert (x_old - x).sum() == 0 # check value range assert x_norm.max() <= self.ap.max_norm, x_norm.max() @@ -122,7 +129,9 @@ class TestAudio(unittest.TestCase): self.ap.symmetric_norm = False self.ap.max_norm = 1.0 x_norm = self.ap._normalize(x) - print(x_norm.max(), " -- ", x_norm.min()) + print(f" > MaxNorm: {self.ap.max_norm}, 
ClipNorm:{self.ap.clip_norm}, SymmetricNorm:{self.ap.symmetric_norm}, SignalNorm:{self.ap.signal_norm} Range-> {x_norm.max()} -- {x_norm.min()}") + + assert (x_old - x).sum() == 0 assert x_norm.max() <= self.ap.max_norm, x_norm.max() assert x_norm.min() >= 0, x_norm.min() @@ -133,7 +142,9 @@ class TestAudio(unittest.TestCase): self.ap.symmetric_norm = True self.ap.max_norm = 1.0 x_norm = self.ap._normalize(x) - print(x_norm.max(), " -- ", x_norm.min()) + print(f" > MaxNorm: {self.ap.max_norm}, ClipNorm:{self.ap.clip_norm}, SymmetricNorm:{self.ap.symmetric_norm}, SignalNorm:{self.ap.signal_norm} Range-> {x_norm.max()} -- {x_norm.min()}") + + assert (x_old - x).sum() == 0 assert x_norm.max() <= self.ap.max_norm, x_norm.max() assert x_norm.min() >= -self.ap.max_norm, x_norm.min() From 20dd509430a6b1e1faf20577fb217f00e4eb0afe Mon Sep 17 00:00:00 2001 From: erogol Date: Tue, 17 Mar 2020 18:24:05 +0100 Subject: [PATCH 032/104] change the way how ref_level_db is handled in audio.py --- utils/audio.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/utils/audio.py b/utils/audio.py index b7499bd2..f06e6d4a 100644 --- a/utils/audio.py +++ b/utils/audio.py @@ -113,6 +113,7 @@ class AudioProcessor(object): else: raise RuntimeError(' [!] Mean-Var stats does not match the given feature dimensions.') # range normalization + S -= self.ref_level_db # discard certain range of DB assuming it is air noise S_norm = ((S - self.min_level_db) / - self.min_level_db) if self.symmetric_norm: S_norm = ((2 * self.max_norm) * S_norm) - self.max_norm @@ -144,13 +145,13 @@ class AudioProcessor(object): if self.clip_norm: S_denorm = np.clip(S_denorm, -self.max_norm, self.max_norm) S_denorm = ((S_denorm + self.max_norm) * -self.min_level_db / (2 * self.max_norm)) + self.min_level_db - return S_denorm + return S_denorm + self.ref_level_db else: if self.clip_norm: S_denorm = np.clip(S_denorm, 0, self.max_norm) S_denorm = (S_denorm * -self.min_level_db / self.max_norm) + self.min_level_db - return S_denorm + return S_denorm + self.ref_level_db else: return S_denorm @@ -208,7 +209,7 @@ class AudioProcessor(object): D = self._stft(self.apply_preemphasis(y)) else: D = self._stft(y) - S = self._amp_to_db(np.abs(D)) - self.ref_level_db + S = self._amp_to_db(np.abs(D)) return self._normalize(S) def melspectrogram(self, y): @@ -216,13 +217,13 @@ class AudioProcessor(object): D = self._stft(self.apply_preemphasis(y)) else: D = self._stft(y) - S = self._amp_to_db(self._linear_to_mel(np.abs(D))) - self.ref_level_db + S = self._amp_to_db(self._linear_to_mel(np.abs(D))) return self._normalize(S) def inv_spectrogram(self, spectrogram): """Converts spectrogram to waveform using librosa""" S = self._denormalize(spectrogram) - S = self._db_to_amp(S + self.ref_level_db) + S = self._db_to_amp(S) # Reconstruct phase if self.preemphasis != 0: return self.apply_inv_preemphasis(self._griffin_lim(S**self.power)) @@ -231,7 +232,7 @@ class AudioProcessor(object): def inv_melspectrogram(self, mel_spectrogram): '''Converts melspectrogram to waveform using librosa''' D = self._denormalize(mel_spectrogram) - S = self._db_to_amp(D + self.ref_level_db) + S = self._db_to_amp(D) S = self._mel_to_linear(S) # Convert back to linear if self.preemphasis != 0: return self.apply_inv_preemphasis(self._griffin_lim(S**self.power)) @@ -239,9 +240,9 @@ class AudioProcessor(object): def out_linear_to_mel(self, linear_spec): S = self._denormalize(linear_spec) - S = self._db_to_amp(S + self.ref_level_db) + S = self._db_to_amp(S) S = 
self._linear_to_mel(np.abs(S)) - S = self._amp_to_db(S) - self.ref_level_db + S = self._amp_to_db(S) mel = self._normalize(S) return mel
From 745cc4e20a083709a034fb3512b00590f273888c Mon Sep 17 00:00:00 2001 From: erogol Date: Tue, 24 Mar 2020 01:30:46 +0100 Subject: [PATCH 033/104] audio.py updates --- utils/audio.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/utils/audio.py b/utils/audio.py index f06e6d4a..6ed8493e 100644 --- a/utils/audio.py +++ b/utils/audio.py @@ -29,7 +29,7 @@ class AudioProcessor(object): griffin_lim_iters=None, do_trim_silence=False, trim_db=60, - sound_norm=False, + do_sound_norm=False, stats_path=None, **_): @@ -53,7 +53,7 @@ class AudioProcessor(object): self.clip_norm = clip_norm self.do_trim_silence = do_trim_silence self.trim_db = trim_db - self.do_sound_norm = sound_norm + self.do_sound_norm = do_sound_norm self.stats_path = stats_path # setup stft parameters if hop_length is None: @@ -164,7 +164,7 @@ class AudioProcessor(object): linear_std = stats['linear_std'] stats_config = stats['audio_config'] # check all audio parameters used for computing stats - skip_parameters = ['griffin_lim_iters', 'stats_path'] + skip_parameters = ['griffin_lim_iters', 'stats_path', 'do_trim_silence', 'ref_level_db', 'power'] for key in stats_config.keys(): if key in skip_parameters: continue
From 9915d79173bcdaa1838a611cc51e101220b853f3 Mon Sep 17 00:00:00 2001 From: erogol Date: Tue, 24 Mar 2020 01:30:58 +0100 Subject: [PATCH 034/104] return inputs with synthesis --- utils/synthesis.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/utils/synthesis.py b/utils/synthesis.py index ffe9920a..9158ef02 100644 --- a/utils/synthesis.py +++ b/utils/synthesis.py @@ -137,4 +137,4 @@ def synthesis(model, # trim silence if do_trim_silence: wav = trim_silence(wav, ap) - return wav, alignment, decoder_output, postnet_output, stop_tokens + return wav, alignment, decoder_output, postnet_output, stop_tokens, inputs
From 52c0b4e3e11327d71783cefc07a2d388d35df5fd Mon Sep 17 00:00:00 2001 From: erogol Date: Wed, 25 Mar 2020 01:56:29 +0100 Subject: [PATCH 035/104] bug fix adding missing output for synthesis --- train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/train.py b/train.py index 62800c7b..0bd1f2af 100644 --- a/train.py +++ b/train.py @@ -472,7 +472,7 @@ def evaluate(model, criterion, ap, global_step, epoch): style_wav = c.get("style_wav_for_test") for idx, test_sentence in enumerate(test_sentences): try: - wav, alignment, decoder_output, postnet_output, stop_tokens = synthesis( + wav, alignment, decoder_output, postnet_output, stop_tokens, inputs = synthesis( model, test_sentence, c,
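A side effect of patch 034 worth flagging: every caller of synthesis now has to unpack six values, which is exactly what patch 035 restores in train.py. A sketch of the updated calling convention, mirroring the argument list used in the TestAttention notebook earlier in this series (all names are taken from that notebook; the appended inputs value is the tensor synthesis already builds internally for the model):

    # before patch 034 synthesis returned five values; 'inputs' is now appended
    waveform, alignment, mel_spec, mel_postnet_spec, stop_tokens, inputs = synthesis(
        model, text, CONFIG, use_cuda, ap, speaker_id, None,
        False, CONFIG.enable_eos_bos_chars, True)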
From d5efe040f75ff595880388dfa452df9a015b730e Mon Sep 17 00:00:00 2001 From: erogol Date: Thu, 26 Mar 2020 21:10:37 +0100 Subject: [PATCH 036/104] compute stft paddings to correct wav and spec alignment especially for vocoder training --- utils/audio.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-)
diff --git a/utils/audio.py b/utils/audio.py index 6ed8493e..63204cde 100644 --- a/utils/audio.py +++ b/utils/audio.py @@ -114,7 +114,7 @@ raise RuntimeError(' [!] Mean-Var stats does not match the given feature dimensions.') # range normalization S -= self.ref_level_db # discard certain range of DB assuming it is air noise - S_norm = ((S - self.min_level_db) / - self.min_level_db) + S_norm = ((S - self.min_level_db) / (-self.min_level_db)) if self.symmetric_norm: S_norm = ((2 * self.max_norm) * S_norm) - self.max_norm if self.clip_norm: @@ -269,7 +269,17 @@ y = self._istft(S_complex * angles) return y - ### Audio Processing ### + def compute_stft_paddings(x, fsize, fshift, pad_sides=1): + '''compute right padding (final frame) or both sides padding (first and final frames) + ''' + assert pad_sides in (1, 2) + # return int(fsize // 2) + pad = (x.shape[0] // fshift + 1) * fshift - x.shape[0] + if pad_sides == 1: + return 0, pad + else: + return pad // 2, pad // 2 + pad % 2Processing ### + def find_endpoint(self, wav, threshold_db=-40, min_silence_sec=0.8): window_length = int(self.sample_rate * min_silence_sec) hop_length = int(window_length / 4)
From a678d684a26d2fb543aec9b79160419e4c677d67 Mon Sep 17 00:00:00 2001 From: erogol Date: Fri, 27 Mar 2020 14:17:03 +0100 Subject: [PATCH 037/104] bug fix --- utils/audio.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/utils/audio.py b/utils/audio.py index 63204cde..67110134 100644 --- a/utils/audio.py +++ b/utils/audio.py @@ -278,8 +278,9 @@ class AudioProcessor(object): if pad_sides == 1: return 0, pad else: - return pad // 2, pad // 2 + pad % 2Processing ### + return pad // 2, pad // 2 + pad % 2 + ### Audio Processing ### def find_endpoint(self, wav, threshold_db=-40, min_silence_sec=0.8): window_length = int(self.sample_rate * min_silence_sec) hop_length = int(window_length / 4)
From 391dab45f024016dcb625ef8b8182f32c7a6aae1 Mon Sep 17 00:00:00 2001 From: erogol Date: Sun, 29 Mar 2020 23:07:12 +0200 Subject: [PATCH 038/104] update ExtractTTSSpecs notebook --- notebooks/ExtractTTSpectrogram.ipynb | 97 +++++++++++++++++----------- utils/audio.py | 5 +- 2 files changed, 63 insertions(+), 39 deletions(-)
diff --git a/notebooks/ExtractTTSpectrogram.ipynb b/notebooks/ExtractTTSpectrogram.ipynb index b5a88611..c747c764 100644 --- a/notebooks/ExtractTTSpectrogram.ipynb +++ b/notebooks/ExtractTTSpectrogram.ipynb @@ -7,15 +7,6 @@ "This is a notebook to generate mel-spectrograms from a TTS model to be used for WaveRNN training."
] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "TTS_PATH = \"/home/erogol/projects/\"" - ] - }, { "cell_type": "code", "execution_count": null, @@ -26,7 +17,6 @@ "%autoreload 2\n", "import os\n", "import sys\n", - "sys.path.append(TTS_PATH)\n", "import torch\n", "import importlib\n", "import numpy as np\n", @@ -42,7 +32,7 @@ "%matplotlib inline\n", "\n", "import os\n", - "os.environ['CUDA_VISIBLE_DEVICES']='2'" + "os.environ['CUDA_VISIBLE_DEVICES']='0'" ] }, { @@ -69,12 +59,12 @@ "metadata": {}, "outputs": [], "source": [ - "OUT_PATH = \"/data/rw/pit/data/turkish-vocoder/\"\n", - "DATA_PATH = \"/data/rw/home/Turkish\"\n", + "OUT_PATH = \"/home/erogol/Data/LJSpeech-1.1/ljspeech-March-17-2020_01+16AM-871588c/\"\n", + "DATA_PATH = \"/home/erogol/Data/LJSpeech-1.1/\"\n", "DATASET = \"ljspeech\"\n", - "METADATA_FILE = \"metadata.txt\"\n", - "CONFIG_PATH = \"/data/rw/pit/keep/turkish-January-08-2020_01+56AM-ca5e133/config.json\"\n", - "MODEL_FILE = \"/data/rw/pit/keep/turkish-January-08-2020_01+56AM-ca5e133/checkpoint_255000.pth.tar\"\n", + "METADATA_FILE = \"metadata.csv\"\n", + "CONFIG_PATH = \"/home/erogol/Models/LJSpeech/ljspeech-March-17-2020_01+16AM-871588c/config.json\"\n", + "MODEL_FILE = \"/home/erogol/Models/LJSpeech/ljspeech-March-17-2020_01+16AM-871588c/checkpoint_420000.pth.tar\"\n", "BATCH_SIZE = 32\n", "\n", "QUANTIZED_WAV = False\n", @@ -85,6 +75,7 @@ "print(\" > CUDA enabled: \", use_cuda)\n", "\n", "C = load_config(CONFIG_PATH)\n", + "C.audio['do_trim_silence'] = False # IMPORTANT!!!!!!!!!!!!!!! disable to align mel specs with the wav files\n", "ap = AudioProcessor(bits=QUANTIZE_BIT, **C.audio)" ] }, @@ -94,7 +85,7 @@ "metadata": {}, "outputs": [], "source": [ - "# if the vocabulary was passed, replace the default\n", + "# if the vocabulary was passed, replace the default\n", "if 'characters' in C.keys():\n", " symbols, phonemes = make_symbols(**C.characters)\n", "\n", @@ -120,7 +111,7 @@ "preprocessor = importlib.import_module('TTS.datasets.preprocess')\n", "preprocessor = getattr(preprocessor, DATASET.lower())\n", "meta_data = preprocessor(DATA_PATH,METADATA_FILE)\n", - "dataset = MyDataset(checkpoint['r'], C.text_cleaner, ap, meta_data,tp=C.characters if 'characters' in C.keys() else None, use_phonemes=C.use_phonemes, phoneme_cache_path=C.phoneme_cache_path, enable_eos_bos=C.enable_eos_bos_chars)\n", + "dataset = MyDataset(checkpoint['r'], C.text_cleaner, False, ap, meta_data,tp=C.characters if 'characters' in C.keys() else None, use_phonemes=C.use_phonemes, phoneme_cache_path=C.phoneme_cache_path, enable_eos_bos=C.enable_eos_bos_chars)\n", "loader = DataLoader(dataset, batch_size=BATCH_SIZE, num_workers=4, collate_fn=dataset.collate_fn, shuffle=False, drop_last=False)" ] }, @@ -143,7 +134,7 @@ "metadata = []\n", "losses = []\n", "postnet_losses = []\n", - "criterion = L1LossMasked()\n", + "criterion = L1LossMasked(seq_len_norm=C.seq_len_norm)\n", "with torch.no_grad():\n", " for data in tqdm(loader):\n", " # setup input data\n", @@ -232,7 +223,31 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Check model performance" + "### Sanity Check" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "idx = 1\n", + "ap.melspectrogram(ap.load_wav(item_idx[idx])).shape" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import soundfile as sf\n", + "wav, sr = sf.read(item_idx[idx])\n", 
+ "mel_postnet = postnet_outputs[idx][:mel_lengths[idx], :]\n", + "mel_decoder = mel_outputs[idx][:mel_lengths[idx], :].detach().cpu().numpy()\n", + "mel_truth = ap.melspectrogram(wav)\n", + "print(mel_truth.shape)" ] }, { @@ -242,10 +257,8 @@ "outputs": [], "source": [ "# plot posnet output\n", - "idx = 1\n", - "mel_example = postnet_outputs[idx]\n", - "plot_spectrogram(mel_example[:mel_lengths[idx], :], ap);\n", - "print(mel_example[:mel_lengths[1], :].shape)" + "plot_spectrogram(mel_postnet, ap);\n", + "print(mel_postnet[:mel_lengths[idx], :].shape)" ] }, { @@ -255,9 +268,8 @@ "outputs": [], "source": [ "# plot decoder output\n", - "mel_example = mel_outputs[idx].data.cpu().numpy()\n", - "plot_spectrogram(mel_example[:mel_lengths[idx], :], ap);\n", - "print(mel_example[:mel_lengths[1], :].shape)" + "plot_spectrogram(mel_decoder, ap);\n", + "print(mel_decoder.shape)" ] }, { @@ -267,10 +279,8 @@ "outputs": [], "source": [ "# plot GT specgrogram\n", - "wav = ap.load_wav(item_idx[idx])\n", - "melt = ap.melspectrogram(wav)\n", - "print(melt.shape)\n", - "plot_spectrogram(melt.T, ap);" + "print(mel_truth.shape)\n", + "plot_spectrogram(mel_truth.T, ap);" ] }, { @@ -281,9 +291,9 @@ "source": [ "# postnet, decoder diff\n", "from matplotlib import pylab as plt\n", - "mel_diff = mel_outputs[idx] - postnet_outputs[idx]\n", + "mel_diff = mel_decoder - mel_postnet\n", "plt.figure(figsize=(16, 10))\n", - "plt.imshow(abs(mel_diff.detach().cpu().numpy()[:mel_lengths[idx],:]).T,aspect=\"auto\", origin=\"lower\");\n", + "plt.imshow(abs(mel_diff[:mel_lengths[idx],:]).T,aspect=\"auto\", origin=\"lower\");\n", "plt.colorbar()\n", "plt.tight_layout()" ] @@ -294,10 +304,25 @@ "metadata": {}, "outputs": [], "source": [ + "# PLOT GT SPECTROGRAM diff\n", "from matplotlib import pylab as plt\n", - "# mel = mel_poutputs[idx].detach().cpu().numpy()\n", - "mel = postnet_outputs[idx].detach().cpu().numpy()\n", - "mel_diff2 = melt.T - mel[:melt.shape[1]]\n", + "mel_diff2 = mel_truth.T - mel_decoder\n", + "plt.figure(figsize=(16, 10))\n", + "plt.imshow(abs(mel_diff2).T,aspect=\"auto\", origin=\"lower\");\n", + "plt.colorbar()\n", + "plt.tight_layout()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# PLOT GT SPECTROGRAM diff\n", + "from matplotlib import pylab as plt\n", + "mel = postnet_outputs[idx]\n", + "mel_diff2 = mel_truth.T - mel[:mel_truth.shape[1]]\n", "plt.figure(figsize=(16, 10))\n", "plt.imshow(abs(mel_diff2).T,aspect=\"auto\", origin=\"lower\");\n", "plt.colorbar()\n", diff --git a/utils/audio.py b/utils/audio.py index 67110134..be44cc42 100644 --- a/utils/audio.py +++ b/utils/audio.py @@ -269,12 +269,11 @@ class AudioProcessor(object): y = self._istft(S_complex * angles) return y - def compute_stft_paddings(x, fsize, fshift, pad_sides=1): + def compute_stft_paddings(x, pad_sides=1): '''compute right padding (final frame) or both sides padding (first and final frames) ''' assert pad_sides in (1, 2) - # return int(fsize // 2) - pad = (x.shape[0] // fshift + 1) * fshift - x.shape[0] + pad = (x.shape[0] // self.hop_length + 1) * self.hop_length - x.shape[0] if pad_sides == 1: return 0, pad else: From 6501369b0a2c32e896900d913f31f10b583045b4 Mon Sep 17 00:00:00 2001 From: mittimithai Date: Wed, 1 Apr 2020 11:57:53 -0700 Subject: [PATCH 039/104] Small fix for "Tacotron" use "Tacotron" won't work without this fix, since the linear spectrograms end up not getting computed --- train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/train.py b/train.py
index 62800c7b..a4aaed17 100644
--- a/train.py
+++ b/train.py
@@ -47,7 +47,7 @@ def setup_loader(ap, r, is_val=False, verbose=False):
         dataset = MyDataset(
             r,
             c.text_cleaner,
-            compute_linear_spec=True if c.model.lower() is 'tacotron' else False,
+            compute_linear_spec=True if c.model in ["Tacotron"] else False,
             meta_data=meta_data_eval if is_val else meta_data_train,
             ap=ap,
             tp=c.characters if 'characters' in c.keys() else None,

From 3293d4e05f3a790d1e3c7011cf459d5bb541ebc7 Mon Sep 17 00:00:00 2001
From: erogol
Date: Thu, 2 Apr 2020 13:06:19 +0200
Subject: [PATCH 040/104] bug fix to use tacotron
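
In CPython, `is` compares object identity rather than string equality, so
the previous check could never match: str.lower() builds a new string
object at runtime, which is never the same object as the interned literal
'tacotron'. A quick shell session illustrating the difference (`c_model`
stands in for the config value):

>>> c_model = "Tacotron"
>>> c_model.lower() == 'tacotron'
True
>>> c_model.lower() is 'tacotron'
False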
---
 train.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/train.py b/train.py
index 0bd1f2af..1fc997ef 100644
--- a/train.py
+++ b/train.py
@@ -47,7 +47,7 @@ def setup_loader(ap, r, is_val=False, verbose=False):
         dataset = MyDataset(
             r,
             c.text_cleaner,
-            compute_linear_spec=True if c.model.lower() is 'tacotron' else False,
+            compute_linear_spec=True if c.model.lower() == 'tacotron' else False,
             meta_data=meta_data_eval if is_val else meta_data_train,
             ap=ap,
             tp=c.characters if 'characters' in c.keys() else None,

From 668a69576357cd0382a432d85b87ef623c7b42d3 Mon Sep 17 00:00:00 2001
From: erogol
Date: Thu, 9 Apr 2020 12:28:52 +0200
Subject: [PATCH 041/104] bug fixes and consider the fmin fmax plotting specs

---
 config.json     |  2 +-
 utils/audio.py  |  2 +-
 utils/visual.py | 25 +++++++++++++++----------
 3 files changed, 17 insertions(+), 12 deletions(-)

diff --git a/config.json b/config.json
index 1b497646..e19ea9de 100644
--- a/config.json
+++ b/config.json
@@ -36,7 +36,7 @@
     "symmetric_norm": true, // move normalization to range [-1, 1]
     "max_norm": 1.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm]
     "clip_norm": true, // clip normalized values into the range.
-    "stats_path": "/home/erogol/Data/LJSpeech-1.1/scale_stats.npy" // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based normalization is used and other normalization params are ignored
+    "stats_path": null // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based normalization is used and other normalization params are ignored
 },

 // VOCABULARY PARAMETERS
diff --git a/utils/audio.py b/utils/audio.py
index be44cc42..27605800 100644
--- a/utils/audio.py
+++ b/utils/audio.py
@@ -269,7 +269,7 @@ class AudioProcessor(object):
         y = self._istft(S_complex * angles)
         return y

-    def compute_stft_paddings(x, pad_sides=1):
+    def compute_stft_paddings(self, x, pad_sides=1):
         '''compute right padding (final frame) or both sides padding (first and final frames)
         '''
         assert pad_sides in (1, 2)
diff --git a/utils/visual.py b/utils/visual.py
index b0db7b04..56b2ac76 100644
--- a/utils/visual.py
+++ b/utils/visual.py
@@ -40,8 +40,8 @@ def plot_spectrogram(linear_output, audio, fig_size=(16, 10)):
     return fig


-def visualize(alignment, spectrogram_postnet, stop_tokens, text, hop_length, CONFIG, spectrogram=None, output_path=None, figsize=[8, 24]):
-    if spectrogram is not None:
+def visualize(alignment, postnet_output, stop_tokens, text, hop_length, CONFIG, decoder_output=None, output_path=None, figsize=[8, 24]):
+    if decoder_output is not None:
         num_plot = 4
     else:
         num_plot = 3
@@ -53,30 +53,35 @@ def visualize(alignment, spectrogram_postnet, stop_tokens, text, hop_length, CON
     plt.imshow(alignment.T, aspect="auto", origin="lower", interpolation=None)
     plt.xlabel("Decoder timestamp", fontsize=label_fontsize)
     plt.ylabel("Encoder timestamp", fontsize=label_fontsize)
+    # compute phoneme representation and back
     if CONFIG.use_phonemes:
         seq = phoneme_to_sequence(text, [CONFIG.text_cleaner], CONFIG.phoneme_language, CONFIG.enable_eos_bos_chars, tp=CONFIG.characters if 'characters' in CONFIG.keys() else None)
         text = sequence_to_phoneme(seq, tp=CONFIG.characters if 'characters' in CONFIG.keys() else None)
         print(text)
-    plt.yticks(range(len(text)), list(text))
     plt.colorbar()
-
+    # plot stopnet predictions
     stop_tokens = stop_tokens.squeeze().detach().to('cpu').numpy()
     plt.subplot(num_plot, 1, 2)
     plt.plot(range(len(stop_tokens)), list(stop_tokens))
-
+    # plot postnet spectrogram
     plt.subplot(num_plot, 1, 3)
-    librosa.display.specshow(spectrogram_postnet.T, sr=CONFIG.audio['sample_rate'],
-                             hop_length=hop_length, x_axis="time", y_axis="linear")
+    librosa.display.specshow(postnet_output.T, sr=CONFIG.audio['sample_rate'],
+                             hop_length=hop_length, x_axis="time", y_axis="linear",
+                             fmin=CONFIG.audio['mel_fmin'],
+                             fmax=CONFIG.audio['mel_fmax'])
+
     plt.xlabel("Time", fontsize=label_fontsize)
     plt.ylabel("Hz", fontsize=label_fontsize)
     plt.tight_layout()
     plt.colorbar()

-    if spectrogram is not None:
+    if decoder_output is not None:
         plt.subplot(num_plot, 1, 4)
-        librosa.display.specshow(spectrogram.T, sr=CONFIG.audio['sample_rate'],
-                                 hop_length=hop_length, x_axis="time", y_axis="linear")
+        librosa.display.specshow(decoder_output.T, sr=CONFIG.audio['sample_rate'],
+                                 hop_length=hop_length, x_axis="time", y_axis="linear",
+                                 fmin=CONFIG.audio['mel_fmin'],
+                                 fmax=CONFIG.audio['mel_fmax'])
         plt.xlabel("Time", fontsize=label_fontsize)
         plt.ylabel("Hz", fontsize=label_fontsize)
         plt.tight_layout()

From 0e7ecca33f6bc2bd8a30d5d587774ea1cfce14e5 Mon Sep 17 00:00:00 2001
From: erogol
Date: Thu, 23 Apr 2020 14:14:09 +0200
Subject: [PATCH 042/104] fancier and more flexible (self adapting to
 loss_dict) console logging. Fixing multi-gpu loss reduce
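
The new console logger colors its output with standard ANSI SGR escape
sequences, collected in the `tcolors` dict below. As a minimal sketch of
how those escape strings compose on any ANSI-capable terminal (the step
and loss numbers here are made up for illustration):

>>> BOLD, OKGREEN, ENDC = '\033[1m', '\033[92m', '\033[0m'
>>> print(f"{BOLD} --> STEP: 25/320{ENDC}")            # bold header line
>>> print(f" | > avg_loss: {OKGREEN}0.51437{ENDC}")    # loss value in green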
---
 config.json                                |   1 +
 train.py                                   | 130 +++++++++------------
 utils/console_logger.py                    |  88 ++++++++++++++
 utils/generic_utils.py                     |  36 +++++-
 utils/{logger.py => tensorboard_logger.py} |   2 +-
 5 files changed, 179 insertions(+), 78 deletions(-)
 create mode 100644 utils/console_logger.py
 rename utils/{logger.py => tensorboard_logger.py} (98%)

diff --git a/config.json b/config.json
index e19ea9de..fa72b9ab 100644
--- a/config.json
+++ b/config.json
@@ -103,6 +103,7 @@
     // TENSORBOARD and LOGGING
     "print_step": 25, // Number of steps to log training on console.
+    "print_eval": false, // If True, it prints loss values in evaluation.
     "save_step": 10000, // Number of training steps expected to save training stats and checkpoints.
     "checkpoint": true, // If true, it saves checkpoints per "save_step"
     "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging.

diff --git a/train.py b/train.py
index 1fc997ef..46e2c43f 100644
--- a/train.py
+++ b/train.py
@@ -20,8 +20,9 @@ from TTS.utils.generic_utils import (
     get_git_branch, load_config, remove_experiment_folder, save_best_model,
     save_checkpoint, adam_weight_decay, set_init_dict, copy_config_file,
     setup_model, gradual_training_scheduler, KeepAverage,
-    set_weight_decay, check_config)
-from TTS.utils.logger import Logger
+    set_weight_decay, check_config, print_train_step)
+from TTS.utils.tensorboard_logger import TensorboardLogger
+from TTS.utils.console_logger import ConsoleLogger
 from TTS.utils.speakers import load_speaker_mapping, save_speaker_mapping, \
     get_speakers
 from TTS.utils.synthesis import synthesis
@@ -125,8 +126,8 @@ def train(model, criterion, optimizer, optimizer_st, scheduler,
     train_values = {
         'avg_postnet_loss': 0,
         'avg_decoder_loss': 0,
-        'avg_stop_loss': 0,
-        'avg_align_score': 0,
+        'avg_stopnet_loss': 0,
+        'avg_align_error': 0,
         'avg_step_time': 0,
         'avg_loader_time': 0,
         'avg_alignment_score': 0
@@ -138,13 +139,13 @@
         train_values['avg_ga_loss'] = 0  # guided attention loss
     keep_avg = KeepAverage()
     keep_avg.add_values(train_values)
-    print("\n > Epoch {}/{}".format(epoch, c.epochs), flush=True)
     if use_cuda:
         batch_n_iter = int(
             len(data_loader.dataset) / (c.batch_size * num_gpus))
     else:
         batch_n_iter = int(len(data_loader.dataset) / c.batch_size)
     end_time = time.time()
+    c_logger.print_train_start()
     for num_iter, data in enumerate(data_loader):
         start_time = time.time()
@@ -193,9 +194,10 @@
             grad_norm, grad_flag = check_update(model, c.grad_clip, ignore_stopnet=True)
             optimizer.step()

-            # compute alignment score
-            align_score = alignment_diagonal_score(alignments)
-            keep_avg.update_value('avg_align_score', align_score)
+            # compute alignment error (the lower the better)
+            align_error = 1 - alignment_diagonal_score(alignments)
+            keep_avg.update_value('avg_align_error', align_error)
+            loss_dict['align_error'] = align_error

         # backpass and check the grad norm for stop loss
         if c.separate_stopnet:
@@ -209,17 +211,22 @@
         step_time = time.time() - start_time
         epoch_time += step_time

+        # update avg stats
+        update_train_values = {
+            'avg_postnet_loss': float(loss_dict['postnet_loss'].item()),
+            'avg_decoder_loss': float(loss_dict['decoder_loss'].item()),
+            'avg_stopnet_loss': loss_dict['stopnet_loss'].item()
+            if isinstance(loss_dict['stopnet_loss'], float) else
float(loss_dict['stopnet_loss'].item()), + 'avg_step_time': step_time, + 'avg_loader_time': loader_time + } + keep_avg.update_values(update_train_values) + if global_step % c.print_step == 0: - print( - " | > Step:{}/{} GlobalStep:{} PostnetLoss:{:.5f} " - "DecoderLoss:{:.5f} StopLoss:{:.5f} GALoss:{:.5f} GradNorm:{:.5f} " - "GradNormST:{:.5f} AvgTextLen:{:.1f} AvgSpecLen:{:.1f} StepTime:{:.2f} " - "LoaderTime:{:.2f} LR:{:.6f}".format( - num_iter, batch_n_iter, global_step, loss_dict['postnet_loss'].item(), - loss_dict['decoder_loss'].item(), loss_dict['stopnet_loss'].item(), - loss_dict['ga_loss'].item(), grad_norm, grad_norm_st, avg_text_length, - avg_spec_length, step_time, loader_time, current_lr), - flush=True) + c_logger.print_train_step(batch_n_iter, num_iter, global_step, + avg_spec_length, avg_text_length, + step_time, loader_time, current_lr, + loss_dict, keep_avg.avg_values) # aggregate losses from processes if num_gpus > 1: @@ -230,16 +237,6 @@ def train(model, criterion, optimizer, optimizer_st, scheduler, num_gpus) if c.stopnet else loss_dict['stopnet_loss'] if args.rank == 0: - update_train_values = { - 'avg_postnet_loss': float(loss_dict['postnet_loss'].item()), - 'avg_decoder_loss': float(loss_dict['decoder_loss'].item()), - 'avg_stop_loss': loss_dict['stopnet_loss'].item() - if isinstance(loss_dict['stopnet_loss'], float) else float(loss_dict['stopnet_loss'].item()), - 'avg_step_time': step_time, - 'avg_loader_time': loader_time - } - keep_avg.update_values(update_train_values) - # Plot Training Iter Stats # reduce TB load if global_step % 10 == 0: @@ -289,23 +286,16 @@ def train(model, criterion, optimizer, optimizer_st, scheduler, end_time = time.time() # print epoch stats - print(" | > EPOCH END -- GlobalStep:{} " - "AvgPostnetLoss:{:.5f} AvgDecoderLoss:{:.5f} " - "AvgStopLoss:{:.5f} AvgGALoss:{:3f} EpochTime:{:.2f} " - "AvgStepTime:{:.2f} AvgLoaderTime:{:.2f}".format( - global_step, keep_avg['avg_postnet_loss'], - keep_avg['avg_decoder_loss'], keep_avg['avg_stop_loss'], - keep_avg['avg_ga_loss'], epoch_time, - keep_avg['avg_step_time'], keep_avg['avg_loader_time']), - flush=True) + c_logger.print_train_epoch_end(global_step, epoch, epoch_time, keep_avg) + # Plot Epoch Stats if args.rank == 0: # Plot Training Epoch Stats epoch_stats = { "loss_postnet": keep_avg['avg_postnet_loss'], "loss_decoder": keep_avg['avg_decoder_loss'], - "stop_loss": keep_avg['avg_stop_loss'], - "alignment_score": keep_avg['avg_align_score'], + "stopnet_loss": keep_avg['avg_stopnet_loss'], + "alignment_score": keep_avg['avg_align_error'], "epoch_time": epoch_time } if c.ga_alpha > 0: @@ -313,7 +303,7 @@ def train(model, criterion, optimizer, optimizer_st, scheduler, tb_logger.tb_train_epoch_stats(global_step, epoch_stats) if c.tb_model_param_stats: tb_logger.tb_model_weights(model, global_step) - return keep_avg['avg_postnet_loss'], global_step + return keep_avg.avg_values, global_step @torch.no_grad() @@ -326,8 +316,8 @@ def evaluate(model, criterion, ap, global_step, epoch): eval_values_dict = { 'avg_postnet_loss': 0, 'avg_decoder_loss': 0, - 'avg_stop_loss': 0, - 'avg_align_score': 0 + 'avg_stopnet_loss': 0, + 'avg_align_error': 0 } if c.bidirectional_decoder: eval_values_dict['avg_decoder_b_loss'] = 0 # decoder backward loss @@ -336,8 +326,8 @@ def evaluate(model, criterion, ap, global_step, epoch): eval_values_dict['avg_ga_loss'] = 0 # guidede attention loss keep_avg = KeepAverage() keep_avg.add_values(eval_values_dict) - print("\n > Validation") + c_logger.print_eval_start() if 
data_loader is not None: for num_iter, data in enumerate(data_loader): start_time = time.time() @@ -377,40 +367,27 @@ def evaluate(model, criterion, ap, global_step, epoch): epoch_time += step_time # compute alignment score - align_score = alignment_diagonal_score(alignments) - keep_avg.update_value('avg_align_score', align_score) + align_error = 1 - alignment_diagonal_score(alignments) + keep_avg.update_value('avg_align_error', align_error) # aggregate losses from processes if num_gpus > 1: - postnet_loss = reduce_tensor(postnet_loss.data, num_gpus) - decoder_loss = reduce_tensor(decoder_loss.data, num_gpus) + postnet_loss = reduce_tensor(loss_dict['postnet_loss'].data, num_gpus) + decoder_loss = reduce_tensor(loss_dict['decoder_loss'].data, num_gpus) if c.stopnet: - stop_loss = reduce_tensor(stop_loss.data, num_gpus) + stopnet_loss = reduce_tensor(loss_dict['stopnet_loss'].data, num_gpus) keep_avg.update_values({ 'avg_postnet_loss': float(loss_dict['postnet_loss'].item()), 'avg_decoder_loss': float(loss_dict['decoder_loss'].item()), - 'avg_stop_loss': + 'avg_stopnet_loss': float(loss_dict['stopnet_loss'].item()), }) - if num_iter % c.print_step == 0: - print( - " | > TotalLoss: {:.5f} PostnetLoss: {:.5f} - {:.5f} DecoderLoss:{:.5f} - {:.5f} " - "StopLoss: {:.5f} - {:.5f} GALoss: {:.5f} - {:.5f} AlignScore: {:.4f} - {:.4f}" - .format(loss_dict['loss'].item(), - loss_dict['postnet_loss'].item(), - keep_avg['avg_postnet_loss'], - loss_dict['decoder_loss'].item(), - keep_avg['avg_decoder_loss'], - loss_dict['stopnet_loss'].item(), - keep_avg['avg_stop_loss'], - loss_dict['ga_loss'].item(), - keep_avg['avg_ga_loss'], - align_score, keep_avg['avg_align_score']), - flush=True) + if c.print_eval: + c_logger.print_eval_step(num_iter, loss_dict, keep_avg.avg_values) if args.rank == 0: # Diagnostic visualizations @@ -439,8 +416,8 @@ def evaluate(model, criterion, ap, global_step, epoch): epoch_stats = { "loss_postnet": keep_avg['avg_postnet_loss'], "loss_decoder": keep_avg['avg_decoder_loss'], - "stop_loss": keep_avg['avg_stop_loss'], - "alignment_score": keep_avg['avg_align_score'], + "stopnet_loss": keep_avg['avg_stopnet_loss'], + "alignment_score": keep_avg['avg_align_error'], } if c.bidirectional_decoder: @@ -501,7 +478,7 @@ def evaluate(model, criterion, ap, global_step, epoch): tb_logger.tb_test_audios(global_step, test_audios, c.audio['sample_rate']) tb_logger.tb_test_figures(global_step, test_figures) - return keep_avg['avg_postnet_loss'] + return keep_avg.avg_values # FIXME: move args definition/parsing inside of main? 
@@ -603,6 +580,7 @@ def main(args): # pylint: disable=redefined-outer-name global_step = args.restore_step for epoch in range(0, c.epochs): + c_logger.print_epoch_start(epoch, c.epochs) # set gradual training if c.gradual_training is not None: r, c.batch_size = gradual_training_scheduler(global_step, c) @@ -610,18 +588,16 @@ def main(args): # pylint: disable=redefined-outer-name model.decoder.set_r(r) if c.bidirectional_decoder: model.decoder_backward.set_r(r) - print(" > Number of outputs per iteration:", model.decoder.r) + print("\n > Number of output frames:", model.decoder.r) - train_loss, global_step = train(model, criterion, optimizer, + train_avg_loss_dict, global_step = train(model, criterion, optimizer, optimizer_st, scheduler, ap, global_step, epoch) - val_loss = evaluate(model, criterion, ap, global_step, epoch) - print(" | > Training Loss: {:.5f} Validation Loss: {:.5f}".format( - train_loss, val_loss), - flush=True) - target_loss = train_loss + eval_avg_loss_dict = evaluate(model, criterion, ap, global_step, epoch) + c_logger.print_epoch_end(epoch, eval_avg_loss_dict) + target_loss = train_avg_loss_dict['avg_postnet_loss'] if c.run_eval: - target_loss = val_loss + target_loss = eval_avg_loss_dict['avg_postnet_loss'] best_loss = save_best_model(model, optimizer, target_loss, best_loss, OUT_PATH, global_step, epoch) @@ -681,6 +657,8 @@ if __name__ == '__main__': AUDIO_PATH = os.path.join(OUT_PATH, 'test_audios') + c_logger = ConsoleLogger() + if args.rank == 0: os.makedirs(AUDIO_PATH, exist_ok=True) new_fields = {} @@ -693,7 +671,7 @@ if __name__ == '__main__': os.chmod(OUT_PATH, 0o775) LOG_DIR = OUT_PATH - tb_logger = Logger(LOG_DIR) + tb_logger = TensorboardLogger(LOG_DIR) # write model desc to tensorboard tb_logger.tb_add_text('model-description', c['run_description'], 0) diff --git a/utils/console_logger.py b/utils/console_logger.py new file mode 100644 index 00000000..fad963fd --- /dev/null +++ b/utils/console_logger.py @@ -0,0 +1,88 @@ +import datetime +from TTS.utils.generic_utils import AttrDict + + +tcolors = AttrDict({ + 'OKBLUE': '\033[94m', + 'HEADER': '\033[95m', + 'OKGREEN': '\033[92m', + 'WARNING': '\033[93m', + 'FAIL': '\033[91m', + 'ENDC': '\033[0m', + 'BOLD': '\033[1m', + 'UNDERLINE': '\033[4m' +}) + + +class ConsoleLogger(): + def __init__(self): + # TODO: color code for value changes + # use these to compare values between iterations + self.old_train_loss_dict = None + self.old_epoch_loss_dict = None + self.old_eval_loss_dict = None + + def get_time(self): + now = datetime.datetime.now() + return now.strftime("%Y-%m-%d %H:%M:%S") + + def print_epoch_start(self, epoch, max_epoch): + print("\n{}{} > EPOCH: {}/{}{}".format(tcolors.UNDERLINE, tcolors.BOLD, + epoch, max_epoch, tcolors.ENDC), + flush=True) + + def print_train_start(self): + print(f"\n{tcolors.BOLD} > TRAINING ({self.get_time()}) {tcolors.ENDC}") + + def print_train_step(self, batch_steps, step, global_step, avg_spec_length, + avg_text_length, step_time, loader_time, lr, + loss_dict, avg_loss_dict): + indent = " | > " + print() + log_text = "{} --> STEP: {}/{} -- GLOBAL_STEP: {}{}\n".format( + tcolors.BOLD, step, batch_steps, global_step, tcolors.ENDC) + for key, value in loss_dict.items(): + # print the avg value if given + if f'avg_{key}' in avg_loss_dict.keys(): + log_text += "{}{}: {:.5f} ({:.5f})\n".format(indent, key, value, avg_loss_dict[f'avg_{key}']) + else: + log_text += "{}{}: {:.5f} \n".format(indent, key, value) + log_text += "{}avg_spec_len: {}\n{}avg_text_len: {}\n{}step_time: 
{:.2f}\n{}loader_time: {:.2f}\n{}lr: {:.5f}"\ + .format(indent, avg_spec_length, indent, avg_text_length, indent, step_time, indent, loader_time, indent, lr) + print(log_text, flush=True) + + def print_train_epoch_end(self, global_step, epoch, epoch_time, + print_dict): + indent = " | > " + log_text = f"\n{tcolors.BOLD} --> TRAIN PERFORMACE -- EPOCH TIME: {epoch} sec -- GLOBAL_STEP: {global_step}{tcolors.ENDC}\n" + for key, value in print_dict.items(): + log_text += "{}{}: {:.5f}\n".format(indent, key, value) + print(log_text, flush=True) + + def print_eval_start(self): + print(f"{tcolors.BOLD} > EVALUATION {tcolors.ENDC}\n") + + def print_eval_step(self, step, loss_dict, avg_loss_dict): + indent = " | > " + log_text = f"{tcolors.BOLD} --> STEP: {step}{tcolors.ENDC}\n" + for key, value in loss_dict.items(): + # print the avg value if given + if f'avg_{key}' in avg_loss_dict.keys(): + log_text += "{}{}: {:.5f} ({:.5f})\n".format(indent, key, value, avg_loss_dict[f'avg_{key}']) + else: + log_text += "{}{}: {:.5f} \n".format(indent, key, value) + print(log_text, flush=True) + + def print_epoch_end(self, epoch, avg_loss_dict): + indent = " | > " + log_text = " {}--> EVAL PERFORMANCE{}\n".format( + tcolors.BOLD, tcolors.ENDC) + for key, value in avg_loss_dict.items(): + # print the avg value if given + color = tcolors.OKGREEN + if self.old_eval_loss_dict is not None: + if self.old_eval_loss_dict[key] > value: + color = tcolors.FAIL + log_text += "{}{}:{} {:.5f} \n{}".format(indent, key, color, value, tcolors.ENDC) + self.old_eval_loss_dict = avg_loss_dict + print(log_text, flush=True) diff --git a/utils/generic_utils.py b/utils/generic_utils.py index 9af0bb70..55f0b38f 100644 --- a/utils/generic_utils.py +++ b/utils/generic_utils.py @@ -144,7 +144,7 @@ def save_best_model(model, optimizer, model_loss, best_loss, out_path, best_loss = model_loss bestmodel_path = 'best_model.pth.tar' bestmodel_path = os.path.join(out_path, bestmodel_path) - print("\n > BEST MODEL ({0:.5f}) : {1:}".format( + print(" > BEST MODEL ({0:.5f}) : {1:}".format( model_loss, bestmodel_path)) torch.save(state, bestmodel_path) return best_loss @@ -368,6 +368,9 @@ class KeepAverage(): def __getitem__(self, key): return self.avg_values[key] + def items(self): + return self.avg_values.items() + def add_value(self, name, init_val=0, init_iter=0): self.avg_values[name] = init_val self.iters[name] = init_iter @@ -407,6 +410,37 @@ def _check_argument(name, c, enum_list=None, max_val=None, min_val=None, restric assert isinstance(c[name], val_type) or c[name] is None, f' [!] 
{name} has wrong type - {type(c[name])} vs {val_type}' +tcolors = AttrDict({ + 'OKBLUE': '\033[94m', + 'HEADER': '\033[95m', + 'OKGREEN': '\033[92m', + 'WARNING': '\033[93m', + 'FAIL': '\033[91m', + 'ENDC': '\033[0m', + 'BOLD': '\033[1m', + 'UNDERLINE': '\033[4m' +}) + + +def print_train_step(batch_steps, step, global_step, avg_spec_length, avg_text_length, step_time, loader_time, lr, print_dict): + indent = " | > " + print() + log_text = "{} --> STEP: {}/{} -- GLOBAL_STEP: {}{}\n".format(tcolors.BOLD, step, batch_steps, global_step, tcolors.ENDC) + for key, value in print_dict.items(): + log_text += "{}{}: {:.5f}\n".format(indent, key, value) + log_text += "{}avg_spec_len: {}\n{}avg_text_len: {}\n{}step_time: {:.2f}\n{}loader_time: {:.2f}\n{}lr: {:.5f}"\ + .format(indent, avg_spec_length, indent, avg_text_length, indent, step_time, indent, loader_time, indent, lr) + print(log_text, flush=True) + + +def print_train_epoch(step, global_step, epoch, loss_dict): + pass + + +def print_eval_step(): + pass + + def check_config(c): _check_argument('model', c, enum_list=['tacotron', 'tacotron2'], restricted=True, val_type=str) _check_argument('run_name', c, restricted=True, val_type=str) diff --git a/utils/logger.py b/utils/tensorboard_logger.py similarity index 98% rename from utils/logger.py rename to utils/tensorboard_logger.py index e5faeda4..15fe04e4 100644 --- a/utils/logger.py +++ b/utils/tensorboard_logger.py @@ -2,7 +2,7 @@ import traceback from tensorboardX import SummaryWriter -class Logger(object): +class TensorboardLogger(object): def __init__(self, log_dir): self.writer = SummaryWriter(log_dir) self.train_stats = {} From d5093bf6fb4f6f1474930a626478a126d3909f62 Mon Sep 17 00:00:00 2001 From: erogol Date: Thu, 23 Apr 2020 14:24:38 +0200 Subject: [PATCH 043/104] checkpoint log --- utils/generic_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/generic_utils.py b/utils/generic_utils.py index 55f0b38f..c82f34ad 100644 --- a/utils/generic_utils.py +++ b/utils/generic_utils.py @@ -113,7 +113,7 @@ def save_checkpoint(model, optimizer, optimizer_st, model_loss, out_path, current_step, epoch): checkpoint_path = 'checkpoint_{}.pth.tar'.format(current_step) checkpoint_path = os.path.join(out_path, checkpoint_path) - print(" | | > Checkpoint saving : {}".format(checkpoint_path)) + print(" > CHECKPOINT : {}".format(checkpoint_path)) new_state_dict = model.state_dict() state = { From 6e2c8c6537b4bf5bceadf8efb8f5fc5889e9fb97 Mon Sep 17 00:00:00 2001 From: erogol Date: Thu, 23 Apr 2020 14:37:12 +0200 Subject: [PATCH 044/104] update config.json --- config.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config.json b/config.json index fa72b9ab..da3fe286 100644 --- a/config.json +++ b/config.json @@ -1,7 +1,7 @@ { "model": "Tacotron2", "run_name": "ljspeech", - "run_description": "tacotron2 with guided attention and -1 1 normalization and no preemphasis", + "run_description": "tacotron2", // AUDIO PARAMETERS "audio":{ From 3673cc1e30f38137afdcb1be3160ef6dff4127d7 Mon Sep 17 00:00:00 2001 From: erogol Date: Thu, 23 Apr 2020 15:46:11 +0200 Subject: [PATCH 045/104] passing reduced losses to loss dict --- train.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/train.py b/train.py index 46e2c43f..328e81e8 100644 --- a/train.py +++ b/train.py @@ -372,10 +372,10 @@ def evaluate(model, criterion, ap, global_step, epoch): # aggregate losses from processes if num_gpus > 1: - postnet_loss = reduce_tensor(loss_dict['postnet_loss'].data, 
num_gpus) - decoder_loss = reduce_tensor(loss_dict['decoder_loss'].data, num_gpus) + loss_dict['postnet_loss'] = reduce_tensor(loss_dict['postnet_loss'].data, num_gpus) + loss_dict['decoder_loss'] = reduce_tensor(loss_dict['decoder_loss'].data, num_gpus) if c.stopnet: - stopnet_loss = reduce_tensor(loss_dict['stopnet_loss'].data, num_gpus) + loss_dict['stopnet_loss'] = reduce_tensor(loss_dict['stopnet_loss'].data, num_gpus) keep_avg.update_values({ 'avg_postnet_loss': From f63bce89f6686d01c1c107acd9b0d1973e85b6d4 Mon Sep 17 00:00:00 2001 From: erogol Date: Thu, 23 Apr 2020 15:46:45 +0200 Subject: [PATCH 046/104] pylint fix --- datasets/TTSDataset.py | 2 +- tests/test_audio.py | 3 ++- train.py | 9 ++++----- utils/audio.py | 19 ++++++++++--------- utils/console_logger.py | 6 ++++-- utils/generic_utils.py | 16 +++++----------- utils/visual.py | 2 +- 7 files changed, 27 insertions(+), 30 deletions(-) diff --git a/datasets/TTSDataset.py b/datasets/TTSDataset.py index ae75f3cf..0d884c00 100644 --- a/datasets/TTSDataset.py +++ b/datasets/TTSDataset.py @@ -224,7 +224,7 @@ class MyDataset(Dataset): mel_lengths = torch.LongTensor(mel_lengths) stop_targets = torch.FloatTensor(stop_targets) - # compute linear spectrogram + # compute linear spectrogram if self.compute_linear_spec: linear = [self.ap.spectrogram(w).astype('float32') for w in wav] linear = prepare_tensor(linear, self.outputs_per_step) diff --git a/tests/test_audio.py b/tests/test_audio.py index 9cc849b5..2ede77ce 100644 --- a/tests/test_audio.py +++ b/tests/test_audio.py @@ -13,6 +13,7 @@ os.makedirs(OUT_PATH, exist_ok=True) conf = load_config(os.path.join(TESTS_PATH, 'test_config.json')) +# pylint: disable=protected-access class TestAudio(unittest.TestCase): def __init__(self, *args, **kwargs): super(TestAudio, self).__init__(*args, **kwargs) @@ -165,7 +166,7 @@ class TestAudio(unittest.TestCase): self.ap.signal_norm = False self.ap.preemphasis = 0.0 - + # test scaler forward and backward transforms wav = self.ap.load_wav(WAV_FILE) mel_reference = self.ap.melspectrogram(wav) diff --git a/train.py b/train.py index 328e81e8..94ccfedb 100644 --- a/train.py +++ b/train.py @@ -7,7 +7,6 @@ import traceback import numpy as np import torch -import torch.nn as nn from torch.utils.data import DataLoader from TTS.datasets.TTSDataset import MyDataset @@ -20,7 +19,7 @@ from TTS.utils.generic_utils import ( get_git_branch, load_config, remove_experiment_folder, save_best_model, save_checkpoint, adam_weight_decay, set_init_dict, copy_config_file, setup_model, gradual_training_scheduler, KeepAverage, - set_weight_decay, check_config, print_train_step) + set_weight_decay, check_config) from TTS.utils.tensorboard_logger import TensorboardLogger from TTS.utils.console_logger import ConsoleLogger from TTS.utils.speakers import load_speaker_mapping, save_speaker_mapping, \ @@ -215,7 +214,7 @@ def train(model, criterion, optimizer, optimizer_st, scheduler, update_train_values = { 'avg_postnet_loss': float(loss_dict['postnet_loss'].item()), 'avg_decoder_loss': float(loss_dict['decoder_loss'].item()), - 'avg_stopnet_loss': loss_dict['stopnet_loss'].item() + 'avg_stopnet_loss': loss_dict['stopnet_loss'].item() \ if isinstance(loss_dict['stopnet_loss'], float) else float(loss_dict['stopnet_loss'].item()), 'avg_step_time': step_time, 'avg_loader_time': loader_time @@ -591,8 +590,8 @@ def main(args): # pylint: disable=redefined-outer-name print("\n > Number of output frames:", model.decoder.r) train_avg_loss_dict, global_step = train(model, criterion, optimizer, - 
optimizer_st, scheduler, ap, - global_step, epoch) + optimizer_st, scheduler, ap, + global_step, epoch) eval_avg_loss_dict = evaluate(model, criterion, ap, global_step, epoch) c_logger.print_epoch_end(epoch, eval_avg_loss_dict) target_loss = train_avg_loss_dict['avg_postnet_loss'] diff --git a/utils/audio.py b/utils/audio.py index 27605800..413b6163 100644 --- a/utils/audio.py +++ b/utils/audio.py @@ -72,7 +72,7 @@ class AudioProcessor(object): # setup scaler if stats_path: mel_mean, mel_std, linear_mean, linear_std, _ = self.load_stats(stats_path) - self.setup_scaler(mel_mean, mel_std, linear_mean,linear_std) + self.setup_scaler(mel_mean, mel_std, linear_mean, linear_std) self.signal_norm = True self.max_norm = None self.clip_norm = None @@ -107,7 +107,7 @@ class AudioProcessor(object): # mean-var scaling if hasattr(self, 'mel_scaler'): if S.shape[0] == self.num_mels: - return self.mel_scaler.transform(S.T).T + return self.mel_scaler.transform(S.T).T elif S.shape[0] == self.n_fft / 2: return self.linear_scaler.transform(S.T).T else: @@ -136,7 +136,7 @@ class AudioProcessor(object): # mean-var scaling if hasattr(self, 'mel_scaler'): if S_denorm.shape[0] == self.num_mels: - return self.mel_scaler.inverse_transform(S_denorm.T).T + return self.mel_scaler.inverse_transform(S_denorm.T).T elif S_denorm.shape[0] == self.n_fft / 2: return self.linear_scaler.inverse_transform(S_denorm.T).T else: @@ -168,10 +168,10 @@ class AudioProcessor(object): for key in stats_config.keys(): if key in skip_parameters: continue - assert stats_config[key] == self.__dict__[ - key], f" [!] Audio param {key} does not match the value used for computing mean-var stats. {stats_config[key]} vs {self.__dict__[key]}" + assert stats_config[key] == self.__dict__[key],\ + f" [!] Audio param {key} does not match the value used for computing mean-var stats. 
{stats_config[key]} vs {self.__dict__[key]}" return mel_mean, mel_std, linear_mean, linear_std, stats_config - + # pylint: disable=attribute-defined-outside-init def setup_scaler(self, mel_mean, mel_std, linear_mean, linear_std): self.mel_scaler = StandardScaler() @@ -180,9 +180,11 @@ class AudioProcessor(object): self.linear_scaler.set_stats(linear_mean, linear_std) ### DB and AMP conversion ### + # pylint: disable=no-self-use def _amp_to_db(self, x): return 20 * np.log10(np.maximum(1e-5, x)) + # pylint: disable=no-self-use def _db_to_amp(self, x): return np.power(10.0, x * 0.05) @@ -269,15 +271,14 @@ class AudioProcessor(object): y = self._istft(S_complex * angles) return y - def compute_stft_paddings(self,x, pad_sides=1): + def compute_stft_paddings(self, x, pad_sides=1): '''compute right padding (final frame) or both sides padding (first and final frames) ''' assert pad_sides in (1, 2) pad = (x.shape[0] // self.hop_length + 1) * self.hop_length - x.shape[0] if pad_sides == 1: return 0, pad - else: - return pad // 2, pad // 2 + pad % 2 + return pad // 2, pad // 2 + pad % 2 ### Audio Processing ### def find_endpoint(self, wav, threshold_db=-40, min_silence_sec=0.8): diff --git a/utils/console_logger.py b/utils/console_logger.py index fad963fd..8786120d 100644 --- a/utils/console_logger.py +++ b/utils/console_logger.py @@ -22,6 +22,7 @@ class ConsoleLogger(): self.old_epoch_loss_dict = None self.old_eval_loss_dict = None + # pylint: disable=no-self-use def get_time(self): now = datetime.datetime.now() return now.strftime("%Y-%m-%d %H:%M:%S") @@ -47,10 +48,11 @@ class ConsoleLogger(): log_text += "{}{}: {:.5f} ({:.5f})\n".format(indent, key, value, avg_loss_dict[f'avg_{key}']) else: log_text += "{}{}: {:.5f} \n".format(indent, key, value) - log_text += "{}avg_spec_len: {}\n{}avg_text_len: {}\n{}step_time: {:.2f}\n{}loader_time: {:.2f}\n{}lr: {:.5f}"\ - .format(indent, avg_spec_length, indent, avg_text_length, indent, step_time, indent, loader_time, indent, lr) + log_text += f"{indent}avg_spec_len: {avg_spec_length}\n{indent}avg_text_len: {avg_text_length}\n{indent}\ + step_time: {step_time:.2f}\n{indent}loader_time: {loader_time:.2f}\n{indent}lr: {lr:.5f}" print(log_text, flush=True) + # pylint: disable=unused-argument def print_train_epoch_end(self, global_step, epoch, epoch_time, print_dict): indent = " | > " diff --git a/utils/generic_utils.py b/utils/generic_utils.py index c82f34ad..5d91d74d 100644 --- a/utils/generic_utils.py +++ b/utils/generic_utils.py @@ -369,7 +369,7 @@ class KeepAverage(): return self.avg_values[key] def items(self): - return self.avg_values.items() + return self.avg_values.items() def add_value(self, name, init_val=0, init_iter=0): self.avg_values[name] = init_val @@ -412,7 +412,7 @@ def _check_argument(name, c, enum_list=None, max_val=None, min_val=None, restric tcolors = AttrDict({ 'OKBLUE': '\033[94m', - 'HEADER': '\033[95m', + 'HEADER': '\033[95m', 'OKGREEN': '\033[92m', 'WARNING': '\033[93m', 'FAIL': '\033[91m', @@ -428,17 +428,10 @@ def print_train_step(batch_steps, step, global_step, avg_spec_length, avg_text_l log_text = "{} --> STEP: {}/{} -- GLOBAL_STEP: {}{}\n".format(tcolors.BOLD, step, batch_steps, global_step, tcolors.ENDC) for key, value in print_dict.items(): log_text += "{}{}: {:.5f}\n".format(indent, key, value) - log_text += "{}avg_spec_len: {}\n{}avg_text_len: {}\n{}step_time: {:.2f}\n{}loader_time: {:.2f}\n{}lr: {:.5f}"\ + log_text += f"{indent}avg_spec_len: {avg_spec_length}\n{indent}avg_text_len: {avg_text_length}\ + \n{indent}step_time: 
{step_time:.2f}\n{indent}loader_time: {loader_time:.2f}\n{indent}lr: {lr:.5f}"\
         .format(indent, avg_spec_length, indent, avg_text_length, indent, step_time, indent, loader_time, indent, lr)
     print(log_text, flush=True)
-
-
-def print_train_epoch(step, global_step, epoch, loss_dict):
-    pass
-
-
-def print_eval_step():
-    pass


 def check_config(c):
@@ -530,6 +523,7 @@ def check_config(c):
     _check_argument('tb_model_param_stats', c, restricted=True, val_type=bool)

     # dataloading
+    # pylint: disable=import-outside-toplevel
     from TTS.utils.text import cleaners
     _check_argument('text_cleaner', c, restricted=True, val_type=str, enum_list=dir(cleaners))
     _check_argument('enable_eos_bos_chars', c, restricted=True, val_type=bool)
diff --git a/utils/visual.py b/utils/visual.py
index 56b2ac76..8789cf8f 100644
--- a/utils/visual.py
+++ b/utils/visual.py
@@ -40,7 +40,7 @@ def plot_spectrogram(linear_output, audio, fig_size=(16, 10)):
     return fig


-def visualize(alignment, postnet_output, stop_tokens, text, hop_length, CONFIG, decoder_output=None, output_path=None, figsize=[8, 24]):
+def visualize(alignment, postnet_output, stop_tokens, text, hop_length, CONFIG, decoder_output=None, output_path=None, figsize=(8, 24)):
     if decoder_output is not None:
         num_plot = 4
     else:

From 6e2c8c6537b4bf5bceadf8efb8f5fc5889e9fb97 Mon Sep 17 00:00:00 2001
From: erogol
Date: Thu, 23 Apr 2020 16:06:12 +0200
Subject: [PATCH 047/104] adding a dummy normalization file for testing
 mean-var norm
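
The stats file bundles per-bin means and standard deviations for the mel
and linear spectrograms, plus the audio config they were computed with.
A dummy file with the same assumed layout could be generated roughly like
this (key names taken from load_stats() in utils/audio.py; the values and
bin counts here are placeholders, not real dataset statistics):

import numpy as np

num_mels, num_freq = 80, 513  # illustrative bin counts
stats = {
    'mel_mean': np.zeros(num_mels),     # per mel-band mean
    'mel_std': np.ones(num_mels),       # per mel-band std
    'linear_mean': np.zeros(num_freq),  # per STFT-bin mean
    'linear_std': np.ones(num_freq),    # per STFT-bin std
    'audio_config': {'num_mels': num_mels, 'num_freq': num_freq},
}
np.save('tests/inputs/scale_stats.npy', stats)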
---
 tests/inputs/scale_stats.npy | Bin 0 -> 10479 bytes
 1 file changed, 0 insertions(+), 0 deletions(-)
 create mode 100644 tests/inputs/scale_stats.npy

diff --git a/tests/inputs/scale_stats.npy b/tests/inputs/scale_stats.npy
new file mode 100644
index 0000000000000000000000000000000000000000..5368ecb25445401efae40d8dd8c640aec15fd9c4
GIT binary patch
literal 10479
[10479 bytes of base85-encoded binary data omitted]

literal 0
HcmV?d00001
From 373a682c08f739b68989d3bd67c3f75c6f8bbbe7 Mon Sep 17 00:00:00 2001
From: erogol
Date: Fri, 24 Apr 2020 18:51:15 +0200
Subject: [PATCH 048/104] logging fix

---
 utils/console_logger.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/utils/console_logger.py b/utils/console_logger.py
index 8786120d..789dc780 100644
--- a/utils/console_logger.py
+++ b/utils/console_logger.py
@@ -48,8 +48,8 @@ class ConsoleLogger():
             log_text += "{}{}: {:.5f} ({:.5f})\n".format(indent, key, value, avg_loss_dict[f'avg_{key}'])
         else:
             log_text += "{}{}: {:.5f} \n".format(indent, key, value)
-        log_text += f"{indent}avg_spec_len: {avg_spec_length}\n{indent}avg_text_len: {avg_text_length}\n{indent}\
-            step_time: {step_time:.2f}\n{indent}loader_time: {loader_time:.2f}\n{indent}lr: {lr:.5f}"
+        log_text += f"{indent}avg_spec_len: {avg_spec_length}\n{indent}avg_text_len: {avg_text_length}\n{indent}"\
+            f"step_time: {step_time:.2f}\n{indent}loader_time: {loader_time:.2f}\n{indent}lr: {lr:.5f}"
         print(log_text, flush=True)

     # pylint: disable=unused-argument

From 61a1d59ac5fe693320360cb7bf913d87e70a87bf Mon Sep 17 00:00:00 2001
From: PNRxA
Date: Sat, 25 Apr 2020 16:30:19 +1000
Subject: [PATCH 049/104] numpy to use CPU when using CUDA

---
 utils/generic_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/utils/generic_utils.py b/utils/generic_utils.py
index 5d91d74d..435d2b10 100644
--- a/utils/generic_utils.py
+++ b/utils/generic_utils.py
@@ -157,7 +157,7 @@ def check_update(model, grad_clip, ignore_stopnet=False):
         grad_norm = torch.nn.utils.clip_grad_norm_([param for name, param in model.named_parameters() if 'stopnet' not in name], grad_clip)
     else:
         grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
-    if np.isinf(grad_norm):
+    if np.isinf(grad_norm.cpu()):
         print(" | > Gradient is INF !!")
         skip_flag = True
     return grad_norm, skip_flag

From 091711459d12492d381357eec144f28c9ff4dce7 Mon Sep 17 00:00:00 2001
From: erogol
Date: Wed, 29 Apr 2020 11:58:26 +0200
Subject: [PATCH 050/104] remove redundant avg keeper

---
 train.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/train.py b/train.py
index 94ccfedb..33a274dd 100644
--- a/train.py
+++ b/train.py
@@ -128,8 +128,7 @@ def train(model, criterion, optimizer, optimizer_st, scheduler,
         'avg_stopnet_loss': 0,
         'avg_align_error': 0,
         'avg_step_time': 0,
-        'avg_loader_time': 0,
-        'avg_alignment_score': 0
+        'avg_loader_time': 0
     }
     if c.bidirectional_decoder:
         train_values['avg_decoder_b_loss'] = 0  # decoder backward loss

From 67420eeb86c63e611bfb97c843f49388781db537 Mon Sep 17 00:00:00 2001
From: erogol
Date: Wed, 29 Apr 2020 11:58:51 +0200
Subject: [PATCH 051/104] more console printing formatting

---
 train.py                | 2 +-
 utils/console_logger.py | 5 +++--
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/train.py b/train.py
index 33a274dd..84648636 100644
--- a/train.py
+++ b/train.py
@@ -586,7 +586,7 @@ def main(args):  # pylint: disable=redefined-outer-name
             model.decoder.set_r(r)
             if
c.bidirectional_decoder: model.decoder_backward.set_r(r) - print("\n > Number of output frames:", model.decoder.r) + print("\n > Number of output frames:", model.decoder.r) train_avg_loss_dict, global_step = train(model, criterion, optimizer, optimizer_st, scheduler, ap, diff --git a/utils/console_logger.py b/utils/console_logger.py index 789dc780..ec3f50c3 100644 --- a/utils/console_logger.py +++ b/utils/console_logger.py @@ -62,10 +62,11 @@ class ConsoleLogger(): print(log_text, flush=True) def print_eval_start(self): - print(f"{tcolors.BOLD} > EVALUATION {tcolors.ENDC}\n") + print(f"{tcolors.BOLD} > EVALUATION {tcolors.ENDC}") def print_eval_step(self, step, loss_dict, avg_loss_dict): indent = " | > " + print() log_text = f"{tcolors.BOLD} --> STEP: {step}{tcolors.ENDC}\n" for key, value in loss_dict.items(): # print the avg value if given @@ -77,7 +78,7 @@ class ConsoleLogger(): def print_epoch_end(self, epoch, avg_loss_dict): indent = " | > " - log_text = " {}--> EVAL PERFORMANCE{}\n".format( + log_text = "\n {}--> EVAL PERFORMANCE{}\n".format( tcolors.BOLD, tcolors.ENDC) for key, value in avg_loss_dict.items(): # print the avg value if given From e4e29f716e5bc2049d3316896401fae64933a716 Mon Sep 17 00:00:00 2001 From: Edresson Casanova Date: Mon, 4 May 2020 17:39:35 -0300 Subject: [PATCH 052/104] fix bug in bidirectional decoder train --- train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/train.py b/train.py index 94ccfedb..1444e103 100644 --- a/train.py +++ b/train.py @@ -356,7 +356,7 @@ def evaluate(model, criterion, ap, global_step, epoch): mel_lengths, decoder_backward_output, alignments, alignment_lengths, text_lengths) if c.bidirectional_decoder: - keep_avg.update_values({'avg_decoder_b_loss': loss_dict['decoder_backward_loss'].item(), + keep_avg.update_values({'avg_decoder_b_loss': loss_dict['decoder_b_loss'].item(), 'avg_decoder_c_loss': loss_dict['decoder_c_loss'].item()}) if c.ga_alpha > 0: keep_avg.update_values({'avg_ga_loss': loss_dict['ga_loss'].item()}) From cce13ee245fab03717d2afa1860e35986ebd9de9 Mon Sep 17 00:00:00 2001 From: Edresson Casanova Date: Mon, 4 May 2020 17:52:58 -0300 Subject: [PATCH 053/104] Fix bug in Graves Attn On my machine at Graves attention the variable self.J ( self.J = torch.arange(0, inputs.shape[1]+2).to(inputs.device) + 0.5) is a LongTensor, but it must be a float tensor. 
So I get the following error:

Traceback (most recent call last):
  File "train.py", line 704, in <module>
    main(args)
  File "train.py", line 619, in main
    global_step, epoch)
  File "train.py", line 170, in train
    text_input, text_lengths, mel_input, speaker_embeddings=speaker_embeddings)
  File "/home/edresson/anaconda3/envs/TTS2/lib/python3.6/site-packages/torch/nn/modules/module.py", line 489, in __call__
    result = self.forward(*input, **kwargs)
  File "/mnt/edresson/DD/TTS/voice-clonning/TTS/tts_namespace/TTS/models/tacotron.py", line 121, in forward
    self.speaker_embeddings_projected)
  File "/home/edresson/anaconda3/envs/TTS2/lib/python3.6/site-packages/torch/nn/modules/module.py", line 489, in __call__
    result = self.forward(*input, **kwargs)
  File "/mnt/edresson/DD/TTS/voice-clonning/TTS/tts_namespace/TTS/layers/tacotron.py", line 435, in forward
    output, stop_token, attention = self.decode(inputs, mask)
  File "/mnt/edresson/DD/TTS/voice-clonning/TTS/tts_namespace/TTS/layers/tacotron.py", line 367, in decode
    self.attention_rnn_hidden, inputs, self.processed_inputs, mask)
  File "/home/edresson/anaconda3/envs/TTS2/lib/python3.6/site-packages/torch/nn/modules/module.py", line 489, in __call__
    result = self.forward(*input, **kwargs)
  File "/mnt/edresson/DD/TTS/voice-clonning/TTS/tts_namespace/TTS/layers/common_layers.py", line 180, in forward
    phi_t = g_t.unsqueeze(-1) * (1.0 / (1.0 + torch.sigmoid((mu_t.unsqueeze(-1) - j) / sig_t.unsqueeze(-1))))
RuntimeError: expected type torch.cuda.FloatTensor but got torch.cuda.LongTensor

In addition, the + 0.5 operation is cancelled out if it is a LongTensor.

Test:

>>> torch.arange(0, 10)
tensor([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
>>> torch.arange(0, 10) + 0.5
tensor([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
>>> torch.arange(0, 10.0) + 0.5
tensor([0.5000, 1.5000, 2.5000, 3.5000, 4.5000, 5.5000, 6.5000, 7.5000,
        8.5000, 9.5000])

To resolve this I forced the arange range to float:
self.J = torch.arange(0, inputs.shape[1]+2.0).to(inputs.device) + 0.5
---
 layers/common_layers.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/layers/common_layers.py b/layers/common_layers.py
index 8b7ed125..78fa8b1c 100644
--- a/layers/common_layers.py
+++ b/layers/common_layers.py
@@ -138,7 +138,7 @@ class GravesAttention(nn.Module):

     def init_states(self, inputs):
         if self.J is None or inputs.shape[1]+1 > self.J.shape[-1]:
-            self.J = torch.arange(0, inputs.shape[1]+2).to(inputs.device) + 0.5
+            self.J = torch.arange(0, inputs.shape[1]+2.0).to(inputs.device) + 0.5
             self.attention_weights = torch.zeros(inputs.shape[0], inputs.shape[1]).to(inputs.device)
             self.mu_prev = torch.zeros(inputs.shape[0], self.K).to(inputs.device)

From 2d9dcd60ba163d262206c1c6912b905b535ae197 Mon Sep 17 00:00:00 2001
From: erogol
Date: Tue, 12 May 2020 13:46:32 +0200
Subject: [PATCH 054/104] update imports for util refactoring

---
 distribute.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/distribute.py b/distribute.py
index a5fdb373..873d8aba 100644
--- a/distribute.py
+++ b/distribute.py
@@ -9,7 +9,8 @@ import torch.distributed as dist
 from torch.utils.data.sampler import Sampler
 from torch.autograd import Variable
 from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors
-from TTS.utils.generic_utils import load_config, create_experiment_folder
+from TTS.utils.io import load_config
+from TTS.utils.generic_utils import create_experiment_folder


 class DistributedSampler(Sampler):

From c0c3c6e3311a0e23ef3eeb92afff920a3b7be45e Mon Sep 17 00:00:00 2001
From: erogol
Date: Tue, 12 May 2020
13:46:58 +0200 Subject: [PATCH 055/104] train.py update imports for utils refactoring --- train.py | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/train.py b/train.py index 84648636..3eec0107 100644 --- a/train.py +++ b/train.py @@ -14,12 +14,13 @@ from distribute import (DistributedSampler, apply_gradient_allreduce, init_distributed, reduce_tensor) from TTS.layers.losses import TacotronLoss from TTS.utils.audio import AudioProcessor -from TTS.utils.generic_utils import ( - NoamLR, check_update, count_parameters, create_experiment_folder, - get_git_branch, load_config, remove_experiment_folder, save_best_model, - save_checkpoint, adam_weight_decay, set_init_dict, copy_config_file, - setup_model, gradual_training_scheduler, KeepAverage, - set_weight_decay, check_config) +from TTS.utils.generic_utils import (count_parameters, create_experiment_folder, remove_experiment_folder, + get_git_branch, set_init_dict, + setup_model, KeepAverage, check_config) +from TTS.utils.io import (save_best_model, save_checkpoint, + load_config, copy_config_file) +from TTS.utils.training import (NoamLR, check_update, adam_weight_decay, + gradual_training_scheduler, set_weight_decay) from TTS.utils.tensorboard_logger import TensorboardLogger from TTS.utils.console_logger import ConsoleLogger from TTS.utils.speakers import load_speaker_mapping, save_speaker_mapping, \ @@ -251,9 +252,9 @@ def train(model, criterion, optimizer, optimizer_st, scheduler, if global_step % c.save_step == 0: if c.checkpoint: # save model - save_checkpoint(model, optimizer, optimizer_st, - loss_dict['postnet_loss'].item(), OUT_PATH, global_step, - epoch) + save_checkpoint(model, optimizer, global_step, epoch, model.decoder.r, OUT_PATH, + optimizer_st=optimizer_st, + model_loss=loss_dict['postnet_loss'].item()) # Diagnostic visualizations const_spec = postnet_output[0].data.cpu().numpy() @@ -596,8 +597,8 @@ def main(args): # pylint: disable=redefined-outer-name target_loss = train_avg_loss_dict['avg_postnet_loss'] if c.run_eval: target_loss = eval_avg_loss_dict['avg_postnet_loss'] - best_loss = save_best_model(model, optimizer, target_loss, best_loss, - OUT_PATH, global_step, epoch) + best_loss = save_best_model(target_loss, best_loss, model, optimizer, global_step, epoch, c.r, + OUT_PATH) if __name__ == '__main__': From 720c4690db4f98039e25b3d636ca64ccd826eea2 Mon Sep 17 00:00:00 2001 From: erogol Date: Tue, 12 May 2020 13:47:40 +0200 Subject: [PATCH 056/104] update imports --- utils/console_logger.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/console_logger.py b/utils/console_logger.py index ec3f50c3..3ca29f96 100644 --- a/utils/console_logger.py +++ b/utils/console_logger.py @@ -1,5 +1,5 @@ import datetime -from TTS.utils.generic_utils import AttrDict +from TTS.utils.io import AttrDict tcolors = AttrDict({ From 574968b2498e1e86590597d65142f18f58d3e477 Mon Sep 17 00:00:00 2001 From: erogol Date: Tue, 12 May 2020 13:48:51 +0200 Subject: [PATCH 057/104] refactoring utils --- utils/generic_utils.py | 251 ++++------------------------------------- utils/io.py | 78 +++++++++++++ utils/training.py | 90 +++++++++++++++ 3 files changed, 192 insertions(+), 227 deletions(-) create mode 100644 utils/io.py create mode 100644 utils/training.py diff --git a/utils/generic_utils.py b/utils/generic_utils.py index 5d91d74d..1a621744 100644 --- a/utils/generic_utils.py +++ b/utils/generic_utils.py @@ -1,31 +1,11 @@ import os -import re import glob +import torch import shutil import 
datetime -import json -import torch import subprocess import importlib import numpy as np -from collections import OrderedDict, Counter - - -class AttrDict(dict): - def __init__(self, *args, **kwargs): - super(AttrDict, self).__init__(*args, **kwargs) - self.__dict__ = self - - -def load_config(config_path): - config = AttrDict() - with open(config_path, "r") as f: - input_str = f.read() - input_str = re.sub(r'\\\n', '', input_str) - input_str = re.sub(r'//.*\n', '\n', input_str) - data = json.loads(input_str) - config.update(data) - return config def get_git_branch(): @@ -83,155 +63,34 @@ def remove_experiment_folder(experiment_path): print(" ! Run is kept in {}".format(experiment_path)) -def copy_config_file(config_file, out_path, new_fields): - config_lines = open(config_file, "r").readlines() - # add extra information fields - for key, value in new_fields.items(): - if type(value) == str: - new_line = '"{}":"{}",\n'.format(key, value) - else: - new_line = '"{}":{},\n'.format(key, value) - config_lines.insert(1, new_line) - config_out_file = open(out_path, "w") - config_out_file.writelines(config_lines) - config_out_file.close() - - -def _trim_model_state_dict(state_dict): - r"""Remove 'module.' prefix from state dictionary. It is necessary as it - is loded for the next time by model.load_state(). Otherwise, it complains - about the torch.DataParallel()""" - - new_state_dict = OrderedDict() - for k, v in state_dict.items(): - name = k[7:] # remove `module.` - new_state_dict[name] = v - return new_state_dict - - -def save_checkpoint(model, optimizer, optimizer_st, model_loss, out_path, - current_step, epoch): - checkpoint_path = 'checkpoint_{}.pth.tar'.format(current_step) - checkpoint_path = os.path.join(out_path, checkpoint_path) - print(" > CHECKPOINT : {}".format(checkpoint_path)) - - new_state_dict = model.state_dict() - state = { - 'model': new_state_dict, - 'optimizer': optimizer.state_dict() if optimizer is not None else None, - 'step': current_step, - 'epoch': epoch, - 'linear_loss': model_loss, - 'date': datetime.date.today().strftime("%B %d, %Y"), - 'r': model.decoder.r - } - torch.save(state, checkpoint_path) - - -def save_best_model(model, optimizer, model_loss, best_loss, out_path, - current_step, epoch): - if model_loss < best_loss: - new_state_dict = model.state_dict() - state = { - 'model': new_state_dict, - 'optimizer': optimizer.state_dict(), - 'step': current_step, - 'epoch': epoch, - 'linear_loss': model_loss, - 'date': datetime.date.today().strftime("%B %d, %Y"), - 'r': model.decoder.r - } - best_loss = model_loss - bestmodel_path = 'best_model.pth.tar' - bestmodel_path = os.path.join(out_path, bestmodel_path) - print(" > BEST MODEL ({0:.5f}) : {1:}".format( - model_loss, bestmodel_path)) - torch.save(state, bestmodel_path) - return best_loss - - -def check_update(model, grad_clip, ignore_stopnet=False): - r'''Check model gradient against unexpected jumps and failures''' - skip_flag = False - if ignore_stopnet: - grad_norm = torch.nn.utils.clip_grad_norm_([param for name, param in model.named_parameters() if 'stopnet' not in name], grad_clip) - else: - grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip) - if np.isinf(grad_norm): - print(" | > Gradient is INF !!") - skip_flag = True - return grad_norm, skip_flag - - -def lr_decay(init_lr, global_step, warmup_steps): - r'''from https://github.com/r9y9/tacotron_pytorch/blob/master/train.py''' - warmup_steps = float(warmup_steps) - step = global_step + 1. 
- lr = init_lr * warmup_steps**0.5 * np.minimum(step * warmup_steps**-1.5, - step**-0.5) - return lr - - -def adam_weight_decay(optimizer): - """ - Custom weight decay operation, not effecting grad values. - """ - for group in optimizer.param_groups: - for param in group['params']: - current_lr = group['lr'] - weight_decay = group['weight_decay'] - param.data = param.data.add(-weight_decay * group['lr'], - param.data) - return optimizer, current_lr - -# pylint: disable=dangerous-default-value -def set_weight_decay(model, weight_decay, skip_list={"decoder.attention.v", "rnn", "lstm", "gru", "embedding"}): - """ - Skip biases, BatchNorm parameters, rnns. - and attention projection layer v - """ - decay = [] - no_decay = [] - for name, param in model.named_parameters(): - if not param.requires_grad: - continue - - if len(param.shape) == 1 or any([skip_name in name for skip_name in skip_list]): - no_decay.append(param) - else: - decay.append(param) - return [{ - 'params': no_decay, - 'weight_decay': 0. - }, { - 'params': decay, - 'weight_decay': weight_decay - }] - - -class NoamLR(torch.optim.lr_scheduler._LRScheduler): - def __init__(self, optimizer, warmup_steps=0.1, last_epoch=-1): - self.warmup_steps = float(warmup_steps) - super(NoamLR, self).__init__(optimizer, last_epoch) - - def get_lr(self): - step = max(self.last_epoch, 1) - return [ - base_lr * self.warmup_steps**0.5 * - min(step * self.warmup_steps**-1.5, step**-0.5) - for base_lr in self.base_lrs - ] - - -def mk_decay(init_mk, max_epoch, n_epoch): - return init_mk * ((max_epoch - n_epoch) / max_epoch) - - def count_parameters(model): r"""Count number of trainable parameters in a network""" return sum(p.numel() for p in model.parameters() if p.requires_grad) +def split_dataset(items): + is_multi_speaker = False + speakers = [item[-1] for item in items] + is_multi_speaker = len(set(speakers)) > 1 + eval_split_size = 500 if len(items) * 0.01 > 500 else int( + len(items) * 0.01) + np.random.seed(0) + np.random.shuffle(items) + if is_multi_speaker: + items_eval = [] + # most stupid code ever -- Fix it ! + while len(items_eval) < eval_split_size: + speakers = [item[-1] for item in items] + speaker_counter = Counter(speakers) + item_idx = np.random.randint(0, len(items)) + if speaker_counter[items[item_idx][-1]] > 1: + items_eval.append(items[item_idx]) + del items[item_idx] + return items_eval, items + else: + return items[:eval_split_size], items[eval_split_size:] + + # from https://gist.github.com/jihunchoi/f1434a77df9db1bb337417854b398df1 def sequence_mask(sequence_length, max_len=None): if max_len is None: @@ -322,44 +181,6 @@ def setup_model(num_chars, num_speakers, c): bidirectional_decoder=c.bidirectional_decoder) return model - -def split_dataset(items): - is_multi_speaker = False - speakers = [item[-1] for item in items] - is_multi_speaker = len(set(speakers)) > 1 - eval_split_size = 500 if len(items) * 0.01 > 500 else int( - len(items) * 0.01) - np.random.seed(0) - np.random.shuffle(items) - if is_multi_speaker: - items_eval = [] - # most stupid code ever -- Fix it ! 
- while len(items_eval) < eval_split_size: - speakers = [item[-1] for item in items] - speaker_counter = Counter(speakers) - item_idx = np.random.randint(0, len(items)) - if speaker_counter[items[item_idx][-1]] > 1: - items_eval.append(items[item_idx]) - del items[item_idx] - return items_eval, items - else: - return items[:eval_split_size], items[eval_split_size:] - - -def gradual_training_scheduler(global_step, config): - """Setup the gradual training schedule wrt number - of active GPUs""" - num_gpus = torch.cuda.device_count() - if num_gpus == 0: - num_gpus = 1 - new_values = None - # we set the scheduling wrt num_gpus - for values in config.gradual_training: - if global_step * num_gpus >= values[0]: - new_values = values - return new_values[1], new_values[2] - - class KeepAverage(): def __init__(self): self.avg_values = {} @@ -410,30 +231,6 @@ def _check_argument(name, c, enum_list=None, max_val=None, min_val=None, restric assert isinstance(c[name], val_type) or c[name] is None, f' [!] {name} has wrong type - {type(c[name])} vs {val_type}' -tcolors = AttrDict({ - 'OKBLUE': '\033[94m', - 'HEADER': '\033[95m', - 'OKGREEN': '\033[92m', - 'WARNING': '\033[93m', - 'FAIL': '\033[91m', - 'ENDC': '\033[0m', - 'BOLD': '\033[1m', - 'UNDERLINE': '\033[4m' -}) - - -def print_train_step(batch_steps, step, global_step, avg_spec_length, avg_text_length, step_time, loader_time, lr, print_dict): - indent = " | > " - print() - log_text = "{} --> STEP: {}/{} -- GLOBAL_STEP: {}{}\n".format(tcolors.BOLD, step, batch_steps, global_step, tcolors.ENDC) - for key, value in print_dict.items(): - log_text += "{}{}: {:.5f}\n".format(indent, key, value) - log_text += f"{indent}avg_spec_len: {avg_spec_length}\n{indent}avg_text_len: {avg_text_length}\ - \n{indent}step_time: {step_time:.2f}\n{indent}loader_time: {loader_time:.2f}\n{indent}lr: {lr:.5f}"\ - .format(indent, avg_spec_length, indent, avg_text_length, indent, step_time, indent, loader_time, indent, lr) - print(log_text, flush=True) - - def check_config(c): _check_argument('model', c, enum_list=['tacotron', 'tacotron2'], restricted=True, val_type=str) _check_argument('run_name', c, restricted=True, val_type=str) diff --git a/utils/io.py b/utils/io.py new file mode 100644 index 00000000..9161d6fd --- /dev/null +++ b/utils/io.py @@ -0,0 +1,78 @@ +import os +import json +import re +import torch +import datetime + + +class AttrDict(dict): + def __init__(self, *args, **kwargs): + super(AttrDict, self).__init__(*args, **kwargs) + self.__dict__ = self + + +def load_config(config_path): + config = AttrDict() + with open(config_path, "r") as f: + input_str = f.read() + input_str = re.sub(r'\\\n', '', input_str) + input_str = re.sub(r'//.*\n', '\n', input_str) + data = json.loads(input_str) + config.update(data) + return config + + +def copy_config_file(config_file, out_path, new_fields): + config_lines = open(config_file, "r").readlines() + # add extra information fields + for key, value in new_fields.items(): + if type(value) == str: + new_line = '"{}":"{}",\n'.format(key, value) + else: + new_line = '"{}":{},\n'.format(key, value) + config_lines.insert(1, new_line) + config_out_file = open(out_path, "w") + config_out_file.writelines(config_lines) + config_out_file.close() + + +def load_checkpoint(model, checkpoint_path, use_cuda=False): + state = torch.load(checkpoint_path, map_location=torch.device('cpu')) + model.load_state_dict(state['model']) + if use_cuda: + model.cuda() + # set model stepsize + if 'r' in state.keys(): + model.decoder.set_r(state['r']) + 
return model, state + + +def save_model(model, optimizer, current_step, epoch, r, output_folder, file_name, **kwargs): + checkpoint_path = os.path.join(output_folder, file_name) + + new_state_dict = model.state_dict() + state = { + 'model': new_state_dict, + 'optimizer': optimizer.state_dict() if optimizer is not None else None, + 'step': current_step, + 'epoch': epoch, + 'date': datetime.date.today().strftime("%B %d, %Y"), + 'r': model.decoder.r + } + state.update(kwargs) + torch.save(state, checkpoint_path) + + +def save_checkpoint(model, optimizer, current_step, epoch, r, output_folder, **kwargs): + print(" > CHECKPOINT : {}".format(checkpoint_path)) + file_name = 'checkpoint_{}.pth.tar'.format(current_step) + save_model(model, optimizer, current_step, epoch ,r, output_folder, file_name, **kwargs) + + +def save_best_model(target_loss, best_loss, model, optimizer, current_step, epoch, r, output_folder, **kwargs): + if target_loss < best_loss: + print(" > BEST MODEL : {}".format(checkpoint_path)) + file_name = 'best_model.pth.tar' + save_model(model, optimizer, current_step, epoch ,r, output_folder, file_name, model_loss=target_loss) + best_loss = target_loss + return best_loss \ No newline at end of file diff --git a/utils/training.py b/utils/training.py new file mode 100644 index 00000000..bd314bc9 --- /dev/null +++ b/utils/training.py @@ -0,0 +1,90 @@ +import torch +import numpy as np + + +def check_update(model, grad_clip, ignore_stopnet=False): + r'''Check model gradient against unexpected jumps and failures''' + skip_flag = False + if ignore_stopnet: + grad_norm = torch.nn.utils.clip_grad_norm_([param for name, param in model.named_parameters() if 'stopnet' not in name], grad_clip) + else: + grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip) + if torch.isinf(grad_norm): + print(" | > Gradient is INF !!") + skip_flag = True + return grad_norm, skip_flag + + +def lr_decay(init_lr, global_step, warmup_steps): + r'''from https://github.com/r9y9/tacotron_pytorch/blob/master/train.py''' + warmup_steps = float(warmup_steps) + step = global_step + 1. + lr = init_lr * warmup_steps**0.5 * np.minimum(step * warmup_steps**-1.5, + step**-0.5) + return lr + + +def adam_weight_decay(optimizer): + """ + Custom weight decay operation, not effecting grad values. + """ + for group in optimizer.param_groups: + for param in group['params']: + current_lr = group['lr'] + weight_decay = group['weight_decay'] + factor = -weight_decay * group['lr'] + param.data = param.data.add(param.data, + alpha=factor) + return optimizer, current_lr + +# pylint: disable=dangerous-default-value +def set_weight_decay(model, weight_decay, skip_list={"decoder.attention.v", "rnn", "lstm", "gru", "embedding"}): + """ + Skip biases, BatchNorm parameters, rnns. + and attention projection layer v + """ + decay = [] + no_decay = [] + for name, param in model.named_parameters(): + if not param.requires_grad: + continue + + if len(param.shape) == 1 or any([skip_name in name for skip_name in skip_list]): + no_decay.append(param) + else: + decay.append(param) + return [{ + 'params': no_decay, + 'weight_decay': 0. 
+ }, { + 'params': decay, + 'weight_decay': weight_decay + }] + + +class NoamLR(torch.optim.lr_scheduler._LRScheduler): + def __init__(self, optimizer, warmup_steps=0.1, last_epoch=-1): + self.warmup_steps = float(warmup_steps) + super(NoamLR, self).__init__(optimizer, last_epoch) + + def get_lr(self): + step = max(self.last_epoch, 1) + return [ + base_lr * self.warmup_steps**0.5 * + min(step * self.warmup_steps**-1.5, step**-0.5) + for base_lr in self.base_lrs + ] + + +def gradual_training_scheduler(global_step, config): + """Setup the gradual training schedule wrt number + of active GPUs""" + num_gpus = torch.cuda.device_count() + if num_gpus == 0: + num_gpus = 1 + new_values = None + # we set the scheduling wrt num_gpus + for values in config.gradual_training: + if global_step * num_gpus >= values[0]: + new_values = values + return new_values[1], new_values[2] \ No newline at end of file From 3b2d726e2d35c3454377a8d4a67772b107f40eee Mon Sep 17 00:00:00 2001 From: erogol Date: Tue, 12 May 2020 13:49:23 +0200 Subject: [PATCH 058/104] radam pytorch 1.5 update --- utils/radam.py | 139 +++++++++++++++---------------------------------- 1 file changed, 41 insertions(+), 98 deletions(-) diff --git a/utils/radam.py b/utils/radam.py index 62ecc695..738aac52 100644 --- a/utils/radam.py +++ b/utils/radam.py @@ -1,17 +1,31 @@ +# from https://github.com/LiyuanLucasLiu/RAdam + import math import torch -from torch.optim.optimizer import Optimizer +from torch.optim.optimizer import Optimizer, required -# adapted from https://github.com/LiyuanLucasLiu/RAdam class RAdam(Optimizer): - def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0): - defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay) - self.buffer = [[None, None, None] for ind in range(10)] + def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0, degenerated_to_sgd=True): + if not 0.0 <= lr: + raise ValueError("Invalid learning rate: {}".format(lr)) + if not 0.0 <= eps: + raise ValueError("Invalid epsilon value: {}".format(eps)) + if not 0.0 <= betas[0] < 1.0: + raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0])) + if not 0.0 <= betas[1] < 1.0: + raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) + + self.degenerated_to_sgd = degenerated_to_sgd + if isinstance(params, (list, tuple)) and len(params) > 0 and isinstance(params[0], dict): + for param in params: + if 'betas' in param and (param['betas'][0] != betas[0] or param['betas'][1] != betas[1]): + param['buffer'] = [[None, None, None] for _ in range(10)] + defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay, buffer=[[None, None, None] for _ in range(10)]) super(RAdam, self).__init__(params, defaults) - def __setstate__(self, state): # pylint: disable= useless-super-delegation + def __setstate__(self, state): super(RAdam, self).__setstate__(state) def step(self, closure=None): @@ -27,128 +41,57 @@ class RAdam(Optimizer): continue grad = p.grad.data.float() if grad.is_sparse: - raise RuntimeError( - 'RAdam does not support sparse gradients') + raise RuntimeError('RAdam does not support sparse gradients') p_data_fp32 = p.data.float() state = self.state[p] - if not state: + if len(state) == 0: state['step'] = 0 state['exp_avg'] = torch.zeros_like(p_data_fp32) state['exp_avg_sq'] = torch.zeros_like(p_data_fp32) else: state['exp_avg'] = state['exp_avg'].type_as(p_data_fp32) - state['exp_avg_sq'] = state['exp_avg_sq'].type_as( - p_data_fp32) + state['exp_avg_sq'] = 
state['exp_avg_sq'].type_as(p_data_fp32) exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] beta1, beta2 = group['betas'] - exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) - exp_avg.mul_(beta1).add_(1 - beta1, grad) + exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2) + exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1) state['step'] += 1 - buffered = self.buffer[int(state['step'] % 10)] + buffered = group['buffer'][int(state['step'] % 10)] if state['step'] == buffered[0]: N_sma, step_size = buffered[1], buffered[2] else: buffered[0] = state['step'] beta2_t = beta2 ** state['step'] N_sma_max = 2 / (1 - beta2) - 1 - N_sma = N_sma_max - 2 * \ - state['step'] * beta2_t / (1 - beta2_t) + N_sma = N_sma_max - 2 * state['step'] * beta2_t / (1 - beta2_t) buffered[1] = N_sma # more conservative since it's an approximated value if N_sma >= 5: - step_size = group['lr'] * math.sqrt((1 - beta2_t) * (N_sma - 4) / (N_sma_max - 4) * ( - N_sma - 2) / N_sma * N_sma_max / (N_sma_max - 2)) / (1 - beta1 ** state['step']) + step_size = math.sqrt((1 - beta2_t) * (N_sma - 4) / (N_sma_max - 4) * (N_sma - 2) / N_sma * N_sma_max / (N_sma_max - 2)) / (1 - beta1 ** state['step']) + elif self.degenerated_to_sgd: + step_size = 1.0 / (1 - beta1 ** state['step']) else: - step_size = group['lr'] / (1 - beta1 ** state['step']) + step_size = -1 buffered[2] = step_size - if group['weight_decay'] != 0: - p_data_fp32.add_(-group['weight_decay'] - * group['lr'], p_data_fp32) - # more conservative since it's an approximated value if N_sma >= 5: + if group['weight_decay'] != 0: + p_data_fp32.add_(p_data_fp32, alpha=-group['weight_decay'] * group['lr']) denom = exp_avg_sq.sqrt().add_(group['eps']) - p_data_fp32.addcdiv_(-step_size, exp_avg, denom) - else: - p_data_fp32.add_(-step_size, exp_avg) + p_data_fp32.addcdiv_(exp_avg, denom, value=-step_size * group['lr']) + p.data.copy_(p_data_fp32) + elif step_size > 0: + if group['weight_decay'] != 0: + p_data_fp32.add_(p_data_fp32, alpha=-group['weight_decay'] * group['lr']) + p_data_fp32.add_(exp_avg, alpha=-step_size * group['lr']) + p.data.copy_(p_data_fp32) - p.data.copy_(p_data_fp32) - - return loss - - -class PlainRAdam(Optimizer): - - def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0): - defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay) - - super(PlainRAdam, self).__init__(params, defaults) - - def __setstate__(self, state): # pylint: disable= useless-super-delegation - super(PlainRAdam, self).__setstate__(state) - - def step(self, closure=None): - - loss = None - if closure is not None: - loss = closure() - - for group in self.param_groups: - - for p in group['params']: - if p.grad is None: - continue - grad = p.grad.data.float() - if grad.is_sparse: - raise RuntimeError( - 'RAdam does not support sparse gradients') - - p_data_fp32 = p.data.float() - - state = self.state[p] - - if not state: - state['step'] = 0 - state['exp_avg'] = torch.zeros_like(p_data_fp32) - state['exp_avg_sq'] = torch.zeros_like(p_data_fp32) - else: - state['exp_avg'] = state['exp_avg'].type_as(p_data_fp32) - state['exp_avg_sq'] = state['exp_avg_sq'].type_as( - p_data_fp32) - - exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] - beta1, beta2 = group['betas'] - - exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) - exp_avg.mul_(beta1).add_(1 - beta1, grad) - - state['step'] += 1 - beta2_t = beta2 ** state['step'] - N_sma_max = 2 / (1 - beta2) - 1 - N_sma = N_sma_max - 2 * state['step'] * beta2_t / (1 - beta2_t) - - if 
group['weight_decay'] != 0: - p_data_fp32.add_(-group['weight_decay'] - * group['lr'], p_data_fp32) - - # more conservative since it's an approximated value - if N_sma >= 5: - step_size = group['lr'] * math.sqrt((1 - beta2_t) * (N_sma - 4) / (N_sma_max - 4) * ( - N_sma - 2) / N_sma * N_sma_max / (N_sma_max - 2)) / (1 - beta1 ** state['step']) - denom = exp_avg_sq.sqrt().add_(group['eps']) - p_data_fp32.addcdiv_(-step_size, exp_avg, denom) - else: - step_size = group['lr'] / (1 - beta1 ** state['step']) - p_data_fp32.add_(-step_size, exp_avg) - - p.data.copy_(p_data_fp32) - - return loss + return loss \ No newline at end of file From 0ec42fa27935d7a6e905e3bf7bc065cb3b26503a Mon Sep 17 00:00:00 2001 From: erogol Date: Tue, 12 May 2020 14:09:16 +0200 Subject: [PATCH 059/104] more aggressive remove folder --- utils/generic_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/generic_utils.py b/utils/generic_utils.py index 1a621744..9685f463 100644 --- a/utils/generic_utils.py +++ b/utils/generic_utils.py @@ -57,7 +57,7 @@ def remove_experiment_folder(experiment_path): checkpoint_files = glob.glob(experiment_path + "/*.pth.tar") if not checkpoint_files: if os.path.exists(experiment_path): - shutil.rmtree(experiment_path) + shutil.rmtree(experiment_path, ignore_errors=True) print(" ! Run is removed from {}".format(experiment_path)) else: print(" ! Run is kept in {}".format(experiment_path)) From 88bde77061ac093235d24cf0dfbef8133fc32358 Mon Sep 17 00:00:00 2001 From: erogol Date: Tue, 12 May 2020 14:09:28 +0200 Subject: [PATCH 060/104] fix checkpointing --- utils/io.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/utils/io.py b/utils/io.py index 9161d6fd..f6378336 100644 --- a/utils/io.py +++ b/utils/io.py @@ -47,9 +47,7 @@ def load_checkpoint(model, checkpoint_path, use_cuda=False): return model, state -def save_model(model, optimizer, current_step, epoch, r, output_folder, file_name, **kwargs): - checkpoint_path = os.path.join(output_folder, file_name) - +def save_model(model, optimizer, current_step, epoch, r, output_path, **kwargs): new_state_dict = model.state_dict() state = { 'model': new_state_dict, @@ -60,19 +58,21 @@ def save_model(model, optimizer, current_step, epoch, r, output_folder, file_nam 'r': model.decoder.r } state.update(kwargs) - torch.save(state, checkpoint_path) + torch.save(state, output_path) def save_checkpoint(model, optimizer, current_step, epoch, r, output_folder, **kwargs): - print(" > CHECKPOINT : {}".format(checkpoint_path)) file_name = 'checkpoint_{}.pth.tar'.format(current_step) - save_model(model, optimizer, current_step, epoch ,r, output_folder, file_name, **kwargs) + checkpoint_path = os.path.join(output_folder, file_name) + print(" > CHECKPOINT : {}".format(checkpoint_path)) + save_model(model, optimizer, current_step, epoch ,r, checkpoint_path, **kwargs) def save_best_model(target_loss, best_loss, model, optimizer, current_step, epoch, r, output_folder, **kwargs): if target_loss < best_loss: - print(" > BEST MODEL : {}".format(checkpoint_path)) file_name = 'best_model.pth.tar' - save_model(model, optimizer, current_step, epoch ,r, output_folder, file_name, model_loss=target_loss) + checkpoint_path = os.path.join(output_folder, file_name) + print(" > BEST MODEL : {}".format(checkpoint_path)) + save_model(model, optimizer, current_step, epoch ,r, checkpoint_path, model_loss=target_loss) best_loss = target_loss return best_loss \ No newline at end of file
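With the paths computed before use, checkpoint I/O now round-trips cleanly. A minimal usage sketch of the two helpers above (illustrative only; the output folder, step, and loss values are placeholders):

from TTS.utils.io import save_checkpoint, load_checkpoint

# writes /tmp/run/checkpoint_10000.pth.tar with optimizer state, 'r', and the extra model_loss field
save_checkpoint(model, optimizer, 10000, 3, model.decoder.r, '/tmp/run', model_loss=0.123)

# restores the weights and, when stored, pushes 'r' back into the decoder
model, state = load_checkpoint(model, '/tmp/run/checkpoint_10000.pth.tar', use_cuda=False)
print(state['step'], state['epoch'], state['r'])

From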
bee288fa931e08f639f3b113be0543255ba8b76b Mon Sep 17 00:00:00 2001 From: erogol Date: Tue, 12 May 2020 16:22:59 +0200 Subject: [PATCH 061/104] fixing console logging colors --- utils/console_logger.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/utils/console_logger.py b/utils/console_logger.py index 3ca29f96..5a37ac10 100644 --- a/utils/console_logger.py +++ b/utils/console_logger.py @@ -62,7 +62,7 @@ class ConsoleLogger(): print(log_text, flush=True) def print_eval_start(self): - print(f"{tcolors.BOLD} > EVALUATION {tcolors.ENDC}") + print(f"{tcolors.BOLD} > EVALUATION {tcolors.ENDC}\n") def print_eval_step(self, step, loss_dict, avg_loss_dict): indent = " | > " @@ -78,14 +78,18 @@ class ConsoleLogger(): def print_epoch_end(self, epoch, avg_loss_dict): indent = " | > " - log_text = "\n {}--> EVAL PERFORMANCE{}\n".format( + log_text = " {}--> EVAL PERFORMANCE{}\n".format( tcolors.BOLD, tcolors.ENDC) for key, value in avg_loss_dict.items(): # print the avg value if given - color = tcolors.OKGREEN + color = tcolors.FAIL + sign = '+' + diff = 0 if self.old_eval_loss_dict is not None: - if self.old_eval_loss_dict[key] > value: - color = tcolors.FAIL - log_text += "{}{}:{} {:.5f} \n{}".format(indent, key, color, value, tcolors.ENDC) + diff = self.old_eval_loss_dict[key] - value + if diff > 0: + color = tcolors.OKGREEN + sign = '-' + log_text += "{}{}:{} {:.5f} {}({}{:.5f})\n".format(indent, key, color, value, tcolors.ENDC, sign, diff) self.old_eval_loss_dict = avg_loss_dict - print(log_text, flush=True) + print(log_text, flush=True) \ No newline at end of file From d282222553c97821f1028495fb7161d2f60b491d Mon Sep 17 00:00:00 2001 From: erogol Date: Tue, 28 Apr 2020 18:16:02 +0200 Subject: [PATCH 062/104] renaming layers to be converted to TF counterpart --- layers/common_layers.py | 14 ++-- layers/tacotron2.py | 148 +++++++++++++++++++--------------------- models/tacotron2.py | 2 +- 3 files changed, 80 insertions(+), 84 deletions(-) diff --git a/layers/common_layers.py b/layers/common_layers.py index 8b7ed125..d2afe012 100644 --- a/layers/common_layers.py +++ b/layers/common_layers.py @@ -33,7 +33,7 @@ class LinearBN(nn.Module): super(LinearBN, self).__init__() self.linear_layer = torch.nn.Linear( in_features, out_features, bias=bias) - self.bn = nn.BatchNorm1d(out_features) + self.batch_normalization = nn.BatchNorm1d(out_features) self._init_w(init_gain) def _init_w(self, init_gain): @@ -45,7 +45,7 @@ class LinearBN(nn.Module): out = self.linear_layer(x) if len(out.shape) == 3: out = out.permute(1, 2, 0) - out = self.bn(out) + out = self.batch_normalization(out) if len(out.shape) == 3: out = out.permute(2, 0, 1) return out @@ -63,18 +63,18 @@ class Prenet(nn.Module): self.prenet_dropout = prenet_dropout in_features = [in_features] + out_features[:-1] if prenet_type == "bn": - self.layers = nn.ModuleList([ + self.linear_layers = nn.ModuleList([ LinearBN(in_size, out_size, bias=bias) for (in_size, out_size) in zip(in_features, out_features) ]) elif prenet_type == "original": - self.layers = nn.ModuleList([ + self.linear_layers = nn.ModuleList([ Linear(in_size, out_size, bias=bias) for (in_size, out_size) in zip(in_features, out_features) ]) def forward(self, x): - for linear in self.layers: + for linear in self.linear_layers: if self.prenet_dropout: x = F.dropout(F.relu(linear(x)), p=0.5, training=self.training) else: @@ -93,7 +93,7 @@ class LocationLayer(nn.Module): attention_n_filters=32, attention_kernel_size=31): super(LocationLayer, self).__init__() 
- self.location_conv = nn.Conv1d( + self.location_conv1d = nn.Conv1d( in_channels=2, out_channels=attention_n_filters, kernel_size=attention_kernel_size, @@ -104,7 +104,7 @@ class LocationLayer(nn.Module): attention_n_filters, attention_dim, bias=False, init_gain='tanh') def forward(self, attention_cat): - processed_attention = self.location_conv(attention_cat) + processed_attention = self.location_conv1d(attention_cat) processed_attention = self.location_dense( processed_attention.transpose(1, 2)) return processed_attention diff --git a/layers/tacotron2.py b/layers/tacotron2.py index fa76a6b2..3e439b9b 100644 --- a/layers/tacotron2.py +++ b/layers/tacotron2.py @@ -6,130 +6,126 @@ from .common_layers import init_attn, Prenet, Linear class ConvBNBlock(nn.Module): - def __init__(self, in_channels, out_channels, kernel_size, nonlinear=None): + def __init__(self, in_channels, out_channels, kernel_size, activation=None): super(ConvBNBlock, self).__init__() assert (kernel_size - 1) % 2 == 0 padding = (kernel_size - 1) // 2 - conv1d = nn.Conv1d(in_channels, + self.convolution1d = nn.Conv1d(in_channels, out_channels, kernel_size, padding=padding) - norm = nn.BatchNorm1d(out_channels) - dropout = nn.Dropout(p=0.5) - if nonlinear == 'relu': - self.net = nn.Sequential(conv1d, norm, nn.ReLU(), dropout) - elif nonlinear == 'tanh': - self.net = nn.Sequential(conv1d, norm, nn.Tanh(), dropout) + self.batch_normalization = nn.BatchNorm1d(out_channels) + self.dropout = nn.Dropout(p=0.5) + if activation == 'relu': + self.activation = nn.ReLU() + elif activation == 'tanh': + self.activation = nn.Tanh() else: - self.net = nn.Sequential(conv1d, norm, dropout) + self.activation = nn.Identity() def forward(self, x): - output = self.net(x) - return output + o = self.convolution1d(x) + o = self.batch_normalization(o) + o = self.activation(o) + o = self.dropout(o) + return o class Postnet(nn.Module): - def __init__(self, mel_dim, num_convs=5): + def __init__(self, output_dim, num_convs=5): super(Postnet, self).__init__() self.convolutions = nn.ModuleList() self.convolutions.append( - ConvBNBlock(mel_dim, 512, kernel_size=5, nonlinear='tanh')) + ConvBNBlock(output_dim, 512, kernel_size=5, activation='tanh')) for _ in range(1, num_convs - 1): self.convolutions.append( - ConvBNBlock(512, 512, kernel_size=5, nonlinear='tanh')) + ConvBNBlock(512, 512, kernel_size=5, activation='tanh')) self.convolutions.append( - ConvBNBlock(512, mel_dim, kernel_size=5, nonlinear=None)) + ConvBNBlock(512, output_dim, kernel_size=5, activation=None)) def forward(self, x): + o = x for layer in self.convolutions: - x = layer(x) - return x + o = layer(o) + return o class Encoder(nn.Module): - def __init__(self, in_features=512): + def __init__(self, output_input_dim=512): super(Encoder, self).__init__() - convolutions = [] + self.convolutions = nn.ModuleList() for _ in range(3): - convolutions.append( - ConvBNBlock(in_features, in_features, 5, 'relu')) - self.convolutions = nn.Sequential(*convolutions) - self.lstm = nn.LSTM(in_features, - int(in_features / 2), + self.convolutions.append( + ConvBNBlock(output_input_dim, output_input_dim, 5, 'relu')) + self.lstm = nn.LSTM(output_input_dim, + int(output_input_dim / 2), num_layers=1, batch_first=True, bidirectional=True) self.rnn_state = None def forward(self, x, input_lengths): - x = self.convolutions(x) - x = x.transpose(1, 2) - x = nn.utils.rnn.pack_padded_sequence(x, + o = x + for layer in self.convolutions: + o = layer(o) + o = o.transpose(1, 2) + o = nn.utils.rnn.pack_padded_sequence(o, 
input_lengths, batch_first=True) self.lstm.flatten_parameters() - outputs, _ = self.lstm(x) - outputs, _ = nn.utils.rnn.pad_packed_sequence( - outputs, - batch_first=True, - ) - return outputs + o, _ = self.lstm(o) + o, _ = nn.utils.rnn.pad_packed_sequence(o, batch_first=True) + return o def inference(self, x): - x = self.convolutions(x) - x = x.transpose(1, 2) + o = x + for layer in self.convolutions: + o = layer(o) + o = x.transpose(1, 2) self.lstm.flatten_parameters() - outputs, _ = self.lstm(x) - return outputs - - def inference_truncated(self, x): - """ - Preserve encoder state for continuous inference - """ - x = self.convolutions(x) - x = x.transpose(1, 2) - self.lstm.flatten_parameters() - outputs, self.rnn_state = self.lstm(x, self.rnn_state) - return outputs + o, _ = self.lstm(o) + return o # adapted from https://github.com/NVIDIA/tacotron2/ class Decoder(nn.Module): # Pylint gets confused by PyTorch conventions here #pylint: disable=attribute-defined-outside-init - def __init__(self, in_features, memory_dim, r, attn_type, attn_win, attn_norm, + def __init__(self, input_dim, frame_dim, r, attn_type, attn_win, attn_norm, prenet_type, prenet_dropout, forward_attn, trans_agent, forward_attn_mask, location_attn, attn_K, separate_stopnet, speaker_embedding_dim): super(Decoder, self).__init__() - self.memory_dim = memory_dim + self.frame_dim = frame_dim self.r_init = r self.r = r - self.encoder_embedding_dim = in_features + self.encoder_embedding_dim = input_dim self.separate_stopnet = separate_stopnet + self.max_decoder_steps = 1000 + self.gate_threshold = 0.5 + + # model dimensions self.query_dim = 1024 self.decoder_rnn_dim = 1024 self.prenet_dim = 256 - self.max_decoder_steps = 1000 - self.gate_threshold = 0.5 + self.attn_dim = 128 self.p_attention_dropout = 0.1 self.p_decoder_dropout = 0.1 # memory -> |Prenet| -> processed_memory - prenet_dim = self.memory_dim - self.prenet = Prenet( - prenet_dim, - prenet_type, - prenet_dropout, - out_features=[self.prenet_dim, self.prenet_dim], - bias=False) + prenet_dim = self.frame_dim + self.prenet = Prenet(prenet_dim, + prenet_type, + prenet_dropout, + out_features=[self.prenet_dim, self.prenet_dim], + bias=False) - self.attention_rnn = nn.LSTMCell(self.prenet_dim + in_features, + self.attention_rnn = nn.LSTMCell(self.prenet_dim + input_dim, self.query_dim) self.attention = init_attn(attn_type=attn_type, query_dim=self.query_dim, - embedding_dim=in_features, + embedding_dim=input_dim, attention_dim=128, location_attention=location_attn, attention_location_n_filters=32, @@ -141,15 +137,15 @@ class Decoder(nn.Module): forward_attn_mask=forward_attn_mask, attn_K=attn_K) - self.decoder_rnn = nn.LSTMCell(self.query_dim + in_features, + self.decoder_rnn = nn.LSTMCell(self.query_dim + input_dim, self.decoder_rnn_dim, 1) - self.linear_projection = Linear(self.decoder_rnn_dim + in_features, - self.memory_dim * self.r_init) + self.linear_projection = Linear(self.decoder_rnn_dim + input_dim, + self.frame_dim * self.r_init) self.stopnet = nn.Sequential( nn.Dropout(0.1), - Linear(self.decoder_rnn_dim + self.memory_dim * self.r_init, + Linear(self.decoder_rnn_dim + self.frame_dim * self.r_init, 1, bias=True, init_gain='sigmoid')) @@ -161,7 +157,7 @@ class Decoder(nn.Module): def get_go_frame(self, inputs): B = inputs.size(0) memory = torch.zeros(1, device=inputs.device).repeat(B, - self.memory_dim * self.r) + self.frame_dim * self.r) return memory def _init_states(self, inputs, mask, keep_states=False): @@ -187,9 +183,9 @@ class Decoder(nn.Module): Reshape 
the spectrograms for given 'r' """ # Grouping multiple frames if necessary - if memory.size(-1) == self.memory_dim: + if memory.size(-1) == self.frame_dim: memory = memory.view(memory.shape[0], memory.size(1) // self.r, -1) - # Time first (T_decoder, B, memory_dim) + # Time first (T_decoder, B, frame_dim) memory = memory.transpose(0, 1) return memory @@ -197,22 +193,22 @@ class Decoder(nn.Module): alignments = torch.stack(alignments).transpose(0, 1) stop_tokens = torch.stack(stop_tokens).transpose(0, 1) outputs = torch.stack(outputs).transpose(0, 1).contiguous() - outputs = outputs.view(outputs.size(0), -1, self.memory_dim) + outputs = outputs.view(outputs.size(0), -1, self.frame_dim) outputs = outputs.transpose(1, 2) return outputs, stop_tokens, alignments def _update_memory(self, memory): if len(memory.shape) == 2: - return memory[:, self.memory_dim * (self.r - 1):] - return memory[:, :, self.memory_dim * (self.r - 1):] + return memory[:, self.frame_dim * (self.r - 1):] + return memory[:, :, self.frame_dim * (self.r - 1):] def decode(self, memory): ''' shapes: - - memory: B x r * self.memory_dim + - memory: B x r * self.frame_dim ''' # self.context: B x D_en - # query_input: B x D_en + (r * self.memory_dim) + # query_input: B x D_en + (r * self.frame_dim) query_input = torch.cat((memory, self.context), -1) # self.query and self.attention_rnn_cell_state : B x D_attn_rnn self.query, self.attention_rnn_cell_state = self.attention_rnn( @@ -235,16 +231,16 @@ class Decoder(nn.Module): # B x (D_decoder_rnn + D_en) decoder_hidden_context = torch.cat((self.decoder_hidden, self.context), dim=1) - # B x (self.r * self.memory_dim) + # B x (self.r * self.frame_dim) decoder_output = self.linear_projection(decoder_hidden_context) - # B x (D_decoder_rnn + (self.r * self.memory_dim)) + # B x (D_decoder_rnn + (self.r * self.frame_dim)) stopnet_input = torch.cat((self.decoder_hidden, decoder_output), dim=1) if self.separate_stopnet: stop_token = self.stopnet(stopnet_input.detach()) else: stop_token = self.stopnet(stopnet_input) # select outputs for the reduction rate self.r - decoder_output = decoder_output[:, :self.r * self.memory_dim] + decoder_output = decoder_output[:, :self.r * self.frame_dim] return decoder_output, self.attention.attention_weights, stop_token def forward(self, inputs, memories, mask, speaker_embeddings=None): diff --git a/models/tacotron2.py b/models/tacotron2.py index d530774a..3e7adfca 100644 --- a/models/tacotron2.py +++ b/models/tacotron2.py @@ -29,7 +29,7 @@ class Tacotron2(nn.Module): super(Tacotron2, self).__init__() self.postnet_output_dim = postnet_output_dim self.decoder_output_dim = decoder_output_dim - self.n_frames_per_step = r + self.r = r self.bidirectional_decoder = bidirectional_decoder decoder_dim = 512 if num_speakers > 1 else 512 encoder_dim = 512 if num_speakers > 1 else 512 From 736f169cc99f13e2fd7534df3a43d12147bc367b Mon Sep 17 00:00:00 2001 From: erogol Date: Tue, 28 Apr 2020 18:16:37 +0200 Subject: [PATCH 063/104] tf lstm does not match torch lstm wrt bias vectors. So I avoid bias in LSTM as an easy solution. 
--- layers/tacotron2.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/layers/tacotron2.py b/layers/tacotron2.py index 3e439b9b..35a5c0bb 100644 --- a/layers/tacotron2.py +++ b/layers/tacotron2.py @@ -61,6 +61,7 @@ class Encoder(nn.Module): int(output_input_dim / 2), num_layers=1, batch_first=True, + bias=False, bidirectional=True) self.rnn_state = None @@ -121,7 +122,8 @@ class Decoder(nn.Module): bias=False) self.attention_rnn = nn.LSTMCell(self.prenet_dim + input_dim, - self.query_dim) + self.query_dim, + bias=False) self.attention = init_attn(attn_type=attn_type, query_dim=self.query_dim, From de2918c85b5afb2648d2f39a0a47fcee204ba101 Mon Sep 17 00:00:00 2001 From: erogol Date: Fri, 1 May 2020 14:34:14 +0200 Subject: [PATCH 064/104] bug fixes --- layers/tacotron2.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/layers/tacotron2.py b/layers/tacotron2.py index 35a5c0bb..10c03570 100644 --- a/layers/tacotron2.py +++ b/layers/tacotron2.py @@ -82,8 +82,8 @@ class Encoder(nn.Module): o = x for layer in self.convolutions: o = layer(o) - o = x.transpose(1, 2) - self.lstm.flatten_parameters() + o = o.transpose(1, 2) + # self.lstm.flatten_parameters() o, _ = self.lstm(o) return o @@ -140,7 +140,8 @@ class Decoder(nn.Module): attn_K=attn_K) self.decoder_rnn = nn.LSTMCell(self.query_dim + input_dim, - self.decoder_rnn_dim, 1) + self.decoder_rnn_dim, + bias=False) self.linear_projection = Linear(self.decoder_rnn_dim + input_dim, self.frame_dim * self.r_init) From 9504b71f79cd58cad456654aa4d28740662ff3d8 Mon Sep 17 00:00:00 2001 From: erogol Date: Fri, 1 May 2020 23:06:51 +0200 Subject: [PATCH 065/104] fix lstm biases True --- layers/tacotron2.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/layers/tacotron2.py b/layers/tacotron2.py index 10c03570..4454c89e 100644 --- a/layers/tacotron2.py +++ b/layers/tacotron2.py @@ -123,7 +123,7 @@ class Decoder(nn.Module): self.attention_rnn = nn.LSTMCell(self.prenet_dim + input_dim, self.query_dim, - bias=False) + bias=True) self.attention = init_attn(attn_type=attn_type, query_dim=self.query_dim, @@ -141,7 +141,7 @@ class Decoder(nn.Module): self.decoder_rnn = nn.LSTMCell(self.query_dim + input_dim, self.decoder_rnn_dim, - bias=False) + bias=True) self.linear_projection = Linear(self.decoder_rnn_dim + input_dim, self.frame_dim * self.r_init) From 6f5c8773d6486e86418496913f53b0c8ec82d087 Mon Sep 17 00:00:00 2001 From: erogol Date: Mon, 4 May 2020 21:03:03 +0200 Subject: [PATCH 066/104] enable encoder lstm bias --- layers/tacotron2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/layers/tacotron2.py b/layers/tacotron2.py index 4454c89e..b9aec6fe 100644 --- a/layers/tacotron2.py +++ b/layers/tacotron2.py @@ -61,7 +61,7 @@ class Encoder(nn.Module): int(output_input_dim / 2), num_layers=1, batch_first=True, - bias=False, + bias=True, bidirectional=True) self.rnn_state = None From d99fda8e42c0f131ed138a1fb29e188819977093 Mon Sep 17 00:00:00 2001 From: erogol Date: Tue, 5 May 2020 17:36:12 +0200 Subject: [PATCH 067/104] init batch norm explicit initial values --- layers/common_layers.py | 2 +- layers/tacotron2.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/layers/common_layers.py b/layers/common_layers.py index d2afe012..24433269 100644 --- a/layers/common_layers.py +++ b/layers/common_layers.py @@ -33,7 +33,7 @@ class LinearBN(nn.Module): super(LinearBN, self).__init__() self.linear_layer = torch.nn.Linear( in_features, out_features, bias=bias) - 
self.batch_normalization = nn.BatchNorm1d(out_features) + self.batch_normalization = nn.BatchNorm1d(out_features, momentum=0.1, eps=1e-5) self._init_w(init_gain) def _init_w(self, init_gain): diff --git a/layers/tacotron2.py b/layers/tacotron2.py index b9aec6fe..bdb169be 100644 --- a/layers/tacotron2.py +++ b/layers/tacotron2.py @@ -14,7 +14,7 @@ class ConvBNBlock(nn.Module): out_channels, kernel_size, padding=padding) - self.batch_normalization = nn.BatchNorm1d(out_channels) + self.batch_normalization = nn.BatchNorm1d(out_channels, momentum=0.1, eps=1e-5) self.dropout = nn.Dropout(p=0.5) if activation == 'relu': self.activation = nn.ReLU()
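Making momentum=0.1 and eps=1e-5 explicit only restates the torch BatchNorm1d defaults, but it pins down exactly what the TF port has to reproduce: Keras defines momentum as the decay of the moving average (the complement of torch's definition) and defaults to momentum=0.99, epsilon=1e-3. A hedged sketch of the matching TF layer (for illustration only, not code from this repo):

import tensorflow as tf

# torch:    running = (1 - 0.1) * running + 0.1 * batch_stat
# tf.keras: moving  = 0.9 * moving + (1 - 0.9) * batch_stat  -> the same update
batch_normalization = tf.keras.layers.BatchNormalization(momentum=0.9, epsilon=1e-5)

From b3ec50b5c4f1bebfea642af0298ce62a5c3bc518 Mon Sep 17 00:00:00 2001 From: erogol Date: Wed, 6 May 2020 16:37:30 +0200 Subject: [PATCH 068/104] tf backend for synthesis --- utils/synthesis.py | 91 +++++++++++++++++++++++++++++++++++----------- utils/visual.py | 1 - 2 files changed, 70 insertions(+), 22 deletions(-) diff --git a/utils/synthesis.py b/utils/synthesis.py index 9158ef02..0c68dbf2 100644 --- a/utils/synthesis.py +++ b/utils/synthesis.py @@ -1,3 +1,7 @@ +import pkg_resources +installed = {pkg.key for pkg in pkg_resources.working_set} +if 'tensorflow' in installed: + import tensorflow as tf import torch import numpy as np from .text import text_to_sequence, phoneme_to_sequence @@ -14,23 +18,32 @@ def text_to_seqvec(text, CONFIG, use_cuda): dtype=np.int32) else: seq = np.asarray(text_to_sequence(text, text_cleaner, tp=CONFIG.characters if 'characters' in CONFIG.keys() else None), dtype=np.int32) - # torch tensor - chars_var = torch.from_numpy(seq).unsqueeze(0) - if use_cuda: - chars_var = chars_var.cuda() - return chars_var.long() + return seq + + +def numpy_to_torch(np_array, dtype, cuda=False): + if np_array is None: + return None + tensor = torch.Tensor(np_array, dtype=dtype) + if cuda: + return tensor.cuda() + return tensor + + +def numpy_to_tf(np_array, dtype): + if np_array is None: + return None + tensor = tf.convert_to_tensor(np_array, dtype=dtype) + return tensor def compute_style_mel(style_wav, ap, use_cuda): - print(style_wav) - style_mel = torch.FloatTensor(ap.melspectrogram( - ap.load_wav(style_wav))).unsqueeze(0) - if use_cuda: - return style_mel.cuda() + style_mel = ap.melspectrogram( + ap.load_wav(style_wav)).expand_dims(0) return style_mel -def run_model(model, inputs, CONFIG, truncated, speaker_id=None, style_mel=None): +def run_model_torch(model, inputs, CONFIG, truncated, speaker_id=None, style_mel=None): if CONFIG.use_gst: decoder_output, postnet_output, alignments, stop_tokens = model.inference( inputs, style_mel=style_mel, speaker_ids=speaker_id) @@ -44,11 +57,31 @@ def run_model(model, inputs, CONFIG, truncated, speaker_id=None, style_mel=None) return decoder_output, postnet_output, alignments, stop_tokens -def parse_outputs(postnet_output, decoder_output, alignments): +def run_model_tf(model, inputs, CONFIG, truncated, speaker_id=None, style_mel=None): + if CONFIG.use_gst: + raise NotImplemented(' [!] GST inference not implemented for TF') + if truncated: + raise NotImplemented(' [!]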
Truncated inference not implemented for TF') + # TODO: handle multispeaker case + decoder_output, postnet_output, alignments, stop_tokens = model( + inputs, training=False) + return decoder_output, postnet_output, alignments, stop_tokens + + +def parse_outputs_torch(postnet_output, decoder_output, alignments, stop_tokens): postnet_output = postnet_output[0].data.cpu().numpy() decoder_output = decoder_output[0].data.cpu().numpy() alignment = alignments[0].cpu().data.numpy() - return postnet_output, decoder_output, alignment + stop_tokens = stop_tokens[0].cpu().numpy() + return postnet_output, decoder_output, alignment, stop_tokens + + +def parse_outputs_tf(postnet_output, decoder_output, alignments, stop_tokens): + postnet_output = postnet_output[0].numpy() + decoder_output = decoder_output[0].numpy() + alignment = alignments[0].numpy() + stop_tokens = stop_tokens[0].numpy() + return postnet_output, decoder_output, alignment, stop_tokens def trim_silence(wav, ap): @@ -98,7 +131,8 @@ def synthesis(model, truncated=False, enable_eos_bos_chars=False, #pylint: disable=unused-argument use_griffin_lim=False, - do_trim_silence=False): + do_trim_silence=False, + backend='torch'): """Synthesize voice for the given text. Args: @@ -114,6 +148,7 @@ def synthesis(model, for continuous inference at long texts. enable_eos_bos_chars (bool): enable special chars for end of sentence and start of sentence. do_trim_silence (bool): trim silence after synthesis. + backend (str): tf or torch """ # GST processing style_mel = None @@ -121,15 +156,29 @@ def synthesis(model, style_mel = compute_style_mel(style_wav, ap, use_cuda) # preprocess the given text inputs = text_to_seqvec(text, CONFIG, use_cuda) - speaker_id = id_to_torch(speaker_id) - if speaker_id is not None and use_cuda: - speaker_id = speaker_id.cuda() + # pass tensors to backend + if backend == 'torch': + speaker_id = id_to_torch(speaker_id) + style_mel = numpy_to_torch(style_mel, torch.float, cuda=use_cuda) + inputs = numpy_to_torch(inputs, torch.long, cuda=use_cuda) + inputs = inputs.unsqueeze(0) + else: + # TODO: handle speaker id for tf model + style_mel = numpy_to_tf(style_mel, tf.float32) + inputs = numpy_to_tf(inputs, tf.int32) + inputs = tf.expand_dims(inputs, 0) # synthesize voice - decoder_output, postnet_output, alignments, stop_tokens = run_model( - model, inputs, CONFIG, truncated, speaker_id, style_mel) + if backend == 'torch': + decoder_output, postnet_output, alignments, stop_tokens = run_model_torch( + model, inputs, CONFIG, truncated, speaker_id, style_mel) + postnet_output, decoder_output, alignment, stop_tokens = parse_outputs_torch( + postnet_output, decoder_output, alignments, stop_tokens) + else: + decoder_output, postnet_output, alignments, stop_tokens = run_model_tf( + model, inputs, CONFIG, truncated, speaker_id, style_mel) + postnet_output, decoder_output, alignment, stop_tokens = parse_outputs_tf( + postnet_output, decoder_output, alignments, stop_tokens) # convert outputs to numpy - postnet_output, decoder_output, alignment = parse_outputs( - postnet_output, decoder_output, alignments) # plot results wav = None if use_griffin_lim: diff --git a/utils/visual.py b/utils/visual.py index 8789cf8f..87fbc8e4 100644 --- a/utils/visual.py +++ b/utils/visual.py @@ -61,7 +61,6 @@ def visualize(alignment, postnet_output, stop_tokens, text, hop_length, CONFIG, plt.yticks(range(len(text)), list(text)) plt.colorbar() # plot stopnet predictions - stop_tokens = stop_tokens.squeeze().detach().to('cpu').numpy() plt.subplot(num_plot, 1, 2) 
plt.plot(range(len(stop_tokens)), list(stop_tokens)) # plot postnet spectrogram From 84c5c4a5871ffe3f87e01feee429a9ffec154c0e Mon Sep 17 00:00:00 2001 From: erogol Date: Tue, 12 May 2020 13:46:16 +0200 Subject: [PATCH 069/104] config remove empty chars --- config.json | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/config.json b/config.json index da3fe286..c23bd004 100644 --- a/config.json +++ b/config.json @@ -1,5 +1,5 @@ { - "model": "Tacotron2", + "model": "Tacotron2", "run_name": "ljspeech", "run_description": "tacotron2", @@ -11,12 +11,12 @@ "hop_length": 256, // stft window hop-lengh in ms. "frame_length_ms": null, // stft window length in ms.If null, 'win_length' is used. "frame_shift_ms": null, // stft window hop-lengh in ms. If null, 'hop_length' is used. - + // Audio processing parameters "sample_rate": 22050, // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled. "preemphasis": 0.0, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis. "ref_level_db": 20, // reference level db, theoretically 20db is the sound of air. - + // Silence trimming "do_trim_silence": true,// enable trimming of slience of audio as you load it. LJspeech (false), TWEB (false), Nancy (true) "trim_db": 60, // threshold for timming silence. Set this according to your dataset. @@ -26,7 +26,7 @@ "griffin_lim_iters": 60,// #griffin-lim iterations. 30-60 is a good range. Larger the value, slower the generation. // MelSpectrogram parameters - "num_mels": 80, // size of the mel spec frame. + "num_mels": 80, // size of the mel spec frame. "mel_fmin": 0.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!! "mel_fmax": 8000.0, // maximum freq level for mel-spec. Tune for dataset!! @@ -50,7 +50,7 @@ // "punctuations":"!'(),-.:;? ", // "phonemes":"iyɨʉɯuɪʏʊeøɘəɵɤoɛœɜɞʌɔæɐaɶɑɒᵻʘɓǀɗǃʄǂɠǁʛpbtdʈɖcɟkɡqɢʔɴŋɲɳnɱmʙrʀⱱɾɽɸβfvθðszʃʒʂʐçʝxɣχʁħʕhɦɬɮʋɹɻjɰlɭʎʟˈˌːˑʍwɥʜʢʡɕʑɺɧɚ˞ɫ" // }, - + // DISTRIBUTED TRAINING "distributed":{ "backend": "nccl", @@ -61,8 +61,8 @@ // TRAINING "batch_size": 32, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'. - "eval_batch_size":16, - "r": 7, // Number of decoder frames to predict per iteration. Set the initial values if gradual training is enabled. + "eval_batch_size":16, + "r": 7, // Number of decoder frames to predict per iteration. Set the initial values if gradual training is enabled. "gradual_training": [[0, 7, 64], [1, 5, 64], [50000, 3, 32], [130000, 2, 32], [290000, 1, 32]], //set gradual training steps [first_step, r, batch_size]. If it is null, gradual training is disabled. For Tacotron, you might need to reduce the 'batch_size' as you proceeed. "loss_masking": true, // enable / disable loss masking against the sequence padding. "ga_alpha": 10.0, // weight for guided attention loss. If > 0, guided attention is enabled. @@ -80,11 +80,11 @@ "wd": 0.000001, // Weight decay weight. "warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr" "seq_len_norm": false, // Normalize eash sample loss with its length to alleviate imbalanced datasets. Use it if your dataset is small or has skewed distribution of sequence lengths. - + // TACOTRON PRENET - "memory_size": -1, // ONLY TACOTRON - size of the memory queue used fro storing last decoder predictions for auto-regression. 
If < 0, memory queue is disabled and decoder only uses the last prediction frame. + "memory_size": -1, // ONLY TACOTRON - size of the memory queue used fro storing last decoder predictions for auto-regression. If < 0, memory queue is disabled and decoder only uses the last prediction frame. "prenet_type": "original", // "original" or "bn". - "prenet_dropout": true, // enable/disable dropout at prenet. + "prenet_dropout": true, // enable/disable dropout at prenet. // ATTENTION "attention_type": "original", // 'original' or 'graves' @@ -98,16 +98,16 @@ "bidirectional_decoder": false, // use https://arxiv.org/abs/1907.09006. Use it, if attention does not work well with your dataset. // STOPNET - "stopnet": true, // Train stopnet predicting the end of synthesis. + "stopnet": true, // Train stopnet predicting the end of synthesis. "separate_stopnet": true, // Train stopnet seperately if 'stopnet==true'. It prevents stopnet loss to influence the rest of the model. It causes a better model, but it trains SLOWER. // TENSORBOARD and LOGGING "print_step": 25, // Number of steps to log traning on console. - "print_eval": false, // If True, it prints loss values in evalulation. + "print_eval": false, // If True, it prints loss values in evalulation. "save_step": 10000, // Number of training steps expected to save traninpg stats and checkpoints. "checkpoint": true, // If true, it saves checkpoints per "save_step" - "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging. - + "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging. + // DATA LOADING "text_cleaner": "phoneme_cleaners", "enable_eos_bos_chars": false, // enable/disable beginning of sentence and end of sentence chars. @@ -119,7 +119,7 @@ // PATHS "output_path": "/home/erogol/Models/LJSpeech/", - + // PHONEMES "phoneme_cache_path": "mozilla_us_phonemes_3", // phoneme computation is slow, therefore, it caches results in the given folder. "use_phonemes": true, // use phonemes instead of raw characters. It is suggested for better pronounciation. 
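These config files rely on //-style comments, which plain json.loads would reject; load_config() in utils/io.py (shown in the refactoring patch above) strips them with a regex before parsing and wraps the result in an AttrDict, so every top-level key is also reachable as an attribute. A minimal usage sketch:

from TTS.utils.io import load_config

c = load_config('config.json')   # comments stripped via re.sub, then json.loads
print(c.model)                   # "Tacotron2"; AttrDict aliases __dict__ to the dict itself
print(c['batch_size'])           # plain dict access still works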
From 68dbcee746a775df42250b1940b54e88ae670e62 Mon Sep 17 00:00:00 2001 From: erogol Date: Tue, 12 May 2020 13:49:49 +0200 Subject: [PATCH 070/104] import condition update for synthesis with TF --- utils/synthesis.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/utils/synthesis.py b/utils/synthesis.py index 0c68dbf2..ae3a7df7 100644 --- a/utils/synthesis.py +++ b/utils/synthesis.py @@ -1,6 +1,6 @@ import pkg_resources installed = {pkg.key for pkg in pkg_resources.working_set} -if 'tensorflow' in installed: +if 'tensorflow' in installed or 'tensorflow-gpu' in installed: import tensorflow as tf import torch import numpy as np @@ -24,9 +24,9 @@ def text_to_seqvec(text, CONFIG, use_cuda): def numpy_to_torch(np_array, dtype, cuda=False): if np_array is None: return None - tensor = torch.Tensor(np_array, dtype=dtype) + tensor = torch.Tensor(np_array, dtype=dtype) if cuda: - return tensor.cuda() + return tensor.cuda() return tensor From 1cd25ccf0d3842fc954263f5d538827cfb82e040 Mon Sep 17 00:00:00 2001 From: erogol Date: Tue, 12 May 2020 16:16:55 +0200 Subject: [PATCH 071/104] bug fix --- utils/synthesis.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/synthesis.py b/utils/synthesis.py index ae3a7df7..188a3acf 100644 --- a/utils/synthesis.py +++ b/utils/synthesis.py @@ -24,7 +24,7 @@ def text_to_seqvec(text, CONFIG, use_cuda): def numpy_to_torch(np_array, dtype, cuda=False): if np_array is None: return None - tensor = torch.Tensor(np_array, dtype=dtype) + tensor = torch.as_tensor(np_array, dtype=dtype) if cuda: return tensor.cuda() return tensor From 85a822e3190f6c7207c28f7485ecb0ebe7e480ba Mon Sep 17 00:00:00 2001 From: mittimithai Date: Tue, 12 May 2020 15:02:24 -0700 Subject: [PATCH 072/104] small change for multispeaker just threads speaker_id through decoder.run_model --- server/synthesizer.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/server/synthesizer.py b/server/synthesizer.py index e9205bf1..411be928 100644 --- a/server/synthesizer.py +++ b/server/synthesizer.py @@ -164,16 +164,21 @@ class Synthesizer(object): sentences = list(filter(None, [s.strip() for s in sentences])) # remove empty sentences return sentences - def tts(self, text): + def tts(self, text, speaker_id=None): wavs = [] sens = self.split_into_sentences(text) print(sens) + + speaker_id = id_to_torch(speaker_id) ++ if speaker_id is not None and self.use_cuda: ++ speaker_id = speaker_id.cuda() + for sen in sens: # preprocess the given text inputs = text_to_seqvec(sen, self.tts_config, self.use_cuda) # synthesize voice decoder_output, postnet_output, alignments, _ = run_model( - self.tts_model, inputs, self.tts_config, False, None, None) + self.tts_model, inputs, self.tts_config, False, speaker_id, None) # convert outputs to numpy postnet_output, decoder_output, _ = parse_outputs( postnet_output, decoder_output, alignments) From a4aca623c34243e065bd49ee5520fd714e461c1a Mon Sep 17 00:00:00 2001 From: mittimithai Date: Tue, 12 May 2020 15:23:45 -0700 Subject: [PATCH 073/104] removed + chars silly mistake copy pasting --- server/synthesizer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/server/synthesizer.py b/server/synthesizer.py index 411be928..dee2cd4b 100644 --- a/server/synthesizer.py +++ b/server/synthesizer.py @@ -170,8 +170,8 @@ class Synthesizer(object): print(sens) speaker_id = id_to_torch(speaker_id) -+ if speaker_id is not None and self.use_cuda: -+ speaker_id = speaker_id.cuda() + if speaker_id is not None 
and self.use_cuda: + speaker_id = speaker_id.cuda() for sen in sens: # preprocess the given text From 42ff83f9b9e1151ccaba6d7f9d8f520d274a0d31 Mon Sep 17 00:00:00 2001 From: mittimithai Date: Tue, 12 May 2020 18:50:58 -0700 Subject: [PATCH 074/104] trying to fix trailing whitespace --- server/synthesizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/server/synthesizer.py b/server/synthesizer.py index dee2cd4b..3035a287 100644 --- a/server/synthesizer.py +++ b/server/synthesizer.py @@ -172,7 +172,7 @@ class Synthesizer(object): speaker_id = id_to_torch(speaker_id) if speaker_id is not None and self.use_cuda: speaker_id = speaker_id.cuda() - + for sen in sens: # preprocess the given text inputs = text_to_seqvec(sen, self.tts_config, self.use_cuda) From 25f466f2994dad707f0aa00db36187d6a43844ee Mon Sep 17 00:00:00 2001 From: mittimithai Date: Tue, 12 May 2020 19:02:37 -0700 Subject: [PATCH 075/104] more whitespace problems --- server/synthesizer.py | 1 - 1 file changed, 1 deletion(-) diff --git a/server/synthesizer.py b/server/synthesizer.py index 3035a287..453e5827 100644 --- a/server/synthesizer.py +++ b/server/synthesizer.py @@ -168,7 +168,6 @@ class Synthesizer(object): wavs = [] sens = self.split_into_sentences(text) print(sens) - speaker_id = id_to_torch(speaker_id) if speaker_id is not None and self.use_cuda: speaker_id = speaker_id.cuda() From d5d9e6e8ea87995a5679d78b24e7df2a3c88e185 Mon Sep 17 00:00:00 2001 From: erogol Date: Wed, 13 May 2020 13:52:17 +0200 Subject: [PATCH 076/104] bug fix --- utils/synthesis.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/synthesis.py b/utils/synthesis.py index 188a3acf..3903ba44 100644 --- a/utils/synthesis.py +++ b/utils/synthesis.py @@ -64,7 +64,7 @@ def run_model_tf(model, inputs, CONFIG, truncated, speaker_id=None, style_mel=No raise NotImplemented(' [!] Truncated inference not implemented for TF') # TODO: handle multispeaker case decoder_output, postnet_output, alignments, stop_tokens = model( - inputs, training=False) + inputs) return decoder_output, postnet_output, alignments, stop_tokens From 65b9c7d3d655e34741ff5adf75bc1b2a6df22158 Mon Sep 17 00:00:00 2001 From: thllwg Date: Fri, 15 May 2020 10:19:52 +0200 Subject: [PATCH 077/104] number of workers as config parameter --- speaker_encoder/config.json | 1 + speaker_encoder/train.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/speaker_encoder/config.json b/speaker_encoder/config.json index 79c42bc0..0d0f8f68 100644 --- a/speaker_encoder/config.json +++ b/speaker_encoder/config.json @@ -34,6 +34,7 @@ "save_step": 1000, // Number of training steps expected to save traning stats and checkpoints. "print_step": 1, // Number of steps to log traning on console. "output_path": "/media/erogol/data_ssd/Models/libri_tts/speaker_encoder/", // DATASET-RELATED: output path for all training outputs. + "num_loader_workers": 0, // number of training data loader processes. Don't set it too big. 4-8 are good values. 
"model": { "input_dim": 40, "proj_dim": 128, diff --git a/speaker_encoder/train.py b/speaker_encoder/train.py index 19067401..0a137360 100644 --- a/speaker_encoder/train.py +++ b/speaker_encoder/train.py @@ -44,7 +44,7 @@ def setup_loader(ap, is_val=False, verbose=False): loader = DataLoader(dataset, batch_size=c.num_speakers_in_batch, shuffle=False, - num_workers=0, + num_workers=c.num_loader_workers, collate_fn=dataset.collate_fn) return loader From 67397be1c096a37e0d4e97a729f8c1c144feca35 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Mon, 18 May 2020 11:02:36 +0200 Subject: [PATCH 078/104] tf folder add --- tf/README.md | 4 + tf/convert_tacotron2_torch_to_tf.py | 196 +++++++ tf/layers/common_layers.py | 258 ++++++++++ tf/layers/tacotron2.py | 231 +++++++++ tf/models/tacotron2.py | 72 +++ tf/notebooks/Benchmark-TTS_tf.ipynb | 708 ++++++++++++++++++++++++++ tf/requirements | 2 + tf/utils/convert_torch_to_tf_utils.py | 83 +++ tf/utils/generic_utils.py | 105 ++++ tf/utils/tf_utils.py | 8 + 10 files changed, 1667 insertions(+) create mode 100644 tf/README.md create mode 100644 tf/convert_tacotron2_torch_to_tf.py create mode 100644 tf/layers/common_layers.py create mode 100644 tf/layers/tacotron2.py create mode 100644 tf/models/tacotron2.py create mode 100644 tf/notebooks/Benchmark-TTS_tf.ipynb create mode 100644 tf/requirements create mode 100644 tf/utils/convert_torch_to_tf_utils.py create mode 100644 tf/utils/generic_utils.py create mode 100644 tf/utils/tf_utils.py diff --git a/tf/README.md b/tf/README.md new file mode 100644 index 00000000..24e09a06 --- /dev/null +++ b/tf/README.md @@ -0,0 +1,4 @@ +## Utilities to Convert Models to Tensorflow2 +You can find some utilities to convert Torch models to Tensorflow with an experimental Tacotron2 implemenation in Tensorflow2 (>=2.2). However, our released Torch models may not work with this module due to additional changes layer naming convention. Therefore, you need to train new models to be converted to TF. + +This is an experimental release. If you encounter an error, please put an issue or in the best send a PR but you are mostly on your own. 
\ No newline at end of file diff --git a/tf/convert_tacotron2_torch_to_tf.py b/tf/convert_tacotron2_torch_to_tf.py new file mode 100644 index 00000000..512b0a4d --- /dev/null +++ b/tf/convert_tacotron2_torch_to_tf.py @@ -0,0 +1,196 @@ +# %% +import sys +sys.path.append('/home/erogol/Projects') +import os +os.environ['CUDA_VISIBLE_DEVICES'] = '' +# %% +import argparse +import numpy as np +import torch +import tensorflow as tf +from fuzzywuzzy import fuzz + +from TTS.utils.text.symbols import make_symbols, phonemes, symbols +from TTS.utils.generic_utils import setup_model, count_parameters +from TTS.utils.io import load_config +from TTS_tf.models.tacotron2 import Tacotron2 +from TTS_tf.utils.convert_torch_to_tf_utils import compare_torch_tf, tf_create_dummy_inputs, transfer_weights_torch_to_tf, convert_tf_name +from TTS_tf.utils.generic_utils import save_checkpoint + + +parser = argparse.ArgumentParser() +parser.add_argument( + '--torch_model_path', + type=str, + help='Path to target torch model to be converted to TF.') +parser.add_argument( + '--config_path', + type=str, + help='Path to config file of torch model.') +parser.add_argument( + '--output_path', + type=str, + help='path to save TF model weights.') +args = parser.parse_args() + +# load model config +config_path = args.config_path +c = load_config(config_path) +num_speakers = 0 + +# init torch model +num_chars = len(phonemes) if c.use_phonemes else len(symbols) +model = setup_model(num_chars, num_speakers, c) +checkpoint = torch.load(args.torch_model_path, map_location=torch.device('cpu')) +state_dict = checkpoint['model'] +model.load_state_dict(state_dict) + +# init tf model +model_tf = Tacotron2(num_chars=num_chars, + num_speakers=num_speakers, + r=model.decoder.r, + postnet_output_dim=c.audio['num_mels'], + decoder_output_dim=c.audio['num_mels'], + attn_type=c.attention_type, + attn_win=c.windowing, + attn_norm=c.attention_norm, + prenet_type=c.prenet_type, + prenet_dropout=c.prenet_dropout, + forward_attn=c.use_forward_attn, + trans_agent=c.transition_agent, + forward_attn_mask=c.forward_attn_mask, + location_attn=c.location_attn, + attn_K=c.attention_heads, + separate_stopnet=c.separate_stopnet, + bidirectional_decoder=c.bidirectional_decoder) + +# set initial layer mapping - these are not captured by the below heuristic approach +# TODO: set layer names so that we can remove these manual matching +common_sufix = '/.ATTRIBUTES/VARIABLE_VALUE' +var_map = [ + ('tacotron2/embedding/embeddings:0', 'embedding.weight'), + ('tacotron2/encoder/lstm/forward_lstm/lstm_cell_1/kernel:0', 'encoder.lstm.weight_ih_l0'), + ('tacotron2/encoder/lstm/forward_lstm/lstm_cell_1/recurrent_kernel:0', 'encoder.lstm.weight_hh_l0'), + ('tacotron2/encoder/lstm/backward_lstm/lstm_cell_2/kernel:0', 'encoder.lstm.weight_ih_l0_reverse'), + ('tacotron2/encoder/lstm/backward_lstm/lstm_cell_2/recurrent_kernel:0', 'encoder.lstm.weight_hh_l0_reverse'), + ('tacotron2/encoder/lstm/forward_lstm/lstm_cell_1/bias:0', ('encoder.lstm.bias_ih_l0', 'encoder.lstm.bias_hh_l0')), + ('tacotron2/encoder/lstm/backward_lstm/lstm_cell_2/bias:0', ('encoder.lstm.bias_ih_l0_reverse', 'encoder.lstm.bias_hh_l0_reverse')), + ('attention/v/kernel:0', 'decoder.attention.v.linear_layer.weight'), + ('decoder/linear_projection/kernel:0', 'decoder.linear_projection.linear_layer.weight'), + ('decoder/stopnet/kernel:0', 'decoder.stopnet.1.linear_layer.weight') +] + + +# %% +# get tf_model graph +input_ids, input_lengths, mel_outputs, mel_lengths = tf_create_dummy_inputs() +mel_pred = 
model_tf(input_ids, training=False) + +# get tf variables +tf_vars = model_tf.weights + +# match variable names with fuzzy logic +torch_var_names = list(state_dict.keys()) +tf_var_names = [we.name for we in model_tf.weights] +for tf_name in tf_var_names: + # skip re-mapped layer names + if tf_name in [name[0] for name in var_map]: + continue + tf_name_edited = convert_tf_name(tf_name) + ratios = [fuzz.ratio(torch_name, tf_name_edited) for torch_name in torch_var_names] + max_idx = np.argmax(ratios) + matching_name = torch_var_names[max_idx] + del torch_var_names[max_idx] + var_map.append((tf_name, matching_name)) + + +# %% +# print variable match +from pprint import pprint +pprint(var_map) +pprint(torch_var_names) + +# pass weights +tf_vars = transfer_weights_torch_to_tf(tf_vars, dict(var_map), state_dict) + +# Compare TF and TORCH models +# %% +# check embedding outputs +model.eval() +input_ids = torch.randint(0, 24, (1, 128)).long() + +o_t = model.embedding(input_ids) +o_tf = model_tf.embedding(input_ids.detach().numpy()) +assert abs(o_t.detach().numpy() - o_tf.numpy()).sum() < 1e-5, abs(o_t.detach().numpy() - o_tf.numpy()).sum() + +# compare encoder outputs +oo_en = model.encoder.inference(o_t.transpose(1,2)) +ooo_en = model_tf.encoder(o_t.detach().numpy(), training=False) +assert compare_torch_tf(oo_en, ooo_en) < 1e-5 + +# compare decoder.attention_rnn +inp = torch.rand([1, 768]) +inp_tf = inp.numpy() +model.decoder._init_states(oo_en, mask=None) +output, cell_state = model.decoder.attention_rnn(inp) +states = model_tf.decoder.build_decoder_initial_states(1,512,128) +output_tf, memory_state = model_tf.decoder.attention_rnn(inp_tf, states[2], training=False) +assert compare_torch_tf(output, output_tf).mean() < 1e-5 + +# compare decoder.attention +query = output +inputs = torch.rand([1, 128, 512]) +query_tf = query.detach().numpy() +inputs_tf = inputs.numpy() + +model.decoder.attention.init_states(inputs) +processes_inputs = model.decoder.attention.preprocess_inputs(inputs) +loc_attn, proc_query = model.decoder.attention.get_location_attention(query, processes_inputs) +context = model.decoder.attention(query, inputs, processes_inputs, None) + +model_tf.decoder.attention.process_values(tf.convert_to_tensor(inputs_tf)) +loc_attn_tf, proc_query_tf = model_tf.decoder.attention.get_loc_attn(query_tf) +context_tf = model_tf.decoder.attention(query_tf, training=False) + +assert compare_torch_tf(loc_attn, loc_attn_tf).mean() < 1e-5 +assert compare_torch_tf(proc_query, proc_query_tf).mean() < 1e-5 +assert compare_torch_tf(context, context_tf) < 1e-5 + +# compare decoder.decoder_rnn +input = torch.rand([1, 1536]) +input_tf = input.numpy() +model.decoder._init_states(oo_en, mask=None) +output, cell_state = model.decoder.decoder_rnn(input, [model.decoder.decoder_hidden, model.decoder.decoder_cell]) +states = model_tf.decoder.build_decoder_initial_states(1,512,128) +output_tf, memory_state = model_tf.decoder.decoder_rnn(input_tf, states[3], training=False) +assert abs(input - input_tf).mean() < 1e-5 +assert compare_torch_tf(output, output_tf).mean() < 1e-5 + +# compare decoder.linear_projection +input = torch.rand([1, 1536]) +input_tf = input.numpy() +output = model.decoder.linear_projection(input) +output_tf = model_tf.decoder.linear_projection(input_tf, training=False) +assert compare_torch_tf(output, output_tf) < 1e-5 + +# compare decoder outputs +model.decoder.max_decoder_steps = 100 +model_tf.decoder.set_max_decoder_steps(100) +output, align, stop = model.decoder.inference(oo_en) +states = 
model_tf.decoder.build_decoder_initial_states(1,512,128) +output_tf, align_tf, stop_tf = model_tf.decoder(ooo_en, states, training=False) +assert compare_torch_tf(output.transpose(1,2), output_tf) < 1e-4 + +# compare the whole model output +outputs_torch = model.inference(input_ids) +outputs_tf = model_tf(tf.convert_to_tensor(input_ids.numpy())) +print(abs(outputs_torch[0].numpy()[:, 0] - outputs_tf[0].numpy()[:, 0]).mean() ) +assert compare_torch_tf(outputs_torch[2][:, 50, :], outputs_tf[2][:, 50, :]) < 1e-5 +assert compare_torch_tf(outputs_torch[0], outputs_tf[0]) < 1e-4 + +# %% +# save tf model +save_checkpoint(model_tf, None, checkpoint['step'], checkpoint['epoch'], + checkpoint['r'], args.output_path) +print(' > Model conversion is successfully completed :).') + diff --git a/tf/layers/common_layers.py b/tf/layers/common_layers.py new file mode 100644 index 00000000..fba06e0b --- /dev/null +++ b/tf/layers/common_layers.py @@ -0,0 +1,258 @@ +import tensorflow as tf +from tensorflow import keras +from tensorflow.python.ops import math_ops +# from tensorflow_addons.seq2seq import BahdanauAttention + +from TTS.tf.utils.tf_utils import shape_list + + +class Linear(keras.layers.Layer): + def __init__(self, units, use_bias, **kwargs): + super(Linear, self).__init__(**kwargs) + self.linear_layer = keras.layers.Dense(units, use_bias=use_bias, name='linear_layer') + self.activation = keras.layers.ReLU() + + def call(self, x, training=None): + """ + shapes: + x: B x T x C + """ + return self.activation(self.linear_layer(x)) + + +class LinearBN(keras.layers.Layer): + def __init__(self, units, use_bias, **kwargs): + super(LinearBN, self).__init__(**kwargs) + self.linear_layer = keras.layers.Dense(units, use_bias=use_bias, name='linear_layer') + self.batch_normalization = keras.layers.BatchNormalization(axis=-1, momentum=0.90, epsilon=1e-5, name='batch_normalization') + self.activation = keras.layers.ReLU() + + def call(self, x, training=None): + """ + shapes: + x: B x T x C + """ + out = self.linear_layer(x) + out = self.batch_normalization(out, training=training) + return self.activation(out) + + +class Prenet(keras.layers.Layer): + def __init__(self, + prenet_type, + prenet_dropout, + units, + bias, + **kwargs): + super(Prenet, self).__init__(**kwargs) + self.prenet_type = prenet_type + self.prenet_dropout = prenet_dropout + self.linear_layers = [] + if prenet_type == "bn": + self.linear_layers += [LinearBN(unit, use_bias=bias, name=f'linear_layer_{idx}') for idx, unit in enumerate(units)] + elif prenet_type == "original": + self.linear_layers += [Linear(unit, use_bias=bias, name=f'linear_layer_{idx}') for idx, unit in enumerate(units)] + else: + raise RuntimeError(' [!] 
Unknown prenet type.') + if prenet_dropout: + self.dropout = keras.layers.Dropout(rate=0.5) + + def call(self, x, training=None): + """ + shapes: + x: B x T x C + """ + for linear in self.linear_layers: + if self.prenet_dropout: + x = self.dropout(linear(x), training=training) + else: + x = linear(x) + return x + + +def _sigmoid_norm(score): + attn_weights = tf.nn.sigmoid(score) + attn_weights = attn_weights / tf.reduce_sum(attn_weights, axis=1, keepdims=True) + return attn_weights + + +class Attention(keras.layers.Layer): + """TODO: implement forward_attention""" + """TODO: location sensitive attention""" + """TODO: implement attention windowing """ + def __init__(self, attn_dim, use_loc_attn, loc_attn_n_filters, + loc_attn_kernel_size, use_windowing, norm, use_forward_attn, + use_trans_agent, use_forward_attn_mask, **kwargs): + super(Attention, self).__init__(**kwargs) + self.use_loc_attn = use_loc_attn + self.loc_attn_n_filters = loc_attn_n_filters + self.loc_attn_kernel_size = loc_attn_kernel_size + self.use_windowing = use_windowing + self.norm = norm + self.use_forward_attn = use_forward_attn + self.use_trans_agent = use_trans_agent + self.use_forward_attn_mask = use_forward_attn_mask + self.query_layer = tf.keras.layers.Dense(attn_dim, use_bias=False, name='query_layer/linear_layer') + self.inputs_layer = tf.keras.layers.Dense(attn_dim, use_bias=False, name=f'{self.name}/inputs_layer/linear_layer') + self.v = tf.keras.layers.Dense(1, use_bias=True, name='v/linear_layer') + if use_loc_attn: + self.location_conv1d = keras.layers.Conv1D( + filters=loc_attn_n_filters, + kernel_size=loc_attn_kernel_size, + padding='same', + use_bias=False, + name='location_layer/location_conv1d') + self.location_dense = keras.layers.Dense(attn_dim, use_bias=False, name='location_layer/location_dense') + if norm == 'softmax': + self.norm_func = tf.nn.softmax + elif norm == 'sigmoid': + self.norm_func = _sigmoid_norm + else: + raise ValueError("Unknown value for attention norm type") + + def init_states(self, batch_size, value_length): + states = () + if self.use_loc_attn: + attention_cum = tf.zeros([batch_size, value_length]) + attention_old = tf.zeros([batch_size, value_length]) + states = (attention_cum, attention_old) + return states + + def process_values(self, values): + """ cache values for decoder iterations """ + self.processed_values = self.inputs_layer(values) + self.values = values + + def get_loc_attn(self, query, states): + """ compute location attention, query layer and + unnorm. attention weights""" + attention_cum, attention_old = states + attn_cat = tf.stack([attention_old, attention_cum], + axis=2) + + processed_query = self.query_layer(tf.expand_dims(query, 1)) + processed_attn = self.location_dense(self.location_conv1d(attn_cat)) + score = self.v( + tf.nn.tanh(self.processed_values + processed_query + + processed_attn)) + score = tf.squeeze(score, axis=2) + return score, processed_query + + def get_attn(self, query): + """ compute query layer and unnormalized attention weights """ + processed_query = self.query_layer(tf.expand_dims(query, 1)) + score = self.v(tf.nn.tanh(self.processed_values + processed_query)) + score = tf.squeeze(score, axis=2) + return score, processed_query + + def apply_score_masking(self, score, mask): + """ ignore sequence paddings """ + padding_mask = tf.expand_dims(math_ops.logical_not(mask), 2) + # Bias so padding positions do not contribute to attention distribution. 
+ score -= 1.e9 * math_ops.cast(padding_mask, dtype=tf.float32) + return score + + def call(self, query, states): + """ + shapes: + query: B x D + """ + if self.use_loc_attn: + score, processed_query = self.get_loc_attn(query, states) + else: + score, processed_query = self.get_attn(query) + + # TODO: masking + # if mask is not None: + # self.apply_score_masking(score, mask) + # attn_weights shape == (batch_size, max_length, 1) + + attn_weights = self.norm_func(score) + + # update attention states + if self.use_loc_attn: + states = (states[0] + attn_weights, attn_weights) + else: + states = () + + # context_vector shape after sum == (batch_size, hidden_size) + context_vector = tf.matmul(tf.expand_dims(attn_weights, axis=2), self.values, transpose_a=True, transpose_b=False) + context_vector = tf.squeeze(context_vector, axis=1) + return context_vector, attn_weights, states + + +# def _location_sensitive_score(processed_query, keys, processed_loc, attention_v, attention_b): +# dtype = processed_query.dtype +# num_units = keys.shape[-1].value or array_ops.shape(keys)[-1] +# return tf.reduce_sum(attention_v * tf.tanh(keys + processed_query + processed_loc + attention_b), [2]) + + +# class LocationSensitiveAttention(BahdanauAttention): +# def __init__(self, +# units, +# memory=None, +# memory_sequence_length=None, +# normalize=False, +# probability_fn="softmax", +# kernel_initializer="glorot_uniform", +# dtype=None, +# name="LocationSensitiveAttention", +# location_attention_filters=32, +# location_attention_kernel_size=31): + +# super(LocationSensitiveAttention, +# self).__init__(units=units, +# memory=memory, +# memory_sequence_length=memory_sequence_length, +# normalize=normalize, +# probability_fn='softmax', ## parent module default +# kernel_initializer=kernel_initializer, +# dtype=dtype, +# name=name) +# if probability_fn == 'sigmoid': +# self.probability_fn = lambda score, _: self._sigmoid_normalization(score) +# self.location_conv = keras.layers.Conv1D(filters=location_attention_filters, kernel_size=location_attention_kernel_size, padding='same', use_bias=False) +# self.location_dense = keras.layers.Dense(units, use_bias=False) +# # self.v = keras.layers.Dense(1, use_bias=True) + +# def _location_sensitive_score(self, processed_query, keys, processed_loc): +# processed_query = tf.expand_dims(processed_query, 1) +# return tf.reduce_sum(self.attention_v * tf.tanh(keys + processed_query + processed_loc), [2]) + +# def _location_sensitive(self, alignment_cum, alignment_old): +# alignment_cat = tf.stack([alignment_cum, alignment_old], axis=2) +# return self.location_dense(self.location_conv(alignment_cat)) + +# def _sigmoid_normalization(self, score): +# return tf.nn.sigmoid(score) / tf.reduce_sum(tf.nn.sigmoid(score), axis=-1, keepdims=True) + +# # def _apply_masking(self, score, mask): +# # padding_mask = tf.expand_dims(math_ops.logical_not(mask), 2) +# # # Bias so padding positions do not contribute to attention distribution. 
+# # score -= 1.e9 * math_ops.cast(padding_mask, dtype=tf.float32) +# # return score + +# def _calculate_attention(self, query, state): +# alignment_cum, alignment_old = state[:2] +# processed_query = self.query_layer( +# query) if self.query_layer else query +# processed_loc = self._location_sensitive(alignment_cum, alignment_old) +# score = self._location_sensitive_score( +# processed_query, +# self.keys, +# processed_loc) +# alignment = self.probability_fn(score, state) +# alignment_cum = alignment_cum + alignment +# state[0] = alignment_cum +# state[1] = alignment +# return alignment, state + +# def compute_context(self, alignments): +# expanded_alignments = tf.expand_dims(alignments, 1) +# context = tf.matmul(expanded_alignments, self.values) +# context = tf.squeeze(context, [1]) +# return context + +# # def call(self, query, state): +# # alignment, next_state = self._calculate_attention(query, state) +# # return alignment, next_state diff --git a/tf/layers/tacotron2.py b/tf/layers/tacotron2.py new file mode 100644 index 00000000..4d787e83 --- /dev/null +++ b/tf/layers/tacotron2.py @@ -0,0 +1,231 @@ + +import tensorflow as tf +from tensorflow import keras +from TTS.tf.utils.tf_utils import shape_list +from TTS.tf.layers.common_layers import Prenet, Attention +# from tensorflow_addons.seq2seq import AttentionWrapper + + +class ConvBNBlock(keras.layers.Layer): + def __init__(self, filters, kernel_size, activation, **kwargs): + super(ConvBNBlock, self).__init__(**kwargs) + self.convolution1d = keras.layers.Conv1D(filters, kernel_size, padding='same', name='convolution1d') + self.batch_normalization = keras.layers.BatchNormalization(axis=2, momentum=0.90, epsilon=1e-5, name='batch_normalization') + self.dropout = keras.layers.Dropout(rate=0.5, name='dropout') + self.activation = keras.layers.Activation(activation, name='activation') + + def call(self, x, training=None): + o = self.convolution1d(x) + o = self.batch_normalization(o, training=training) + o = self.activation(o) + o = self.dropout(o, training=training) + return o + + +class Postnet(keras.layers.Layer): + def __init__(self, output_filters, num_convs, **kwargs): + super(Postnet, self).__init__(**kwargs) + self.convolutions = [] + self.convolutions.append(ConvBNBlock(512, 5, 'tanh', name='convolutions_0')) + for idx in range(1, num_convs - 1): + self.convolutions.append(ConvBNBlock(512, 5, 'tanh', name=f'convolutions_{idx}')) + self.convolutions.append(ConvBNBlock(output_filters, 5, 'linear', name=f'convolutions_{idx+1}')) + + def call(self, x, training=None): + o = x + for layer in self.convolutions: + o = layer(o, training=training) + return o + + +class Encoder(keras.layers.Layer): + def __init__(self, output_input_dim, **kwargs): + super(Encoder, self).__init__(**kwargs) + self.convolutions = [] + for idx in range(3): + self.convolutions.append(ConvBNBlock(output_input_dim, 5, 'relu', name=f'convolutions_{idx}')) + self.lstm = keras.layers.Bidirectional(keras.layers.LSTM(output_input_dim // 2, return_sequences=True, use_bias=True), name='lstm') + + def call(self, x, training=None): + o = x + for layer in self.convolutions: + o = layer(o, training=training) + o = self.lstm(o) + return o + + +class Decoder(keras.layers.Layer): + def __init__(self, frame_dim, r, attn_type, use_attn_win, attn_norm, prenet_type, + prenet_dropout, use_forward_attn, use_trans_agent, use_forward_attn_mask, + use_location_attn, attn_K, separate_stopnet, speaker_emb_dim, **kwargs): + super(Decoder, self).__init__(**kwargs) + self.frame_dim = frame_dim 
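+ # 'r' is the reduction factor: the decoder emits r frames of frame_dim values per decoding step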
+ self.r_init = tf.constant(r, dtype=tf.int32) + self.r = tf.constant(r, dtype=tf.int32) + self.separate_stopnet = separate_stopnet + self.max_decoder_steps = tf.constant(1000, dtype=tf.int32) + self.stop_thresh = tf.constant(0.5, dtype=tf.float32) + + # model dimensions + self.query_dim = 1024 + self.decoder_rnn_dim = 1024 + self.prenet_dim = 256 + self.attn_dim = 128 + self.p_attention_dropout = 0.1 + self.p_decoder_dropout = 0.1 + + self.prenet = Prenet(prenet_type, + prenet_dropout, + [self.prenet_dim, self.prenet_dim], + bias=False, + name='prenet') + self.attention_rnn = keras.layers.LSTMCell(self.query_dim, use_bias=True, name=f'{self.name}/attention_rnn', ) + self.attention_rnn_dropout = keras.layers.Dropout(0.5) + + # TODO: implement other attn options + self.attention = Attention(attn_dim=self.attn_dim, + use_loc_attn=True, + loc_attn_n_filters=32, + loc_attn_kernel_size=31, + use_windowing=False, + norm=attn_norm, + use_forward_attn=use_forward_attn, + use_trans_agent=use_trans_agent, + use_forward_attn_mask=use_forward_attn_mask, + name='attention') + self.decoder_rnn = keras.layers.LSTMCell(self.decoder_rnn_dim, use_bias=True, name=f'{self.name}/decoder_rnn') + self.decoder_rnn_dropout = keras.layers.Dropout(0.5) + self.linear_projection = keras.layers.Dense(self.frame_dim * r, name=f'{self.name}/linear_projection/linear_layer') + self.stopnet = keras.layers.Dense(1, name=f'{self.name}/stopnet/linear_layer') + + + def set_max_decoder_steps(self, new_max_steps): + self.max_decoder_steps = tf.constant(new_max_steps, dtype=tf.int32) + + def set_r(self, new_r): + self.r = tf.constant(new_r, dtype=tf.int32) + + def build_decoder_initial_states(self, batch_size, memory_dim, memory_length): + zero_frame = tf.zeros([batch_size, self.frame_dim]) + zero_context = tf.zeros([batch_size, memory_dim]) + attention_rnn_state = self.attention_rnn.get_initial_state(batch_size=batch_size, dtype=tf.float32) + decoder_rnn_state = self.decoder_rnn.get_initial_state(batch_size=batch_size, dtype=tf.float32) + attention_states = self.attention.init_states(batch_size, memory_length) + return zero_frame, zero_context, attention_rnn_state, decoder_rnn_state, attention_states + + def step(self, prenet_next, states, + memory_seq_length=None, training=None): + _, context_next, attention_rnn_state, decoder_rnn_state, attention_states = states + attention_rnn_input = tf.concat([prenet_next, context_next], -1) + attention_rnn_output, attention_rnn_state = \ + self.attention_rnn(attention_rnn_input, + attention_rnn_state, training=training) + attention_rnn_output = self.attention_rnn_dropout(attention_rnn_output, training=training) + context, attention, attention_states = self.attention(attention_rnn_output, attention_states, training=training) + decoder_rnn_input = tf.concat([attention_rnn_output, context], -1) + decoder_rnn_output, decoder_rnn_state = \ + self.decoder_rnn(decoder_rnn_input, decoder_rnn_state, training=training) + decoder_rnn_output = self.decoder_rnn_dropout(decoder_rnn_output, training=training) + linear_projection_input = tf.concat([decoder_rnn_output, context], -1) + output_frame = self.linear_projection(linear_projection_input, training=training) + stopnet_input = tf.concat([decoder_rnn_output, output_frame], -1) + stopnet_output = self.stopnet(stopnet_input, training=training) + output_frame = output_frame[:, :self.r * self.frame_dim] + states = (output_frame[:, self.frame_dim * (self.r - 1):], context, attention_rnn_state, decoder_rnn_state, attention_states) + return output_frame, 
stopnet_output, states, attention + + def decode(self, memory, states, frames, memory_seq_length=None): + B, T, D = shape_list(memory) + num_iter = shape_list(frames)[1] // self.r + # init states + frame_zero = tf.expand_dims(states[0], 1) + frames = tf.concat([frame_zero, frames], axis=1) + outputs = tf.TensorArray(dtype=tf.float32, size=num_iter) + attentions = tf.TensorArray(dtype=tf.float32, size=num_iter) + stop_tokens = tf.TensorArray(dtype=tf.float32, size=num_iter) + # pre-computes + self.attention.process_values(memory) + prenet_output = self.prenet(frames, training=True) + step_count = tf.constant(0, dtype=tf.int32) + + def _body(step, memory, prenet_output, states, outputs, stop_tokens, attentions): + prenet_next = prenet_output[:, step] + output, stop_token, states, attention = self.step(prenet_next, + states, + memory_seq_length) + outputs = outputs.write(step, output) + attentions = attentions.write(step, attention) + stop_tokens = stop_tokens.write(step, stop_token) + return step + 1, memory, prenet_output, states, outputs, stop_tokens, attentions + _, memory, _, states, outputs, stop_tokens, attentions = \ + tf.while_loop(lambda *arg: True, + _body, + loop_vars=(step_count, memory, prenet_output, states, outputs, + stop_tokens, attentions), + parallel_iterations=32, + swap_memory=True, + maximum_iterations=num_iter) + + outputs = outputs.stack() + attentions = attentions.stack() + stop_tokens = stop_tokens.stack() + outputs = tf.transpose(outputs, [1, 0, 2]) + attentions = tf.transpose(attentions, [1, 0 ,2]) + stop_tokens = tf.transpose(stop_tokens, [1, 0, 2]) + stop_tokens = tf.squeeze(stop_tokens, axis=2) + outputs = tf.reshape(outputs, [B, -1, self.frame_dim]) + return outputs, stop_tokens, attentions + + def decode_inference(self, memory, states): + B, T, D = shape_list(memory) + # init states + outputs = tf.TensorArray(dtype=tf.float32, size=0, clear_after_read=False, dynamic_size=True) + attentions = tf.TensorArray(dtype=tf.float32, size=0, clear_after_read=False, dynamic_size=True) + stop_tokens = tf.TensorArray(dtype=tf.float32, size=0, clear_after_read=False, dynamic_size=True) + # pre-computes + self.attention.process_values(memory) + + # iter vars + stop_flag = tf.constant(False, dtype=tf.bool) + step_count = tf.constant(0, dtype=tf.int32) + + def _body(step, memory, states, outputs, stop_tokens, attentions, stop_flag): + frame_next = states[0] + prenet_next = self.prenet(frame_next, training=False) + output, stop_token, states, attention = self.step(prenet_next, + states, + None, + training=False) + stop_token = tf.math.sigmoid(stop_token) + outputs = outputs.write(step, output) + attentions = attentions.write(step, attention) + stop_tokens = stop_tokens.write(step, stop_token) + stop_flag = tf.greater(stop_token, self.stop_thresh) + stop_flag = tf.reduce_all(stop_flag) + return step + 1, memory, states, outputs, stop_tokens, attentions, stop_flag + + cond = lambda step, m, s, o, st, a, stop_flag: tf.equal(stop_flag, tf.constant(False, dtype=tf.bool)) + _, memory, states, outputs, stop_tokens, attentions, stop_flag = \ + tf.while_loop(cond, + _body, + loop_vars=(step_count, memory, states, outputs, + stop_tokens, attentions, stop_flag), + parallel_iterations=32, + swap_memory=True, + maximum_iterations=self.max_decoder_steps) + + outputs = outputs.stack() + attentions = attentions.stack() + stop_tokens = stop_tokens.stack() + + outputs = tf.transpose(outputs, [1, 0, 2]) + attentions = tf.transpose(attentions, [1, 0, 2]) + stop_tokens = tf.transpose(stop_tokens, 
[1, 0, 2]) + stop_tokens = tf.squeeze(stop_tokens, axis=2) + outputs = tf.reshape(outputs, [B, -1, self.frame_dim]) + return outputs, stop_tokens, attentions + + def call(self, memory, states, frames=None, memory_seq_length=None, training=False): + if training: + return self.decode(memory, states, frames, memory_seq_length) + return self.decode_inference(memory, states) \ No newline at end of file diff --git a/tf/models/tacotron2.py b/tf/models/tacotron2.py new file mode 100644 index 00000000..8ddee666 --- /dev/null +++ b/tf/models/tacotron2.py @@ -0,0 +1,72 @@ +import tensorflow as tf +from tensorflow import keras + +from TTS.tf.layers.tacotron2 import Encoder, Decoder, Postnet +from TTS.tf.utils.tf_utils import shape_list + + +class Tacotron2(keras.models.Model): + def __init__(self, + num_chars, + num_speakers, + r, + postnet_output_dim=80, + decoder_output_dim=80, + attn_type='original', + attn_win=False, + attn_norm="softmax", + attn_K=4, + prenet_type="original", + prenet_dropout=True, + forward_attn=False, + trans_agent=False, + forward_attn_mask=False, + location_attn=True, + separate_stopnet=True, + bidirectional_decoder=False): + super(Tacotron2, self).__init__() + self.r = r + self.decoder_output_dim = decoder_output_dim + self.postnet_output_dim = postnet_output_dim + self.bidirectional_decoder = bidirectional_decoder + self.num_speakers = num_speakers + self.speaker_embed_dim = 256 + + self.embedding = keras.layers.Embedding(num_chars, 512, name='embedding') + self.encoder = Encoder(512, name='encoder') + # TODO: most of the decoder args have no use at the moment + self.decoder = Decoder(decoder_output_dim, r, attn_type=attn_type, use_attn_win=attn_win, attn_norm=attn_norm, prenet_type=prenet_type, + prenet_dropout=prenet_dropout, use_forward_attn=forward_attn, use_trans_agent=trans_agent, use_forward_attn_mask=forward_attn_mask, + use_location_attn=location_attn, attn_K=attn_K, separate_stopnet=separate_stopnet, speaker_emb_dim=self.speaker_embed_dim) + self.postnet = Postnet(postnet_output_dim, 5, name='postnet') + + def call(self, characters, text_lengths=None, frames=None, training=None): + if training == True: + return self.training(characters, text_lengths, frames) + else: + return self.inference(characters) + + def training(self, characters, text_lengths, frames): + B, T = shape_list(characters) + embedding_vectors = self.embedding(characters, training=True) + encoder_output = self.encoder(embedding_vectors, training=True) + decoder_states = self.decoder.build_decoder_initial_states(B, 512, T) + decoder_frames, stop_tokens, attentions = self.decoder(encoder_output, decoder_states, frames, text_lengths, training=True) + postnet_frames = self.postnet(decoder_frames, training=True) + output_frames = decoder_frames + postnet_frames + return decoder_frames, output_frames, attentions, stop_tokens + + def inference(self, characters): + B, T = shape_list(characters) + embedding_vectors = self.embedding(characters, training=False) + encoder_output = self.encoder(embedding_vectors, training=False) + decoder_states = self.decoder.build_decoder_initial_states(B, 512, T) + decoder_frames, stop_tokens, attentions = self.decoder(encoder_output, decoder_states, training=False) + postnet_frames = self.postnet(decoder_frames, training=False) + output_frames = decoder_frames + postnet_frames + print(output_frames.shape) + return decoder_frames, output_frames, attentions, stop_tokens + + + + diff --git a/tf/notebooks/Benchmark-TTS_tf.ipynb b/tf/notebooks/Benchmark-TTS_tf.ipynb new file 
mode 100644 index 00000000..5531460e --- /dev/null +++ b/tf/notebooks/Benchmark-TTS_tf.ipynb @@ -0,0 +1,708 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "Collapsed": "false" + }, + "source": [ + "This is to test TTS models with benchmark sentences for speech synthesis.\n", + "\n", + "Before running this script please DON'T FORGET: \n", + "- to set file paths.\n", + "- to download related model files from TTS and PWGAN.\n", + "- download or clone related repos, linked below.\n", + "- setup the repositories. ```python setup.py install```\n", + "- to check out the right commit versions (given next to the model) of TTS and PWGAN.\n", + "- to set the right paths in the cell below.\n", + "\n", + "Repositories:\n", + "- TTS: https://github.com/mozilla/TTS\n", + "- PWGAN: https://github.com/erogol/ParallelWaveGAN" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "Collapsed": "false", + "scrolled": true + }, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2\n", + "import os\n", + "\n", + "# you may need to change this depending on your system\n", + "os.environ['CUDA_VISIBLE_DEVICES']='1'\n", + "\n", + "import sys\n", + "import io\n", + "import torch \n", + "import tensorflow as tf\n", + "print(tf.config.list_physical_devices('GPU'))\n", + "\n", + "import time\n", + "import json\n", + "import yaml\n", + "import numpy as np\n", + "from collections import OrderedDict\n", + "import matplotlib.pyplot as plt\n", + "plt.rcParams[\"figure.figsize\"] = (16,5)\n", + "\n", + "import librosa\n", + "import librosa.display\n", + "\n", + "from TTS.tf.models.tacotron2 import Tacotron2\n", + "from TTS.tf.utils.generic_utils import setup_model, load_checkpoint\n", + "from TTS.utils.audio import AudioProcessor\n", + "from TTS.utils.io import load_config\n", + "from TTS.utils.synthesis import synthesis\n", + "from TTS.utils.visual import visualize\n", + "\n", + "import IPython\n", + "from IPython.display import Audio\n", + "\n", + "%matplotlib agg" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "Collapsed": "false" + }, + "outputs": [], + "source": [ + "def tts(model, text, CONFIG, use_cuda, ap, use_gl, figures=True):\n", + " t_1 = time.time()\n", + " waveform, alignment, mel_spec, mel_postnet_spec, stop_tokens, inputs = synthesis(model, text, CONFIG, use_cuda, ap, None, None, False, CONFIG.enable_eos_bos_chars, use_gl, backend=BACKEND)\n", + " if CONFIG.model == \"Tacotron\" and not use_gl:\n", + " # correct the normalization differences b/w TTS and the Vocoder.\n", + " mel_postnet_spec = ap.out_linear_to_mel(mel_postnet_spec.T).T\n", + " print(mel_postnet_spec.shape)\n", + " print(\"max- \", mel_postnet_spec.max(), \" -- min- \", mel_postnet_spec.min())\n", + " if not use_gl:\n", + " waveform = vocoder_model.inference(torch.FloatTensor(mel_postnet_spec.T).unsqueeze(0))\n", + " mel_postnet_spec = ap._denormalize(mel_postnet_spec.T).T\n", + " if use_cuda and not use_gl:\n", + " waveform = waveform.cpu()\n", + " waveform = waveform.numpy()\n", + " waveform = waveform.squeeze()\n", + " rtf = (time.time() - t_1) / (len(waveform) / ap.sample_rate)\n", + " print(waveform.shape)\n", + " print(\" > Run-time: {}\".format(time.time() - t_1))\n", + " print(\" > Real-time factor: {}\".format(rtf))\n", + " if figures: \n", + " visualize(alignment, mel_postnet_spec, stop_tokens, text, ap.hop_length, CONFIG, ap._denormalize(mel_spec.T).T) \n", + " IPython.display.display(Audio(waveform, rate=CONFIG.audio['sample_rate'], 
normalize=True)) \n", + " os.makedirs(OUT_FOLDER, exist_ok=True)\n", + " file_name = text.replace(\" \", \"_\").replace(\".\",\"\") + \".wav\"\n", + " out_path = os.path.join(OUT_FOLDER, file_name)\n", + " ap.save_wav(waveform, out_path)\n", + " return alignment, mel_postnet_spec, stop_tokens, waveform" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "Collapsed": "false" + }, + "outputs": [], + "source": [ + "# Set constants\n", + "ROOT_PATH = '../tf_model/'\n", + "MODEL_PATH = ROOT_PATH + '/tts_tf_checkpoint_360000.pkl'\n", + "CONFIG_PATH = ROOT_PATH + '/config.json'\n", + "OUT_FOLDER = '/home/erogol/Dropbox/AudioSamples/benchmark_samples/'\n", + "CONFIG = load_config(CONFIG_PATH)\n", + "# Run FLAGs\n", + "use_cuda = True\n", + "# Set the vocoder\n", + "use_gl = True # use GL if True\n", + "BACKEND = 'tf'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "Collapsed": "false", + "scrolled": true + }, + "outputs": [], + "source": [ + "from TTS.utils.text.symbols import symbols, phonemes, make_symbols\n", + "from TTS.tf.utils.convert_torch_to_tf_utils import tf_create_dummy_inputs\n", + "c = CONFIG\n", + "num_speakers = 0\n", + "r = 1\n", + "num_chars = len(phonemes) if c.use_phonemes else len(symbols)\n", + "model = setup_model(num_chars, num_speakers, c)\n", + "\n", + "# before loading weights you need to run the model once to generate the variables\n", + "input_ids, input_lengths, mel_outputs, mel_lengths = tf_create_dummy_inputs()\n", + "mel_pred = model(input_ids, training=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "Collapsed": "false" + }, + "outputs": [], + "source": [ + "model = load_checkpoint(model, MODEL_PATH)\n", + "# model = tf.function(model, experimental_relax_shapes=True)\n", + "ap = AudioProcessor(**CONFIG.audio) " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "Collapsed": "false" + }, + "outputs": [], + "source": [ + "# wrapper class to use tf.function\n", + "class ModelInference(tf.keras.Model):\n", + " def __init__(self, model):\n", + " super(ModelInference, self).__init__()\n", + " self.model = model\n", + " \n", + " @tf.function(input_signature=[tf.TensorSpec(shape=(None, None), dtype=tf.int32)])\n", + " def call(self, characters):\n", + " return self.model(characters, training=False)\n", + " \n", + "model = ModelInference(model)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "Collapsed": "false" + }, + "outputs": [], + "source": [ + "# LOAD VOCODER (MelGAN via the PWGAN repo)\n", + "if use_gl == False:\n", + " from parallel_wavegan.models import ParallelWaveGANGenerator, MelGANGenerator\n", + " \n", + " vocoder_model = MelGANGenerator(**VOCODER_CONFIG[\"generator_params\"])\n", + " vocoder_model.load_state_dict(torch.load(VOCODER_MODEL_PATH, map_location=\"cpu\")[\"model\"][\"generator\"])\n", + " vocoder_model.remove_weight_norm()\n", + " ap_vocoder = AudioProcessor(**VOCODER_CONFIG['audio']) \n", + " if use_cuda:\n", + " vocoder_model.cuda()\n", + " vocoder_model.eval();\n", + " print(count_parameters(vocoder_model))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "Collapsed": "false" + }, + "source": [ + "### Comparison with https://mycroft.ai/blog/available-voices/" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "Collapsed": "false" + }, + "outputs": [], + "source": [ + "sentence = \"Bill got in the habit of asking himself “Is that thought true?” and if he wasn’t 
absolutely certain it was, he just let it go.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "Collapsed": "false" + }, + "source": [ + "### https://espnet.github.io/icassp2020-tts/" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "Collapsed": "false" + }, + "outputs": [], + "source": [ + "sentence = \"The Commission also recommends\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "Collapsed": "false" + }, + "outputs": [], + "source": [ + "sentence = \"As a result of these studies, the planning document submitted by the Secretary of the Treasury to the Bureau of the Budget on August thirty-one.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "Collapsed": "false" + }, + "outputs": [], + "source": [ + "sentence = \"The FBI now transmits information on all defectors, a category which would, of course, have included Oswald.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "Collapsed": "false" + }, + "outputs": [], + "source": [ + "sentence = \"they seem unduly restrictive in continuing to require some manifestation of animus against a Government official.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "Collapsed": "false" + }, + "outputs": [], + "source": [ + "sentence = \"and each agency given clear understanding of the assistance which the Secret Service expects.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "Collapsed": "false" + }, + "source": [ + "### Other examples" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "Collapsed": "false" + }, + "outputs": [], + "source": [ + "sentence = \"Be a voice, not an echo.\" # 'echo' is not in training set. \n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "Collapsed": "false" + }, + "outputs": [], + "source": [ + "sentence = \"It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "Collapsed": "false" + }, + "outputs": [], + "source": [ + "sentence = \"The human voice is the most perfect instrument of all.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "Collapsed": "false" + }, + "outputs": [], + "source": [ + "sentence = \"I'm sorry Dave. 
I'm afraid I can't do that.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "Collapsed": "false" + }, + "outputs": [], + "source": [ + "sentence = \"This cake is great. It's so delicious and moist.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "Collapsed": "false" + }, + "source": [ + "### Comparison with https://keithito.github.io/audio-samples/" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "Collapsed": "false" + }, + "outputs": [], + "source": [ + "sentence = \"Generative adversarial network or variational auto-encoder.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "Collapsed": "false" + }, + "outputs": [], + "source": [ + "sentence = \"Scientists at the CERN laboratory say they have discovered a new particle.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "Collapsed": "false" + }, + "outputs": [], + "source": [ + "sentence = \"Here’s a way to measure the acute emotional intelligence that has never gone out of style.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "Collapsed": "false" + }, + "outputs": [], + "source": [ + "sentence = \"President Trump met with other leaders at the Group of 20 conference.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "Collapsed": "false" + }, + "outputs": [], + "source": [ + "sentence = \"The buses aren't the problem, they actually provide a solution.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "Collapsed": "false" + }, + "source": [ + "### Comparison with https://google.github.io/tacotron/publications/tacotron/index.html" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "Collapsed": "false" + }, + "outputs": [], + "source": [ + "sentence = \"Generative adversarial network or variational auto-encoder.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "Collapsed": "false" + }, + "outputs": [], + "source": [ + "sentence = \"Basilar membrane and otolaryngology are not auto-correlations.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "Collapsed": "false" + }, + "outputs": [], + "source": [ + "sentence = \" He has read the whole thing.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + 
"Collapsed": "false" + }, + "outputs": [], + "source": [ + "sentence = \"He reads books.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "Collapsed": "false" + }, + "outputs": [], + "source": [ + "sentence = \"Thisss isrealy awhsome.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "Collapsed": "false" + }, + "outputs": [], + "source": [ + "sentence = \"This is your internet browser, Firefox.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "Collapsed": "false" + }, + "outputs": [], + "source": [ + "sentence = \"This is your internet browser Firefox.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "Collapsed": "false" + }, + "outputs": [], + "source": [ + "sentence = \"The quick brown fox jumps over the lazy dog.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "Collapsed": "false" + }, + "outputs": [], + "source": [ + "sentence = \"Does the quick brown fox jump over the lazy dog?\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "Collapsed": "false" + }, + "outputs": [], + "source": [ + "sentence = \"Eren, how are you?\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "Collapsed": "false" + }, + "source": [ + "### Hard Sentences" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "Collapsed": "false" + }, + "outputs": [], + "source": [ + "sentence = \"Encouraged, he started with a minute a day.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "Collapsed": "false" + }, + "outputs": [], + "source": [ + "sentence = \"His meditation consisted of “body scanning” which involved focusing his mind and energy on each section of the body from head to toe .\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "Collapsed": "false" + }, + "outputs": [], + "source": [ + "sentence = \"Recent research at Harvard has shown meditating for as little as 8 weeks can actually increase the grey matter in the parts of the brain responsible for emotional regulation and learning . 
\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "Collapsed": "false" + }, + "outputs": [], + "source": [ + "sentence = \"If he decided to watch TV he really watched it.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "Collapsed": "false" + }, + "outputs": [], + "source": [ + "sentence = \"Often we try to bring about change through sheer effort and we put all of our energy into a new initiative .\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "Collapsed": "false" + }, + "outputs": [], + "source": [ + "# for twb dataset\n", + "sentence = \"In our preparation for Easter, God in his providence offers us each year the season of Lent as a sacramental sign of our conversion.\"\n", + "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "Collapsed": "false" + }, + "outputs": [], + "source": [ + "wavs = []\n", + "model.eval()\n", + "model.decoder.prenet.eval()\n", + "model.decoder.max_decoder_steps = 2000\n", + "# model.decoder.prenet.train()\n", + "speaker_id = None\n", + "sentence = '''This is App Store Optimization report.\n", + "The first tab on the report is App Details. App details report is updated weekly and Datetime column shows the latest report update date. The widget displays the app icon, respective app version, visual assets on the store, app description, latest app update date on the Appstore/Google PlayStore and what’s new section.\n", + "In App Details tab, you can see not only your app but all Delivery Hero apps since we think it can be inspiring to see the other apps, their description and screenshots. \n", + "Product name is the actual app name on the AppStore or Google Play Store.\n", + "Screenshot URLs column display the actual screenshots on the store for the current version. No resizing is done. If you click on the screenshot, you can see it in full-size.\n", + "Current release date show the latest app update date when the query is run. Here we see that Appetito24 Android is updated to app version 4.6.3.2 on 28th of March.\n", + "If the description is too long, clarisights is not able to display the full description; however, if you select description and current_release_date cells to copy and paste it to a text editor, you'll see the full description.\n", + "If you scroll down in the widget, you can see the older app versions for the same apps. 
Or you can filter Datetime to see a specific timeframe and the apps’ Store presence back then.\n", + "You can also filter for a specific app using Product Name.\n", + "If the description is too long, clarisights is not able to display the full description; however, if you select description and current_release_date cells to copy and paste it to a text editor, you'll see the full description.\n", + "'''\n", + "\n", + "for s in sentence.split('\\n'):\n", + " print(s)\n", + " align, spec, stop_tokens, wav = tts(model, s, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)\n", + " wavs = np.concatenate([wavs, np.zeros(int(ap.sample_rate * 0.5)), wav])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "Collapsed": "false" + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.4" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/tf/requirements b/tf/requirements new file mode 100644 index 00000000..75882a1d --- /dev/null +++ b/tf/requirements @@ -0,0 +1,2 @@ +fuzzywuzzy +tensorflow>=2.2.0 \ No newline at end of file diff --git a/tf/utils/convert_torch_to_tf_utils.py b/tf/utils/convert_torch_to_tf_utils.py new file mode 100644 index 00000000..732f2fb5 --- /dev/null +++ b/tf/utils/convert_torch_to_tf_utils.py @@ -0,0 +1,83 @@ +import numpy as np +import torch +import re +import tensorflow as tf +import tensorflow.keras.backend as K + + +def tf_create_dummy_inputs(): + """ Create dummy inputs for TF Tacotron2 model """ + batch_size = 4 + max_input_length = 32 + max_mel_length = 128 + pad = 1 + n_chars = 24 + input_ids = tf.random.uniform([batch_size, max_input_length + pad], maxval=n_chars, dtype=tf.int32) + input_lengths = np.random.randint(0, high=max_input_length+1 + pad, size=[batch_size]) + input_lengths[-1] = max_input_length + input_lengths = tf.convert_to_tensor(input_lengths, dtype=tf.int32) + mel_outputs = tf.random.uniform(shape=[batch_size, max_mel_length + pad, 80]) + mel_lengths = np.random.randint(0, high=max_mel_length+1 + pad, size=[batch_size]) + mel_lengths[-1] = max_mel_length + mel_lengths = tf.convert_to_tensor(mel_lengths, dtype=tf.int32) + return input_ids, input_lengths, mel_outputs, mel_lengths + + +def compare_torch_tf(torch_tensor, tf_tensor): + """ Compute the average absolute difference b/w torch and tf tensors """ + return abs(torch_tensor.detach().numpy() - tf_tensor.numpy()).mean() + + +def convert_tf_name(tf_name): + """ Convert certain patterns in TF layer names to Torch patterns """ + tf_name_tmp = tf_name + tf_name_tmp = tf_name_tmp.replace(':0', '') + tf_name_tmp = tf_name_tmp.replace('/forward_lstm/lstm_cell_1/recurrent_kernel', '/weight_hh_l0') + tf_name_tmp = tf_name_tmp.replace('/forward_lstm/lstm_cell_2/kernel', '/weight_ih_l1') + tf_name_tmp = tf_name_tmp.replace('/recurrent_kernel', '/weight_hh') + tf_name_tmp = tf_name_tmp.replace('/kernel', '/weight') + tf_name_tmp = tf_name_tmp.replace('/gamma', '/weight') + tf_name_tmp = tf_name_tmp.replace('/beta', '/bias') + tf_name_tmp = tf_name_tmp.replace('/', '.') + return tf_name_tmp + + +def transfer_weights_torch_to_tf(tf_vars, var_map_dict, state_dict): + """ Transfer weights from torch state_dict to TF variables """ + 
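# var_map_dict maps each TF variable name to a torch state_dict key; a tuple of two keys marks a fused LSTM bias (ih + hh) +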
print(" > Passing weights from Torch to TF ...") + for tf_var in tf_vars: + torch_var_name = var_map_dict[tf_var.name] + print(f' | > {tf_var.name} <-- {torch_var_name}') + # if tuple, it is a bias variable + if type(torch_var_name) is not tuple: + torch_layer_name = '.'.join(torch_var_name.split('.')[-2:]) + torch_weight = state_dict[torch_var_name] + if 'convolution1d/kernel' in tf_var.name or 'conv1d/kernel' in tf_var.name: + # out_dim, in_dim, filter -> filter, in_dim, out_dim + numpy_weight = torch_weight.permute([2, 1, 0]).detach().cpu().numpy() + elif 'lstm_cell' in tf_var.name and 'kernel' in tf_var.name: + numpy_weight = torch_weight.transpose(0, 1).detach().cpu().numpy() + # if variable is for bidirectional lstm and it is a bias vector there + # needs to be pre-defined two matching torch bias vectors + elif '_lstm/lstm_cell_' in tf_var.name and 'bias' in tf_var.name: + bias_vectors = [value for key, value in state_dict.items() if key in torch_var_name] + assert len(bias_vectors) == 2 + numpy_weight = bias_vectors[0] + bias_vectors[1] + elif 'rnn' in tf_var.name and 'kernel' in tf_var.name: + numpy_weight = torch_weight.transpose(0, 1).detach().cpu().numpy() + elif 'rnn' in tf_var.name and 'bias' in tf_var.name: + bias_vectors = [value for key, value in state_dict.items() if torch_var_name[:-2] in key] + assert len(bias_vectors) == 2 + numpy_weight = bias_vectors[0] + bias_vectors[1] + elif 'linear_layer' in torch_layer_name and 'weight' in torch_var_name: + numpy_weight = torch_weight.transpose(0, 1).detach().cpu().numpy() + else: + numpy_weight = torch_weight.detach().cpu().numpy() + assert np.all(tf_var.shape == numpy_weight.shape), f" [!] weight shapes does not match: {tf_var.name} vs {torch_var_name} --> {tf_var.shape} vs {numpy_weight.shape}" + tf.keras.backend.set_value(tf_var, numpy_weight) + + +def load_tf_vars(model_tf, tf_vars): + for tf_var in tf_vars: + model_tf.get_layer(tf_var.name).set_weights(tf_var) + return model_tf diff --git a/tf/utils/generic_utils.py b/tf/utils/generic_utils.py new file mode 100644 index 00000000..3ef10a62 --- /dev/null +++ b/tf/utils/generic_utils.py @@ -0,0 +1,105 @@ +import os +import re +import glob +import shutil +import datetime +import json +import subprocess +import importlib +import pickle +import numpy as np +from collections import OrderedDict, Counter +import tensorflow as tf + + +def save_checkpoint(model, optimizer, current_step, epoch, r, output_folder, **kwargs): + checkpoint_path = 'tts_tf_checkpoint_{}.pkl'.format(current_step) + checkpoint_path = os.path.join(output_folder, checkpoint_path) + state = { + 'model': model.weights, + 'optimizer': optimizer, + 'step': current_step, + 'epoch': epoch, + 'date': datetime.date.today().strftime("%B %d, %Y"), + 'r': r + } + state.update(kwargs) + pickle.dump(state, open(checkpoint_path, 'wb')) + + +def load_checkpoint(model, checkpoint_path): + checkpoint = pickle.load(open(checkpoint_path, 'rb')) + chkp_var_dict = dict([(var.name, var.numpy()) for var in checkpoint['model']]) + tf_vars = model.weights + for tf_var in tf_vars: + layer_name = tf_var.name + chkp_var_value = chkp_var_dict[layer_name] + tf.keras.backend.set_value(tf_var, chkp_var_value) + if 'r' in checkpoint.keys(): + model.decoder.set_r(checkpoint['r']) + return model + + +def sequence_mask(sequence_length, max_len=None): + if max_len is None: + max_len = sequence_length.max() + batch_size = sequence_length.size(0) + seq_range = np.empty([0, max_len], dtype=np.int8) + seq_range_expand = 
+
+
+# @tf.custom_gradient
+def check_gradient(x, grad_clip):
+    x_normed = tf.clip_by_norm(x, grad_clip)
+    grad_norm = tf.norm(x)
+    return x_normed, grad_norm
+
+
+def count_parameters(model, c):
+    try:
+        return model.count_params()
+    except:
+        input_dummy = tf.convert_to_tensor(np.random.rand(8, 128).astype('int32'))
+        input_lengths = np.random.randint(100, 129, (8, ))
+        input_lengths[-1] = 128
+        input_lengths = tf.convert_to_tensor(input_lengths.astype('int32'))
+        mel_spec = np.random.rand(8, 2 * c.r,
+                                  c.audio['num_mels']).astype('float32')
+        mel_spec = tf.convert_to_tensor(mel_spec)
+        speaker_ids = np.random.randint(
+            0, 5, (8, )) if c.use_speaker_embedding else None
+        _ = model(input_dummy, input_lengths, mel_spec)
+        return model.count_params()
+
+
+def setup_model(num_chars, num_speakers, c):
+    print(" > Using model: {}".format(c.model))
+    MyModel = importlib.import_module('TTS.tf.models.' + c.model.lower())
+    MyModel = getattr(MyModel, c.model)
+    if c.model.lower() in "tacotron":
+        raise NotImplemented(' [!] Tacotron model is not ready.')
+    elif c.model.lower() == "tacotron2":
+        model = MyModel(num_chars=num_chars,
+                        num_speakers=num_speakers,
+                        r=c.r,
+                        postnet_output_dim=c.audio['num_mels'],
+                        decoder_output_dim=c.audio['num_mels'],
+                        attn_type=c.attention_type,
+                        attn_win=c.windowing,
+                        attn_norm=c.attention_norm,
+                        prenet_type=c.prenet_type,
+                        prenet_dropout=c.prenet_dropout,
+                        forward_attn=c.use_forward_attn,
+                        trans_agent=c.transition_agent,
+                        forward_attn_mask=c.forward_attn_mask,
+                        location_attn=c.location_attn,
+                        attn_K=c.attention_heads,
+                        separate_stopnet=c.separate_stopnet,
+                        bidirectional_decoder=c.bidirectional_decoder)
+    return model
diff --git a/tf/utils/tf_utils.py b/tf/utils/tf_utils.py
new file mode 100644
index 00000000..558936d5
--- /dev/null
+++ b/tf/utils/tf_utils.py
@@ -0,0 +1,8 @@
+import tensorflow as tf
+
+
+def shape_list(x):
+    """Deal with dynamic shape in tensorflow cleanly."""
+    static = x.shape.as_list()
+    dynamic = tf.shape(x)
+    return [dynamic[i] if s is None else s for i, s in enumerate(static)]
From 88053706450b3e964a972a209d3bc66270a9f7b6 Mon Sep 17 00:00:00 2001
From: erogol
Date: Mon, 18 May 2020 11:34:13 +0200
Subject: [PATCH 079/104] add tf tacotron2 test and edit test utils imports
 after utils refactoring

---
 tests/test_demo_server.py        |  3 +-
 tests/test_loader.py             |  2 +-
 tests/test_tacotron2_model.py    |  2 +-
 tests/test_tacotron2_tf_model.py | 59 ++++++++++++++++++++++++++++++++
 tests/test_tacotron_model.py     |  2 +-
 tests/test_text_processing.py    |  4 +--
 6 files changed, 66 insertions(+), 6 deletions(-)
 create mode 100644 tests/test_tacotron2_tf_model.py

diff --git a/tests/test_demo_server.py b/tests/test_demo_server.py
index a0837686..11d16a45 100644
--- a/tests/test_demo_server.py
+++ b/tests/test_demo_server.py
@@ -6,7 +6,8 @@ import torch as T
 from TTS.server.synthesizer import Synthesizer
 from TTS.tests import get_tests_input_path, get_tests_output_path
 from TTS.utils.text.symbols import make_symbols, phonemes, symbols
-from TTS.utils.generic_utils import load_config, save_checkpoint, setup_model
+from TTS.utils.generic_utils import setup_model
+from TTS.utils.io import load_config, save_checkpoint


 class DemoServerTest(unittest.TestCase):
diff --git 
a/tests/test_loader.py b/tests/test_loader.py index 447c7b38..9edd233f 100644 --- a/tests/test_loader.py +++ b/tests/test_loader.py @@ -5,7 +5,7 @@ import torch import numpy as np from torch.utils.data import DataLoader -from TTS.utils.generic_utils import load_config +from TTS.utils.io import load_config from TTS.utils.audio import AudioProcessor from TTS.datasets import TTSDataset from TTS.datasets.preprocess import ljspeech diff --git a/tests/test_tacotron2_model.py b/tests/test_tacotron2_model.py index aa2869eb..eb91b3cc 100644 --- a/tests/test_tacotron2_model.py +++ b/tests/test_tacotron2_model.py @@ -6,7 +6,7 @@ import numpy as np from torch import optim from torch import nn -from TTS.utils.generic_utils import load_config +from TTS.utils.io import load_config from TTS.layers.losses import MSELossMasked from TTS.models.tacotron2 import Tacotron2 diff --git a/tests/test_tacotron2_tf_model.py b/tests/test_tacotron2_tf_model.py new file mode 100644 index 00000000..27398748 --- /dev/null +++ b/tests/test_tacotron2_tf_model.py @@ -0,0 +1,59 @@ +import os +import copy +import torch +import unittest +import numpy as np +import tensorflow as tf + +from torch import optim +from torch import nn +from TTS.utils.io import load_config +from TTS.layers.losses import MSELossMasked +from TTS.tf.models.tacotron2 import Tacotron2 + +#pylint: disable=unused-variable + +torch.manual_seed(1) +use_cuda = torch.cuda.is_available() +device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") + +file_path = os.path.dirname(os.path.realpath(__file__)) +c = load_config(os.path.join(file_path, 'test_config.json')) + + +class TacotronTFTrainTest(unittest.TestCase): + def test_train_step(self): + ''' test forward pass ''' + input = torch.randint(0, 24, (8, 128)).long().to(device) + input_lengths = torch.randint(100, 128, (8, )).long().to(device) + input_lengths = torch.sort(input_lengths, descending=True)[0] + mel_spec = torch.rand(8, 30, c.audio['num_mels']).to(device) + mel_postnet_spec = torch.rand(8, 30, c.audio['num_mels']).to(device) + mel_lengths = torch.randint(20, 30, (8, )).long().to(device) + stop_targets = torch.zeros(8, 30, 1).float().to(device) + speaker_ids = torch.randint(0, 5, (8, )).long().to(device) + + input = tf.convert_to_tensor(input.cpu().numpy()) + input_lengths = tf.convert_to_tensor(input_lengths.cpu().numpy()) + mel_spec = tf.convert_to_tensor(mel_spec.cpu().numpy()) + + for idx in mel_lengths: + stop_targets[:, int(idx.item()):, 0] = 1.0 + + stop_targets = stop_targets.view(input.shape[0], + stop_targets.size(1) // c.r, -1) + stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze() + + model = Tacotron2(num_chars=24, r=c.r, num_speakers=5) + # training pass + output = model(input, input_lengths, mel_spec, training=True) + + # check model output shapes + assert np.all(output[0].shape == mel_spec.shape) + assert np.all(output[1].shape == mel_spec.shape) + assert output[2].shape[2] == input.shape[1] + assert output[2].shape[1] == (mel_spec.shape[1] // model.decoder.r) + assert output[3].shape[1] == (mel_spec.shape[1] // model.decoder.r) + + # inference pass + output = model(input, training=False) diff --git a/tests/test_tacotron_model.py b/tests/test_tacotron_model.py index ac6712b0..7053a580 100644 --- a/tests/test_tacotron_model.py +++ b/tests/test_tacotron_model.py @@ -5,7 +5,7 @@ import unittest from torch import optim from torch import nn -from TTS.utils.generic_utils import load_config +from TTS.utils.io import load_config from TTS.layers.losses import 
L1LossMasked
 from TTS.models.tacotron import Tacotron
diff --git a/tests/test_text_processing.py b/tests/test_text_processing.py
index 6c0c7058..93edabe7 100644
--- a/tests/test_text_processing.py
+++ b/tests/test_text_processing.py
@@ -5,7 +5,7 @@ import os
 import unittest
 from TTS.utils.text import *
 from TTS.tests import get_tests_path
-from TTS.utils.generic_utils import load_config
+from TTS.utils.io import load_config

 TESTS_PATH = get_tests_path()
 conf = load_config(os.path.join(TESTS_PATH, 'test_config.json'))
@@ -92,4 +92,4 @@ def test_text2phone():
     gt = "ɹ|iː|s|ə|n|t| |ɹ|ɪ|s|ɜː|tʃ| |æ|t| |h|ɑːɹ|v|ɚ|d| |h|ɐ|z| |ʃ|oʊ|n| |m|ɛ|d|ᵻ|t|eɪ|ɾ|ɪ|ŋ| |f|ɔː|ɹ| |æ|z| |l|ɪ|ɾ|əl| |æ|z| |eɪ|t| |w|iː|k|s| |k|æ|n| |æ|k|tʃ|uː|əl|i| |ɪ|n|k|ɹ|iː|s|,| |ð|ə| |ɡ|ɹ|eɪ| |m|æ|ɾ|ɚ|ɹ| |ɪ|n|ð|ə| |p|ɑːɹ|t|s| |ʌ|v|ð|ə| |b|ɹ|eɪ|n| |ɹ|ɪ|s|p|ɑː|n|s|ə|b|əl| |f|ɔː|ɹ| |ɪ|m|oʊ|ʃ|ə|n|əl| |ɹ|ɛ|ɡ|j|uː|l|eɪ|ʃ|ə|n| |æ|n|d| |l|ɜː|n|ɪ|ŋ|!"
     lang = "en-us"
     ph = text2phone(text, lang)
-    assert gt == ph, f"\n{phonemes} \n vs \n{gt}"
\ No newline at end of file
+    assert gt == ph, f"\n{phonemes} \n vs \n{gt}"
From 523fa5dfd2c183ad030bbcbf4ea37f0a3ae82653 Mon Sep 17 00:00:00 2001
From: erogol
Date: Mon, 18 May 2020 11:35:19 +0200
Subject: [PATCH 080/104] pass sequence mask to the same device as the input

---
 utils/generic_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/utils/generic_utils.py b/utils/generic_utils.py
index 9685f463..c81fde49 100644
--- a/utils/generic_utils.py
+++ b/utils/generic_utils.py
@@ -99,7 +99,7 @@ def sequence_mask(sequence_length, max_len=None):
     seq_range = torch.arange(0, max_len).long()
     seq_range_expand = seq_range.unsqueeze(0).expand(batch_size, max_len)
     if sequence_length.is_cuda:
-        seq_range_expand = seq_range_expand.cuda()
+        seq_range_expand = seq_range_expand.to(sequence_length.device)
     seq_length_expand = (
         sequence_length.unsqueeze(1).expand_as(seq_range_expand))
     # B x T_max
From 8e6aedccee7df04b405a70a6f190232e816b6c4c Mon Sep 17 00:00:00 2001
From: erogol
Date: Mon, 18 May 2020 12:00:10 +0200
Subject: [PATCH 081/104] update readme

---
 tf/README.md | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/tf/README.md b/tf/README.md
index 24e09a06..04b6936c 100644
--- a/tf/README.md
+++ b/tf/README.md
@@ -1,4 +1,12 @@
 ## Utilities to Convert Models to Tensorflow2
-You can find some utilities to convert Torch models to Tensorflow with an experimental Tacotron2 implemenation in Tensorflow2 (>=2.2). However, our released Torch models may not work with this module due to additional changes layer naming convention. Therefore, you need to train new models to be converted to TF.
+Here are utilities to convert trained Torch models to Tensorflow (>=2.2).

-This is an experimental release. If you encounter an error, please put an issue or in the best send a PR but you are mostly on your own.
\ No newline at end of file
+We currently support Tacotron2 with Location Sensitive Attention.
+
+Be aware that our old Torch models may not work with this module due to additional changes in the layer naming convention. Therefore, you need to train new models or handle these changes.
+
+We do not plan to share training scripts for Tensorflow in the near future, but any contribution in that direction would be more than welcome.
+
+To see how you can use the TF model at inference, check the notebook.
+
+This is an experimental release. If you encounter an error, please open an issue or, better yet, send a PR, but you are mostly on your own.
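For orientation, here is a minimal, untested sketch of how a checkpoint produced by these conversion utilities could be restored for inference. It only uses functions introduced in the patches above (`setup_model` and `load_checkpoint` from `tf/utils/generic_utils.py`); the config path, checkpoint file name, step number, and dummy input shape are placeholders, not values from the original patches:

```python
# a minimal sketch, assuming a checkpoint written by save_checkpoint() in
# tf/utils/generic_utils.py; paths and the step number are placeholders
import numpy as np
import tensorflow as tf

from TTS.utils.io import load_config
from TTS.utils.text.symbols import phonemes, symbols
from TTS.tf.utils.generic_utils import setup_model, load_checkpoint

c = load_config('config.json')
num_chars = len(phonemes) if c.use_phonemes else len(symbols)
model = setup_model(num_chars, 0, c)  # 0 speakers, as in the convert script

# run one dummy batch so Keras creates all variables before weights are set
dummy_input = tf.convert_to_tensor(
    np.random.randint(0, num_chars, (1, 32)).astype('int32'))
model(dummy_input, training=False)

model = load_checkpoint(model, 'tts_tf_checkpoint_360000.pkl')
decoder_output, postnet_output, alignments, stop_tokens = model(
    dummy_input, training=False)
```

The dummy forward pass mirrors what the conversion script itself does before transferring weights, since Keras only materializes variables on the first call.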
From 342d6303d41a5a6752db8e91ebdc923ef3eebc95 Mon Sep 17 00:00:00 2001
From: erogol
Date: Mon, 18 May 2020 12:20:51 +0200
Subject: [PATCH 082/104] update TF model notebook

---
 tf/layers/tacotron2.py              |  2 +-
 tf/notebooks/Benchmark-TTS_tf.ipynb | 12 ++++++------
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/tf/layers/tacotron2.py b/tf/layers/tacotron2.py
index 4d787e83..b8e18cb1 100644
--- a/tf/layers/tacotron2.py
+++ b/tf/layers/tacotron2.py
@@ -228,4 +228,4 @@ class Decoder(keras.layers.Layer):
     def call(self, memory, states, frames=None, memory_seq_length=None, training=False):
         if training:
             return self.decode(memory, states, frames, memory_seq_length)
-        return self.decode_inference(memory, states)
\ No newline at end of file
+        return self.decode_inference(memory, states)
diff --git a/tf/notebooks/Benchmark-TTS_tf.ipynb b/tf/notebooks/Benchmark-TTS_tf.ipynb
index 5531460e..c2b634e6 100644
--- a/tf/notebooks/Benchmark-TTS_tf.ipynb
+++ b/tf/notebooks/Benchmark-TTS_tf.ipynb
@@ -10,15 +10,14 @@
    "\n",
    "Before running this script please DON'T FORGET: \n",
    "- to set file paths.\n",
-    "- to download related model files from TTS and PWGAN.\n",
+    "- to download related model files.\n",
    "- download or clone related repos, linked below.\n",
    "- setup the repositories. ```python setup.py install```\n",
-    "- to checkout right commit versions (given next to the model) of TTS and PWGAN.\n",
-    "- to set the right paths in the cell below.\n",
+    "- to check out the right commit versions (given next to the model in the models page).\n",
+    "- to set the file paths below.\n",
    "\n",
    "Repositories:\n",
-    "- TTS: https://github.com/mozilla/TTS\n",
-    "- PWGAN: https://github.com/erogol/ParallelWaveGAN"
+    "- TTS: https://github.com/mozilla/TTS"
   ]
  },
@@ -151,7 +150,8 @@
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
-    "Collapsed": "false"
+    "Collapsed": "false",
+    "scrolled": true
   },
   "outputs": [],
   "source": [
From 327c88b4bb983adc7db8f27fe87b367bd71e2565 Mon Sep 17 00:00:00 2001
From: erogol
Date: Mon, 18 May 2020 13:31:14 +0200
Subject: [PATCH 083/104] readme update

---
 tf/README.md    | 18 +++++++++++++-----
 tf/requirements |  2 --
 2 files changed, 13 insertions(+), 7 deletions(-)
 delete mode 100644 tf/requirements

diff --git a/tf/README.md b/tf/README.md
index 04b6936c..0f9d58e9 100644
--- a/tf/README.md
+++ b/tf/README.md
@@ -1,12 +1,20 @@
 ## Utilities to Convert Models to Tensorflow2
-Here are utilities to convert trained Torch models to Tensorflow (>=2.2).
+Here are experimental utilities to convert trained Torch models to Tensorflow (>=2.2).

-We currently support Tacotron2 with Location Sensitive Attention.
-Be aware that our old Torch models may not work with this module due to additional changes in the layer naming convention. Therefore, you need to train new models or handle these changes.
+Converting Torch models to TF enables the whole TF toolkit to be used for better deployment and device-specific optimizations.

-We do not plan to share training scripts for Tensorflow in the near future, but any contribution in that direction would be more than welcome.
+Note that we do not plan to share training scripts for Tensorflow in the near future, but any contribution in that direction would be more than welcome.

 To see how you can use the TF model at inference, check the notebook.

 This is an experimental release. If you encounter an error, please open an issue or, better yet, send a PR, but you are mostly on your own.
+
+
+### Converting a Model
+- Run ```convert_tacotron2_torch_to_tf.py --torch_model_path /path/to/torch/model.pth.tar --config_path /path/to/model/config.json --output_path /path/to/output/tf/model``` with the right arguments.
+
+### Known issues and limitations
+- We use a custom model load/save mechanism which enables us to store model-related information with the model weights, similar to Torch. However, it is prone to random errors.
+- The current TF model implementation is slightly slower than the Torch model. Hopefully, it'll get better as TF support for eager mode and ```tf.function``` improves.
+- The TF implementation of Tacotron2 only supports regular Tacotron2 as in the paper.
+- You can only convert models trained after the TF model implementation, since the model layers have been updated in the Torch model.
diff --git a/tf/requirements b/tf/requirements
deleted file mode 100644
index 75882a1d..00000000
--- a/tf/requirements
+++ /dev/null
@@ -1,2 +0,0 @@
-fuzzywuzzy
-tensorflow>=2.2.0
\ No newline at end of file
From 496ff68decb4245d3951bd2d6f93807e6f9d3fed Mon Sep 17 00:00:00 2001
From: erogol
Date: Mon, 18 May 2020 18:45:30 +0200
Subject: [PATCH 084/104] config update

---
 config.json | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/config.json b/config.json
index c23bd004..ffc8fee5 100644
--- a/config.json
+++ b/config.json
@@ -118,11 +118,11 @@
     "max_seq_len": 153,         // DATASET-RELATED: maximum text length

     // PATHS
-    "output_path": "/home/erogol/Models/LJSpeech/",
+    "output_path": "/data/rw/home/Trainings/",

     // PHONEMES
-    "phoneme_cache_path": "mozilla_us_phonemes_3",  // phoneme computation is slow, therefore, it caches results in the given folder.
-    "use_phonemes": true,           // use phonemes instead of raw characters. It is suggested for better pronounciation.
+    "phoneme_cache_path": "/root/mozilla_us_phonemes_3",  // phoneme computation is slow, therefore, it caches results in the given folder.
+    "use_phonemes": false,          // use phonemes instead of raw characters. It is suggested for better pronunciation.
"phoneme_language": "en-us", // depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages // MULTI-SPEAKER and GST @@ -135,7 +135,7 @@ [ { "name": "ljspeech", - "path": "/home/erogol/Data/LJSpeech-1.1/", + "path": "/root/LJSpeech-1.1/", "meta_file_train": "metadata.csv", "meta_file_val": null } From f75b0a64393b2f9721dec0ed922f9406d08e93f7 Mon Sep 17 00:00:00 2001 From: erogol Date: Mon, 18 May 2020 18:46:13 +0200 Subject: [PATCH 085/104] linter updates --- distribute.py | 1 - layers/tacotron2.py | 6 +- server/synthesizer.py | 4 +- synthesize.py | 2 +- tests/test_tacotron2_tf_model.py | 32 +++++---- tf/convert_tacotron2_torch_to_tf.py | 98 +++++++++++++++------------ tf/layers/common_layers.py | 22 +++--- tf/layers/tacotron2.py | 31 +++++---- tf/models/tacotron2.py | 27 +++++--- tf/utils/convert_torch_to_tf_utils.py | 7 +- tf/utils/generic_utils.py | 50 ++++++-------- train.py | 7 +- utils/generic_utils.py | 12 ++-- utils/io.py | 12 ++-- utils/radam.py | 6 +- utils/synthesis.py | 18 ++--- utils/training.py | 5 +- 17 files changed, 177 insertions(+), 163 deletions(-) diff --git a/distribute.py b/distribute.py index 873d8aba..b0fc8b07 100644 --- a/distribute.py +++ b/distribute.py @@ -9,7 +9,6 @@ import torch.distributed as dist from torch.utils.data.sampler import Sampler from torch.autograd import Variable from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors -from TTS.utils.io import load_config from TTS.utils.generic_utils import create_experiment_folder diff --git a/layers/tacotron2.py b/layers/tacotron2.py index bdb169be..f11aee65 100644 --- a/layers/tacotron2.py +++ b/layers/tacotron2.py @@ -11,9 +11,9 @@ class ConvBNBlock(nn.Module): assert (kernel_size - 1) % 2 == 0 padding = (kernel_size - 1) // 2 self.convolution1d = nn.Conv1d(in_channels, - out_channels, - kernel_size, - padding=padding) + out_channels, + kernel_size, + padding=padding) self.batch_normalization = nn.BatchNorm1d(out_channels, momentum=0.1, eps=1e-5) self.dropout = nn.Dropout(p=0.5) if activation == 'relu': diff --git a/server/synthesizer.py b/server/synthesizer.py index 453e5827..c0069e33 100644 --- a/server/synthesizer.py +++ b/server/synthesizer.py @@ -171,12 +171,12 @@ class Synthesizer(object): speaker_id = id_to_torch(speaker_id) if speaker_id is not None and self.use_cuda: speaker_id = speaker_id.cuda() - + for sen in sens: # preprocess the given text inputs = text_to_seqvec(sen, self.tts_config, self.use_cuda) # synthesize voice - decoder_output, postnet_output, alignments, _ = run_model( + decoder_output, postnet_output, alignments, _ = run_model_torch( self.tts_model, inputs, self.tts_config, False, speaker_id, None) # convert outputs to numpy postnet_output, decoder_output, _ = parse_outputs( diff --git a/synthesize.py b/synthesize.py index 1f1ce36f..1a760268 100644 --- a/synthesize.py +++ b/synthesize.py @@ -25,7 +25,7 @@ def tts(model, figures=False): t_1 = time.time() use_vocoder_model = vocoder_model is not None - waveform, alignment, _, postnet_output, stop_tokens = synthesis( + waveform, alignment, _, postnet_output, stop_tokens, _ = synthesis( model, text, C, use_cuda, ap, speaker_id, style_wav=False, truncated=False, enable_eos_bos_chars=C.enable_eos_bos_chars, use_griffin_lim=(not use_vocoder_model), do_trim_silence=True) diff --git a/tests/test_tacotron2_tf_model.py b/tests/test_tacotron2_tf_model.py index 27398748..bc8f0407 100644 --- a/tests/test_tacotron2_tf_model.py +++ b/tests/test_tacotron2_tf_model.py @@ -1,14 +1,10 @@ import os 
-import copy import torch import unittest import numpy as np import tensorflow as tf -from torch import optim -from torch import nn from TTS.utils.io import load_config -from TTS.layers.losses import MSELossMasked from TTS.tf.models.tacotron2 import Tacotron2 #pylint: disable=unused-variable @@ -22,36 +18,44 @@ c = load_config(os.path.join(file_path, 'test_config.json')) class TacotronTFTrainTest(unittest.TestCase): - def test_train_step(self): - ''' test forward pass ''' - input = torch.randint(0, 24, (8, 128)).long().to(device) - input_lengths = torch.randint(100, 128, (8, )).long().to(device) - input_lengths = torch.sort(input_lengths, descending=True)[0] + + @staticmethod + def generate_dummy_inputs(): + chars_seq = torch.randint(0, 24, (8, 128)).long().to(device) + chars_seq_lengths = torch.randint(100, 128, (8, )).long().to(device) + chars_seq_lengths = torch.sort(chars_seq_lengths, descending=True)[0] mel_spec = torch.rand(8, 30, c.audio['num_mels']).to(device) mel_postnet_spec = torch.rand(8, 30, c.audio['num_mels']).to(device) mel_lengths = torch.randint(20, 30, (8, )).long().to(device) stop_targets = torch.zeros(8, 30, 1).float().to(device) speaker_ids = torch.randint(0, 5, (8, )).long().to(device) - input = tf.convert_to_tensor(input.cpu().numpy()) - input_lengths = tf.convert_to_tensor(input_lengths.cpu().numpy()) + chars_seq = tf.convert_to_tensor(chars_seq.cpu().numpy()) + chars_seq_lengths = tf.convert_to_tensor(chars_seq_lengths.cpu().numpy()) mel_spec = tf.convert_to_tensor(mel_spec.cpu().numpy()) + return chars_seq, chars_seq_lengths, mel_spec, mel_postnet_spec, mel_lengths,\ + stop_targets, speaker_ids + + def test_train_step(self): + ''' test forward pass ''' + chars_seq, chars_seq_lengths, mel_spec, mel_postnet_spec, mel_lengths,\ + stop_targets, speaker_ids = self.generate_dummy_inputs() for idx in mel_lengths: stop_targets[:, int(idx.item()):, 0] = 1.0 - stop_targets = stop_targets.view(input.shape[0], + stop_targets = stop_targets.view(chars_seq.shape[0], stop_targets.size(1) // c.r, -1) stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze() model = Tacotron2(num_chars=24, r=c.r, num_speakers=5) # training pass - output = model(input, input_lengths, mel_spec, training=True) + output = model(chars_seq, chars_seq_lengths, mel_spec, training=True) # check model output shapes assert np.all(output[0].shape == mel_spec.shape) assert np.all(output[1].shape == mel_spec.shape) - assert output[2].shape[2] == input.shape[1] + assert output[2].shape[2] == chars_seq.shape[1] assert output[2].shape[1] == (mel_spec.shape[1] // model.decoder.r) assert output[3].shape[1] == (mel_spec.shape[1] // model.decoder.r) diff --git a/tf/convert_tacotron2_torch_to_tf.py b/tf/convert_tacotron2_torch_to_tf.py index 512b0a4d..3b57782e 100644 --- a/tf/convert_tacotron2_torch_to_tf.py +++ b/tf/convert_tacotron2_torch_to_tf.py @@ -10,27 +10,23 @@ import torch import tensorflow as tf from fuzzywuzzy import fuzz -from TTS.utils.text.symbols import make_symbols, phonemes, symbols -from TTS.utils.generic_utils import setup_model, count_parameters +from TTS.utils.text.symbols import phonemes, symbols +from TTS.utils.generic_utils import setup_model from TTS.utils.io import load_config from TTS_tf.models.tacotron2 import Tacotron2 from TTS_tf.utils.convert_torch_to_tf_utils import compare_torch_tf, tf_create_dummy_inputs, transfer_weights_torch_to_tf, convert_tf_name from TTS_tf.utils.generic_utils import save_checkpoint - parser = argparse.ArgumentParser() -parser.add_argument( - 
'--torch_model_path', - type=str, - help='Path to target torch model to be converted to TF.') -parser.add_argument( - '--config_path', - type=str, - help='Path to config file of torch model.') -parser.add_argument( - '--output_path', - type=str, - help='path to save TF model weights.') +parser.add_argument('--torch_model_path', + type=str, + help='Path to target torch model to be converted to TF.') +parser.add_argument('--config_path', + type=str, + help='Path to config file of torch model.') +parser.add_argument('--output_path', + type=str, + help='path to save TF model weights.') args = parser.parse_args() # load model config @@ -41,7 +37,8 @@ num_speakers = 0 # init torch model num_chars = len(phonemes) if c.use_phonemes else len(symbols) model = setup_model(num_chars, num_speakers, c) -checkpoint = torch.load(args.torch_model_path, map_location=torch.device('cpu')) +checkpoint = torch.load(args.torch_model_path, + map_location=torch.device('cpu')) state_dict = checkpoint['model'] model.load_state_dict(state_dict) @@ -69,18 +66,24 @@ model_tf = Tacotron2(num_chars=num_chars, common_sufix = '/.ATTRIBUTES/VARIABLE_VALUE' var_map = [ ('tacotron2/embedding/embeddings:0', 'embedding.weight'), - ('tacotron2/encoder/lstm/forward_lstm/lstm_cell_1/kernel:0', 'encoder.lstm.weight_ih_l0'), - ('tacotron2/encoder/lstm/forward_lstm/lstm_cell_1/recurrent_kernel:0', 'encoder.lstm.weight_hh_l0'), - ('tacotron2/encoder/lstm/backward_lstm/lstm_cell_2/kernel:0', 'encoder.lstm.weight_ih_l0_reverse'), - ('tacotron2/encoder/lstm/backward_lstm/lstm_cell_2/recurrent_kernel:0', 'encoder.lstm.weight_hh_l0_reverse'), - ('tacotron2/encoder/lstm/forward_lstm/lstm_cell_1/bias:0', ('encoder.lstm.bias_ih_l0', 'encoder.lstm.bias_hh_l0')), - ('tacotron2/encoder/lstm/backward_lstm/lstm_cell_2/bias:0', ('encoder.lstm.bias_ih_l0_reverse', 'encoder.lstm.bias_hh_l0_reverse')), + ('tacotron2/encoder/lstm/forward_lstm/lstm_cell_1/kernel:0', + 'encoder.lstm.weight_ih_l0'), + ('tacotron2/encoder/lstm/forward_lstm/lstm_cell_1/recurrent_kernel:0', + 'encoder.lstm.weight_hh_l0'), + ('tacotron2/encoder/lstm/backward_lstm/lstm_cell_2/kernel:0', + 'encoder.lstm.weight_ih_l0_reverse'), + ('tacotron2/encoder/lstm/backward_lstm/lstm_cell_2/recurrent_kernel:0', + 'encoder.lstm.weight_hh_l0_reverse'), + ('tacotron2/encoder/lstm/forward_lstm/lstm_cell_1/bias:0', + ('encoder.lstm.bias_ih_l0', 'encoder.lstm.bias_hh_l0')), + ('tacotron2/encoder/lstm/backward_lstm/lstm_cell_2/bias:0', + ('encoder.lstm.bias_ih_l0_reverse', 'encoder.lstm.bias_hh_l0_reverse')), ('attention/v/kernel:0', 'decoder.attention.v.linear_layer.weight'), - ('decoder/linear_projection/kernel:0', 'decoder.linear_projection.linear_layer.weight'), + ('decoder/linear_projection/kernel:0', + 'decoder.linear_projection.linear_layer.weight'), ('decoder/stopnet/kernel:0', 'decoder.stopnet.1.linear_layer.weight') ] - # %% # get tf_model graph input_ids, input_lengths, mel_outputs, mel_lengths = tf_create_dummy_inputs() @@ -95,15 +98,17 @@ tf_var_names = [we.name for we in model_tf.weights] for tf_name in tf_var_names: # skip re-mapped layer names if tf_name in [name[0] for name in var_map]: - continue + continue tf_name_edited = convert_tf_name(tf_name) - ratios = [fuzz.ratio(torch_name, tf_name_edited) for torch_name in torch_var_names] + ratios = [ + fuzz.ratio(torch_name, tf_name_edited) + for torch_name in torch_var_names + ] max_idx = np.argmax(ratios) matching_name = torch_var_names[max_idx] del torch_var_names[max_idx] var_map.append((tf_name, matching_name)) - # %% # print 
variable match from pprint import pprint @@ -121,20 +126,25 @@ input_ids = torch.randint(0, 24, (1, 128)).long() o_t = model.embedding(input_ids) o_tf = model_tf.embedding(input_ids.detach().numpy()) -assert abs(o_t.detach().numpy() - o_tf.numpy()).sum() < 1e-5, abs(o_t.detach().numpy() - o_tf.numpy()).sum() +assert abs(o_t.detach().numpy() - + o_tf.numpy()).sum() < 1e-5, abs(o_t.detach().numpy() - + o_tf.numpy()).sum() # compare encoder outputs -oo_en = model.encoder.inference(o_t.transpose(1,2)) +oo_en = model.encoder.inference(o_t.transpose(1, 2)) ooo_en = model_tf.encoder(o_t.detach().numpy(), training=False) assert compare_torch_tf(oo_en, ooo_en) < 1e-5 +#pylint: disable=redefined-builtin # compare decoder.attention_rnn inp = torch.rand([1, 768]) inp_tf = inp.numpy() -model.decoder._init_states(oo_en, mask=None) +model.decoder._init_states(oo_en, mask=None) #pylint: disable=protected-access output, cell_state = model.decoder.attention_rnn(inp) -states = model_tf.decoder.build_decoder_initial_states(1,512,128) -output_tf, memory_state = model_tf.decoder.attention_rnn(inp_tf, states[2], training=False) +states = model_tf.decoder.build_decoder_initial_states(1, 512, 128) +output_tf, memory_state = model_tf.decoder.attention_rnn(inp_tf, + states[2], + training=False) assert compare_torch_tf(output, output_tf).mean() < 1e-5 # compare decoder.attention @@ -145,7 +155,8 @@ inputs_tf = inputs.numpy() model.decoder.attention.init_states(inputs) processes_inputs = model.decoder.attention.preprocess_inputs(inputs) -loc_attn, proc_query = model.decoder.attention.get_location_attention(query, processes_inputs) +loc_attn, proc_query = model.decoder.attention.get_location_attention( + query, processes_inputs) context = model.decoder.attention(query, inputs, processes_inputs, None) model_tf.decoder.attention.process_values(tf.convert_to_tensor(inputs_tf)) @@ -159,10 +170,13 @@ assert compare_torch_tf(context, context_tf) < 1e-5 # compare decoder.decoder_rnn input = torch.rand([1, 1536]) input_tf = input.numpy() -model.decoder._init_states(oo_en, mask=None) -output, cell_state = model.decoder.decoder_rnn(input, [model.decoder.decoder_hidden, model.decoder.decoder_cell]) -states = model_tf.decoder.build_decoder_initial_states(1,512,128) -output_tf, memory_state = model_tf.decoder.decoder_rnn(input_tf, states[3], training=False) +model.decoder._init_states(oo_en, mask=None) #pylint: disable=protected-access +output, cell_state = model.decoder.decoder_rnn( + input, [model.decoder.decoder_hidden, model.decoder.decoder_cell]) +states = model_tf.decoder.build_decoder_initial_states(1, 512, 128) +output_tf, memory_state = model_tf.decoder.decoder_rnn(input_tf, + states[3], + training=False) assert abs(input - input_tf).mean() < 1e-5 assert compare_torch_tf(output, output_tf).mean() < 1e-5 @@ -177,15 +191,16 @@ assert compare_torch_tf(output, output_tf) < 1e-5 model.decoder.max_decoder_steps = 100 model_tf.decoder.set_max_decoder_steps(100) output, align, stop = model.decoder.inference(oo_en) -states = model_tf.decoder.build_decoder_initial_states(1,512,128) +states = model_tf.decoder.build_decoder_initial_states(1, 512, 128) output_tf, align_tf, stop_tf = model_tf.decoder(ooo_en, states, training=False) -assert compare_torch_tf(output.transpose(1,2), output_tf) < 1e-4 +assert compare_torch_tf(output.transpose(1, 2), output_tf) < 1e-4 # compare the whole model output outputs_torch = model.inference(input_ids) outputs_tf = model_tf(tf.convert_to_tensor(input_ids.numpy())) -print(abs(outputs_torch[0].numpy()[:, 
0] - outputs_tf[0].numpy()[:, 0]).mean() ) -assert compare_torch_tf(outputs_torch[2][:, 50, :], outputs_tf[2][:, 50, :]) < 1e-5 +print(abs(outputs_torch[0].numpy()[:, 0] - outputs_tf[0].numpy()[:, 0]).mean()) +assert compare_torch_tf(outputs_torch[2][:, 50, :], + outputs_tf[2][:, 50, :]) < 1e-5 assert compare_torch_tf(outputs_torch[0], outputs_tf[0]) < 1e-4 # %% @@ -193,4 +208,3 @@ assert compare_torch_tf(outputs_torch[0], outputs_tf[0]) < 1e-4 save_checkpoint(model_tf, None, checkpoint['step'], checkpoint['epoch'], checkpoint['r'], args.output_path) print(' > Model conversion is successfully completed :).') - diff --git a/tf/layers/common_layers.py b/tf/layers/common_layers.py index fba06e0b..995b5490 100644 --- a/tf/layers/common_layers.py +++ b/tf/layers/common_layers.py @@ -3,8 +3,6 @@ from tensorflow import keras from tensorflow.python.ops import math_ops # from tensorflow_addons.seq2seq import BahdanauAttention -from TTS.tf.utils.tf_utils import shape_list - class Linear(keras.layers.Layer): def __init__(self, units, use_bias, **kwargs): @@ -12,7 +10,7 @@ class Linear(keras.layers.Layer): self.linear_layer = keras.layers.Dense(units, use_bias=use_bias, name='linear_layer') self.activation = keras.layers.ReLU() - def call(self, x, training=None): + def call(self, x): """ shapes: x: B x T x C @@ -77,9 +75,9 @@ def _sigmoid_norm(score): class Attention(keras.layers.Layer): - """TODO: implement forward_attention""" - """TODO: location sensitive attention""" - """TODO: implement attention windowing """ + """TODO: implement forward_attention + TODO: location sensitive attention + TODO: implement attention windowing """ def __init__(self, attn_dim, use_loc_attn, loc_attn_n_filters, loc_attn_kernel_size, use_windowing, norm, use_forward_attn, use_trans_agent, use_forward_attn_mask, **kwargs): @@ -120,6 +118,7 @@ class Attention(keras.layers.Layer): def process_values(self, values): """ cache values for decoder iterations """ + #pylint: disable=attribute-defined-outside-init self.processed_values = self.inputs_layer(values) self.values = values @@ -127,8 +126,7 @@ class Attention(keras.layers.Layer): """ compute location attention, query layer and unnorm. attention weights""" attention_cum, attention_old = states - attn_cat = tf.stack([attention_old, attention_cum], - axis=2) + attn_cat = tf.stack([attention_old, attention_cum], axis=2) processed_query = self.query_layer(tf.expand_dims(query, 1)) processed_attn = self.location_dense(self.location_conv1d(attn_cat)) @@ -145,7 +143,7 @@ class Attention(keras.layers.Layer): score = tf.squeeze(score, axis=2) return score, processed_query - def apply_score_masking(self, score, mask): + def apply_score_masking(self, score, mask): #pylint: disable=no-self-use """ ignore sequence paddings """ padding_mask = tf.expand_dims(math_ops.logical_not(mask), 2) # Bias so padding positions do not contribute to attention distribution. 
@@ -158,13 +156,13 @@ class Attention(keras.layers.Layer): query: B x D """ if self.use_loc_attn: - score, processed_query = self.get_loc_attn(query, states) + score, _ = self.get_loc_attn(query, states) else: - score, processed_query = self.get_attn(query) + score, _ = self.get_attn(query) # TODO: masking # if mask is not None: - # self.apply_score_masking(score, mask) + # self.apply_score_masking(score, mask) # attn_weights shape == (batch_size, max_length, 1) attn_weights = self.norm_func(score) diff --git a/tf/layers/tacotron2.py b/tf/layers/tacotron2.py index b8e18cb1..c6f1a2cd 100644 --- a/tf/layers/tacotron2.py +++ b/tf/layers/tacotron2.py @@ -55,6 +55,7 @@ class Encoder(keras.layers.Layer): class Decoder(keras.layers.Layer): + #pylint: disable=unused-argument def __init__(self, frame_dim, r, attn_type, use_attn_win, attn_norm, prenet_type, prenet_dropout, use_forward_attn, use_trans_agent, use_forward_attn_mask, use_location_attn, attn_K, separate_stopnet, speaker_emb_dim, **kwargs): @@ -135,7 +136,7 @@ class Decoder(keras.layers.Layer): return output_frame, stopnet_output, states, attention def decode(self, memory, states, frames, memory_seq_length=None): - B, T, D = shape_list(memory) + B, _, _ = shape_list(memory) num_iter = shape_list(frames)[1] // self.r # init states frame_zero = tf.expand_dims(states[0], 1) @@ -159,25 +160,25 @@ class Decoder(keras.layers.Layer): return step + 1, memory, prenet_output, states, outputs, stop_tokens, attentions _, memory, _, states, outputs, stop_tokens, attentions = \ tf.while_loop(lambda *arg: True, - _body, - loop_vars=(step_count, memory, prenet_output, states, outputs, - stop_tokens, attentions), - parallel_iterations=32, - swap_memory=True, - maximum_iterations=num_iter) + _body, + loop_vars=(step_count, memory, prenet_output, + states, outputs, stop_tokens, attentions), + parallel_iterations=32, + swap_memory=True, + maximum_iterations=num_iter) outputs = outputs.stack() attentions = attentions.stack() stop_tokens = stop_tokens.stack() outputs = tf.transpose(outputs, [1, 0, 2]) - attentions = tf.transpose(attentions, [1, 0 ,2]) + attentions = tf.transpose(attentions, [1, 0, 2]) stop_tokens = tf.transpose(stop_tokens, [1, 0, 2]) stop_tokens = tf.squeeze(stop_tokens, axis=2) outputs = tf.reshape(outputs, [B, -1, self.frame_dim]) return outputs, stop_tokens, attentions def decode_inference(self, memory, states): - B, T, D = shape_list(memory) + B, _, _ = shape_list(memory) # init states outputs = tf.TensorArray(dtype=tf.float32, size=0, clear_after_read=False, dynamic_size=True) attentions = tf.TensorArray(dtype=tf.float32, size=0, clear_after_read=False, dynamic_size=True) @@ -207,12 +208,12 @@ class Decoder(keras.layers.Layer): cond = lambda step, m, s, o, st, a, stop_flag: tf.equal(stop_flag, tf.constant(False, dtype=tf.bool)) _, memory, states, outputs, stop_tokens, attentions, stop_flag = \ tf.while_loop(cond, - _body, - loop_vars=(step_count, memory, states, outputs, - stop_tokens, attentions, stop_flag), - parallel_iterations=32, - swap_memory=True, - maximum_iterations=self.max_decoder_steps) + _body, + loop_vars=(step_count, memory, states, outputs, + stop_tokens, attentions, stop_flag), + parallel_iterations=32, + swap_memory=True, + maximum_iterations=self.max_decoder_steps) outputs = outputs.stack() attentions = attentions.stack() diff --git a/tf/models/tacotron2.py b/tf/models/tacotron2.py index 8ddee666..101291cf 100644 --- a/tf/models/tacotron2.py +++ b/tf/models/tacotron2.py @@ -1,10 +1,10 @@ -import tensorflow as tf from 
tensorflow import keras

 from TTS.tf.layers.tacotron2 import Encoder, Decoder, Postnet
 from TTS.tf.utils.tf_utils import shape_list


+#pylint: disable=too-many-ancestors
 class Tacotron2(keras.models.Model):
     def __init__(self, num_chars,
@@ -35,16 +35,28 @@
         self.embedding = keras.layers.Embedding(num_chars, 512, name='embedding')
         self.encoder = Encoder(512, name='encoder')
         # TODO: most of the decoder args have no use at the moment
-        self.decoder = Decoder(decoder_output_dim, r, attn_type=attn_type, use_attn_win=attn_win, attn_norm=attn_norm, prenet_type=prenet_type,
-                               prenet_dropout=prenet_dropout, use_forward_attn=forward_attn, use_trans_agent=trans_agent, use_forward_attn_mask=forward_attn_mask,
-                               use_location_attn=location_attn, attn_K=attn_K, separate_stopnet=separate_stopnet, speaker_emb_dim=self.speaker_embed_dim)
+        self.decoder = Decoder(decoder_output_dim,
+                               r,
+                               attn_type=attn_type,
+                               use_attn_win=attn_win,
+                               attn_norm=attn_norm,
+                               prenet_type=prenet_type,
+                               prenet_dropout=prenet_dropout,
+                               use_forward_attn=forward_attn,
+                               use_trans_agent=trans_agent,
+                               use_forward_attn_mask=forward_attn_mask,
+                               use_location_attn=location_attn,
+                               attn_K=attn_K,
+                               separate_stopnet=separate_stopnet,
+                               speaker_emb_dim=self.speaker_embed_dim)
         self.postnet = Postnet(postnet_output_dim, 5, name='postnet')

     def call(self, characters, text_lengths=None, frames=None, training=None):
-        if training == True:
+        if training:
             return self.training(characters, text_lengths, frames)
-        else:
+        if training is False:
             return self.inference(characters)
+        raise RuntimeError(' [!] Set model training mode True or False')

     def training(self, characters, text_lengths, frames):
         B, T = shape_list(characters)
@@ -67,6 +79,3 @@
         print(output_frames.shape)
         return decoder_frames, output_frames, attentions, stop_tokens
-
-
-
diff --git a/tf/utils/convert_torch_to_tf_utils.py b/tf/utils/convert_torch_to_tf_utils.py
index 732f2fb5..ba7e629b 100644
--- a/tf/utils/convert_torch_to_tf_utils.py
+++ b/tf/utils/convert_torch_to_tf_utils.py
@@ -1,8 +1,5 @@
 import numpy as np
-import torch
-import re
 import tensorflow as tf
-import tensorflow.keras.backend as K


 def tf_create_dummy_inputs():
@@ -17,7 +14,7 @@ def tf_create_dummy_inputs():
     input_lengths[-1] = max_input_length
     input_lengths = tf.convert_to_tensor(input_lengths, dtype=tf.int32)
     mel_outputs = tf.random.uniform(shape=[batch_size, max_mel_length + pad, 80])
-    mel_lengths = np.random.randint(0, high=max_mel_length+1 + pad, size=[batch_size])
+    mel_lengths = np.random.randint(0, high=max_mel_length+1 + pad, size=[batch_size])
     mel_lengths[-1] = max_mel_length
     mel_lengths = tf.convert_to_tensor(mel_lengths, dtype=tf.int32)
     return input_ids, input_lengths, mel_outputs, mel_lengths
@@ -49,7 +46,7 @@ def transfer_weights_torch_to_tf(tf_vars, var_map_dict, state_dict):
         torch_var_name = var_map_dict[tf_var.name]
         print(f' | > {tf_var.name} <-- {torch_var_name}')
         # if tuple, it is a bias variable
-        if type(torch_var_name) is not tuple:
+        if not isinstance(torch_var_name, tuple):
             torch_layer_name = '.'.join(torch_var_name.split('.')[-2:])
             torch_weight = state_dict[torch_var_name]
         if 'convolution1d/kernel' in tf_var.name or 'conv1d/kernel' in tf_var.name:
diff --git a/tf/utils/generic_utils.py b/tf/utils/generic_utils.py
index 3ef10a62..6368658d 100644
--- a/tf/utils/generic_utils.py
+++ b/tf/utils/generic_utils.py
@@ -1,14 +1,8 @@
 import os
-import re
-import glob
-import shutil
 import datetime
-import json
-import subprocess 
import importlib import pickle import numpy as np -from collections import OrderedDict, Counter import tensorflow as tf @@ -29,7 +23,7 @@ def save_checkpoint(model, optimizer, current_step, epoch, r, output_folder, **k def load_checkpoint(model, checkpoint_path): checkpoint = pickle.load(open(checkpoint_path, 'rb')) - chkp_var_dict = dict([(var.name, var.numpy()) for var in checkpoint['model']]) + chkp_var_dict = {var.name: var.numpy() for var in checkpoint['model']} tf_vars = model.weights for tf_var in tf_vars: layer_name = tf_var.name @@ -64,7 +58,7 @@ def check_gradient(x, grad_clip): def count_parameters(model, c): try: return model.count_params() - except: + except RuntimeError: input_dummy = tf.convert_to_tensor(np.random.rand(8, 128).astype('int32')) input_lengths = np.random.randint(100, 129, (8, )) input_lengths[-1] = 128 @@ -74,7 +68,7 @@ def count_parameters(model, c): mel_spec = tf.convert_to_tensor(mel_spec) speaker_ids = np.random.randint( 0, 5, (8, )) if c.use_speaker_embedding else None - _ = model(input_dummy, input_lengths, mel_spec) + _ = model(input_dummy, input_lengths, mel_spec, speaker_ids=speaker_ids) return model.count_params() @@ -83,23 +77,23 @@ def setup_model(num_chars, num_speakers, c): MyModel = importlib.import_module('TTS.tf.models.' + c.model.lower()) MyModel = getattr(MyModel, c.model) if c.model.lower() in "tacotron": - raise NotImplemented(' [!] Tacotron model is not ready.') - elif c.model.lower() == "tacotron2": - model = MyModel(num_chars=num_chars, - num_speakers=num_speakers, - r=c.r, - postnet_output_dim=c.audio['num_mels'], - decoder_output_dim=c.audio['num_mels'], - attn_type=c.attention_type, - attn_win=c.windowing, - attn_norm=c.attention_norm, - prenet_type=c.prenet_type, - prenet_dropout=c.prenet_dropout, - forward_attn=c.use_forward_attn, - trans_agent=c.transition_agent, - forward_attn_mask=c.forward_attn_mask, - location_attn=c.location_attn, - attn_K=c.attention_heads, - separate_stopnet=c.separate_stopnet, - bidirectional_decoder=c.bidirectional_decoder) + raise NotImplementedError(' [!] 
Tacotron model is not ready.') + # tacotron2 + model = MyModel(num_chars=num_chars, + num_speakers=num_speakers, + r=c.r, + postnet_output_dim=c.audio['num_mels'], + decoder_output_dim=c.audio['num_mels'], + attn_type=c.attention_type, + attn_win=c.windowing, + attn_norm=c.attention_norm, + prenet_type=c.prenet_type, + prenet_dropout=c.prenet_dropout, + forward_attn=c.use_forward_attn, + trans_agent=c.transition_agent, + forward_attn_mask=c.forward_attn_mask, + location_attn=c.location_attn, + attn_K=c.attention_heads, + separate_stopnet=c.separate_stopnet, + bidirectional_decoder=c.bidirectional_decoder) return model diff --git a/train.py b/train.py index b64b4f3e..8ff768f0 100644 --- a/train.py +++ b/train.py @@ -190,7 +190,7 @@ def train(model, criterion, optimizer, optimizer_st, scheduler, # backward pass loss_dict['loss'].backward() optimizer, current_lr = adam_weight_decay(optimizer) - grad_norm, grad_flag = check_update(model, c.grad_clip, ignore_stopnet=True) + grad_norm, _ = check_update(model, c.grad_clip, ignore_stopnet=True) optimizer.step() # compute alignment error (the lower the better ) @@ -232,8 +232,7 @@ def train(model, criterion, optimizer, optimizer_st, scheduler, loss_dict['postnet_loss'] = reduce_tensor(loss_dict['postnet_loss'].data, num_gpus) loss_dict['decoder_loss'] = reduce_tensor(loss_dict['decoder_loss'].data, num_gpus) loss_dict['loss'] = reduce_tensor(loss_dict['loss'] .data, num_gpus) - loss_dict['stopnet_loss'] = reduce_tensor(loss_dict['stopnet_loss'].data, - num_gpus) if c.stopnet else loss_dict['stopnet_loss'] + loss_dict['stopnet_loss'] = reduce_tensor(loss_dict['stopnet_loss'].data, num_gpus) if c.stopnet else loss_dict['stopnet_loss'] if args.rank == 0: # Plot Training Iter Stats @@ -308,8 +307,6 @@ def train(model, criterion, optimizer, optimizer_st, scheduler, @torch.no_grad() def evaluate(model, criterion, ap, global_step, epoch): data_loader = setup_loader(ap, model.decoder.r, is_val=True) - if c.use_speaker_embedding: - speaker_mapping = load_speaker_mapping(OUT_PATH) model.eval() epoch_time = 0 eval_values_dict = { diff --git a/utils/generic_utils.py b/utils/generic_utils.py index c81fde49..1c7dd5e4 100644 --- a/utils/generic_utils.py +++ b/utils/generic_utils.py @@ -6,6 +6,7 @@ import datetime import subprocess import importlib import numpy as np +from collections import Counter def get_git_branch(): @@ -40,10 +41,10 @@ def get_commit_hash(): def create_experiment_folder(root_path, model_name, debug): """ Create a folder with the current date and time """ date_str = datetime.datetime.now().strftime("%B-%d-%Y_%I+%M%p") - # if debug: - # commit_hash = 'debug' - # else: - commit_hash = get_commit_hash() + if debug: + commit_hash = 'debug' + else: + commit_hash = get_commit_hash() output_folder = os.path.join( root_path, model_name + '-' + date_str + '-' + commit_hash) os.makedirs(output_folder, exist_ok=True) @@ -87,8 +88,7 @@ def split_dataset(items): items_eval.append(items[item_idx]) del items[item_idx] return items_eval, items - else: - return items[:eval_split_size], items[eval_split_size:] + return items[:eval_split_size], items[eval_split_size:] # from https://gist.github.com/jihunchoi/f1434a77df9db1bb337417854b398df1 diff --git a/utils/io.py b/utils/io.py index f6378336..faf00195 100644 --- a/utils/io.py +++ b/utils/io.py @@ -26,7 +26,7 @@ def copy_config_file(config_file, out_path, new_fields): config_lines = open(config_file, "r").readlines() # add extra information fields for key, value in new_fields.items(): - if type(value) == str: + 
if isinstance(value, str): new_line = '"{}":"{}",\n'.format(key, value) else: new_line = '"{}":{},\n'.format(key, value) @@ -37,7 +37,7 @@ def copy_config_file(config_file, out_path, new_fields): def load_checkpoint(model, checkpoint_path, use_cuda=False): - state = torch.load(checkpoint_path, map_location=torch.device('cpu')) + state = torch.load(checkpoint_path, map_location=torch.device('cpu')) model.load_state_dict(state['model']) if use_cuda: model.cuda() @@ -55,7 +55,7 @@ def save_model(model, optimizer, current_step, epoch, r, output_path, **kwargs): 'step': current_step, 'epoch': epoch, 'date': datetime.date.today().strftime("%B %d, %Y"), - 'r': model.decoder.r + 'r': r } state.update(kwargs) torch.save(state, output_path) @@ -65,7 +65,7 @@ def save_checkpoint(model, optimizer, current_step, epoch, r, output_folder, **k file_name = 'checkpoint_{}.pth.tar'.format(current_step) checkpoint_path = os.path.join(output_folder, file_name) print(" > CHECKPOINT : {}".format(checkpoint_path)) - save_model(model, optimizer, current_step, epoch ,r, checkpoint_path, **kwargs) + save_model(model, optimizer, current_step, epoch, r, checkpoint_path, **kwargs) def save_best_model(target_loss, best_loss, model, optimizer, current_step, epoch, r, output_folder, **kwargs): @@ -73,6 +73,6 @@ def save_best_model(target_loss, best_loss, model, optimizer, current_step, epoc file_name = 'best_model.pth.tar' checkpoint_path = os.path.join(output_folder, file_name) print(" > BEST MODEL : {}".format(checkpoint_path)) - save_model(model, optimizer, current_step, epoch ,r, checkpoint_path, model_loss=target_loss) + save_model(model, optimizer, current_step, epoch, r, checkpoint_path, model_loss=target_loss, **kwargs) best_loss = target_loss - return best_loss \ No newline at end of file + return best_loss diff --git a/utils/radam.py b/utils/radam.py index 738aac52..4724b705 100644 --- a/utils/radam.py +++ b/utils/radam.py @@ -8,9 +8,9 @@ from torch.optim.optimizer import Optimizer, required class RAdam(Optimizer): def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0, degenerated_to_sgd=True): - if not 0.0 <= lr: + if lr < 0.0: raise ValueError("Invalid learning rate: {}".format(lr)) - if not 0.0 <= eps: + if eps < 0.0: raise ValueError("Invalid epsilon value: {}".format(eps)) if not 0.0 <= betas[0] < 1.0: raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0])) @@ -94,4 +94,4 @@ class RAdam(Optimizer): p_data_fp32.add_(exp_avg, alpha=-step_size * group['lr']) p.data.copy_(p_data_fp32) - return loss \ No newline at end of file + return loss diff --git a/utils/synthesis.py b/utils/synthesis.py index 3903ba44..ef3c1105 100644 --- a/utils/synthesis.py +++ b/utils/synthesis.py @@ -1,5 +1,5 @@ import pkg_resources -installed = {pkg.key for pkg in pkg_resources.working_set} +installed = {pkg.key for pkg in pkg_resources.working_set} #pylint: disable=not-an-iterable if 'tensorflow' in installed or 'tensorflow-gpu' in installed: import tensorflow as tf import torch @@ -7,7 +7,7 @@ import numpy as np from .text import text_to_sequence, phoneme_to_sequence -def text_to_seqvec(text, CONFIG, use_cuda): +def text_to_seqvec(text, CONFIG): text_cleaner = [CONFIG.text_cleaner] # text ot phonemes to sequence vector if CONFIG.use_phonemes: @@ -37,7 +37,7 @@ def numpy_to_tf(np_array, dtype): return tensor -def compute_style_mel(style_wav, ap, use_cuda): +def compute_style_mel(style_wav, ap): style_mel = ap.melspectrogram( ap.load_wav(style_wav)).expand_dims(0) return style_mel @@ 
-58,13 +58,13 @@ def run_model_torch(model, inputs, CONFIG, truncated, speaker_id=None, style_mel def run_model_tf(model, inputs, CONFIG, truncated, speaker_id=None, style_mel=None): - if CONFIG.use_gst: - raise NotImplemented(' [!] GST inference not implemented for TF') + if CONFIG.use_gst and style_mel is not None: + raise NotImplementedError(' [!] GST inference not implemented for TF') if truncated: - raise NotImplemented(' [!] Truncated inference not implemented for TF') + raise NotImplementedError(' [!] Truncated inference not implemented for TF') # TODO: handle multispeaker case decoder_output, postnet_output, alignments, stop_tokens = model( - inputs) + inputs, speaker_ids=speaker_id) return decoder_output, postnet_output, alignments, stop_tokens @@ -153,9 +153,9 @@ def synthesis(model, # GST processing style_mel = None if CONFIG.model == "TacotronGST" and style_wav is not None: - style_mel = compute_style_mel(style_wav, ap, use_cuda) + style_mel = compute_style_mel(style_wav, ap) # preprocess the given text - inputs = text_to_seqvec(text, CONFIG, use_cuda) + inputs = text_to_seqvec(text, CONFIG) # pass tensors to backend if backend == 'torch': speaker_id = id_to_torch(speaker_id) diff --git a/utils/training.py b/utils/training.py index bd314bc9..6739132e 100644 --- a/utils/training.py +++ b/utils/training.py @@ -9,7 +9,7 @@ def check_update(model, grad_clip, ignore_stopnet=False): grad_norm = torch.nn.utils.clip_grad_norm_([param for name, param in model.named_parameters() if 'stopnet' not in name], grad_clip) else: grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip) - if torch.isinf(grad_norm): + if np.isinf(grad_norm): print(" | > Gradient is INF !!") skip_flag = True return grad_norm, skip_flag @@ -62,6 +62,7 @@ def set_weight_decay(model, weight_decay, skip_list={"decoder.attention.v", "rnn }] +# pylint: disable=protected-access class NoamLR(torch.optim.lr_scheduler._LRScheduler): def __init__(self, optimizer, warmup_steps=0.1, last_epoch=-1): self.warmup_steps = float(warmup_steps) @@ -87,4 +88,4 @@ def gradual_training_scheduler(global_step, config): for values in config.gradual_training: if global_step * num_gpus >= values[0]: new_values = values - return new_values[1], new_values[2] \ No newline at end of file + return new_values[1], new_values[2] From 97cd39bf99cd954a865200e4777b117dbff0ac8a Mon Sep 17 00:00:00 2001 From: erogol Date: Tue, 19 May 2020 16:44:37 +0200 Subject: [PATCH 086/104] console logger fix --- utils/console_logger.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/utils/console_logger.py b/utils/console_logger.py index 5a37ac10..5c6ec75f 100644 --- a/utils/console_logger.py +++ b/utils/console_logger.py @@ -86,10 +86,10 @@ class ConsoleLogger(): sign = '+' diff = 0 if self.old_eval_loss_dict is not None: - diff = self.old_eval_loss_dict[key] - value - if diff > 0: + diff = value - self.old_eval_loss_dict[key] + if diff < 0: color = tcolors.OKGREEN - sign = '-' + sign = '' log_text += "{}{}:{} {:.5f} {}({}{:.5f})\n".format(indent, key, color, value, tcolors.ENDC, sign, diff) self.old_eval_loss_dict = avg_loss_dict print(log_text, flush=True) \ No newline at end of file From dc166b42e37a20ab950c3d39cfacb33444dca102 Mon Sep 17 00:00:00 2001 From: erogol Date: Wed, 20 May 2020 11:55:32 +0200 Subject: [PATCH 087/104] update config.json --- config.json | 16 ++++++++-------- utils/training.py | 2 +- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/config.json b/config.json index ffc8fee5..1180c12a 
100644
--- a/config.json
+++ b/config.json
@@ -34,7 +34,7 @@
     "signal_norm": true,    // normalize spec values. Mean-Var normalization if 'stats_path' is defined otherwise range normalization defined by the other params.
     "min_level_db": -100,   // lower bound for normalization
     "symmetric_norm": true, // move normalization to range [-1, 1]
-    "max_norm": 1.0,        // scale normalization to range [-max_norm, max_norm] or [0, max_norm]
+    "max_norm": 4.0,        // scale normalization to range [-max_norm, max_norm] or [0, max_norm]
     "clip_norm": true,      // clip normalized values into the range.
     "stats_path": null      // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based normalization is used and other normalization params are ignored
 },
@@ -74,15 +74,15 @@
     // OPTIMIZER
     "noam_schedule": false,        // use noam warmup and lr schedule.
-    "grad_clip": 1.0,              // upper limit for gradients for clipping.
+    "grad_clip": 1.0,              // upper limit for gradients for clipping.
     "epochs": 1000,                // total number of epochs to train.
     "lr": 0.0001,                  // Initial learning rate. If Noam decay is active, maximum learning rate.
-    "wd": 0.000001,                // Weight decay weight.
+    "wd": 0.000001,                // Weight decay weight.
     "warmup_steps": 4000,          // Noam decay steps to increase the learning rate from 0 to "lr"
-    "seq_len_norm": false,         // Normalize eash sample loss with its length to alleviate imbalanced datasets. Use it if your dataset is small or has skewed distribution of sequence lengths.
+    "seq_len_norm": false,         // Normalize each sample loss with its length to alleviate imbalanced datasets. Use it if your dataset is small or has a skewed distribution of sequence lengths.

     // TACOTRON PRENET
-    "memory_size": -1,             // ONLY TACOTRON - size of the memory queue used fro storing last decoder predictions for auto-regression. If < 0, memory queue is disabled and decoder only uses the last prediction frame.
+    "memory_size": -1,             // ONLY TACOTRON - size of the memory queue used for storing last decoder predictions for auto-regression. If < 0, memory queue is disabled and decoder only uses the last prediction frame.
     "prenet_type": "original",     // "original" or "bn".
     "prenet_dropout": true,        // enable/disable dropout at prenet.

@@ -91,15 +91,15 @@
     "attention_heads": 4,          // number of attention heads (only for 'graves')
     "attention_norm": "sigmoid",   // softmax or sigmoid. Suggested to use softmax for Tacotron2 and sigmoid for Tacotron.
     "windowing": false,            // Enables attention windowing. Used only in eval mode.
-    "use_forward_attn": false,     // if it uses forward attention. In general, it aligns faster.
+    "use_forward_attn": false,     // if it uses forward attention. In general, it aligns faster.
     "forward_attn_mask": false,    // Additional masking forcing monotonicity only in eval mode.
     "transition_agent": false,     // enable/disable transition agent of forward attention.
-    "location_attn": true,         // enable_disable location sensitive attention. It is enabled for TACOTRON by default.
+    "location_attn": true,         // enable/disable location sensitive attention. It is enabled for TACOTRON by default.
     "bidirectional_decoder": false, // use https://arxiv.org/abs/1907.09006. Use it, if attention does not work well with your dataset.

     // STOPNET
     "stopnet": true,               // Train stopnet predicting the end of synthesis.
-    "separate_stopnet": true,      // Train stopnet seperately if 'stopnet==true'. It prevents stopnet loss to influence the rest of the model. It causes a better model, but it trains SLOWER. 
+ "separate_stopnet": true, // Train stopnet seperately if 'stopnet==true'. It prevents stopnet loss to influence the rest of the model. It causes a better model, but it trains SLOWER. // TENSORBOARD and LOGGING "print_step": 25, // Number of steps to log traning on console. diff --git a/utils/training.py b/utils/training.py index 6739132e..ebf8fd13 100644 --- a/utils/training.py +++ b/utils/training.py @@ -9,7 +9,7 @@ def check_update(model, grad_clip, ignore_stopnet=False): grad_norm = torch.nn.utils.clip_grad_norm_([param for name, param in model.named_parameters() if 'stopnet' not in name], grad_clip) else: grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip) - if np.isinf(grad_norm): + if torch.isinf(grad_norm): print(" | > Gradient is INF !!") skip_flag = True return grad_norm, skip_flag From 1835628335aaf5636f96798e2e3dd2b7c7347129 Mon Sep 17 00:00:00 2001 From: erogol Date: Wed, 20 May 2020 12:25:24 +0200 Subject: [PATCH 088/104] tf conversion fixes --- tf/convert_tacotron2_torch_to_tf.py | 13 +++++++------ tf/utils/convert_torch_to_tf_utils.py | 1 + 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/tf/convert_tacotron2_torch_to_tf.py b/tf/convert_tacotron2_torch_to_tf.py index 3b57782e..b1878343 100644 --- a/tf/convert_tacotron2_torch_to_tf.py +++ b/tf/convert_tacotron2_torch_to_tf.py @@ -13,9 +13,9 @@ from fuzzywuzzy import fuzz from TTS.utils.text.symbols import phonemes, symbols from TTS.utils.generic_utils import setup_model from TTS.utils.io import load_config -from TTS_tf.models.tacotron2 import Tacotron2 -from TTS_tf.utils.convert_torch_to_tf_utils import compare_torch_tf, tf_create_dummy_inputs, transfer_weights_torch_to_tf, convert_tf_name -from TTS_tf.utils.generic_utils import save_checkpoint +from TTS.tf.models.tacotron2 import Tacotron2 +from TTS.tf.utils.convert_torch_to_tf_utils import compare_torch_tf, tf_create_dummy_inputs, transfer_weights_torch_to_tf, convert_tf_name +from TTS.tf.utils.generic_utils import save_checkpoint parser = argparse.ArgumentParser() parser.add_argument('--torch_model_path', @@ -147,21 +147,22 @@ output_tf, memory_state = model_tf.decoder.attention_rnn(inp_tf, training=False) assert compare_torch_tf(output, output_tf).mean() < 1e-5 -# compare decoder.attention query = output inputs = torch.rand([1, 128, 512]) query_tf = query.detach().numpy() inputs_tf = inputs.numpy() +# compare decoder.attention model.decoder.attention.init_states(inputs) processes_inputs = model.decoder.attention.preprocess_inputs(inputs) loc_attn, proc_query = model.decoder.attention.get_location_attention( query, processes_inputs) context = model.decoder.attention(query, inputs, processes_inputs, None) +attention_states = model_tf.decoder.build_decoder_initial_states(1, 512, 128)[-1] model_tf.decoder.attention.process_values(tf.convert_to_tensor(inputs_tf)) -loc_attn_tf, proc_query_tf = model_tf.decoder.attention.get_loc_attn(query_tf) -context_tf = model_tf.decoder.attention(query_tf, training=False) +loc_attn_tf, proc_query_tf = model_tf.decoder.attention.get_loc_attn(query_tf, attention_states) +context_tf, attention, attention_states = model_tf.decoder.attention(query_tf, attention_states, training=False) assert compare_torch_tf(loc_attn, loc_attn_tf).mean() < 1e-5 assert compare_torch_tf(proc_query, proc_query_tf).mean() < 1e-5 diff --git a/tf/utils/convert_torch_to_tf_utils.py b/tf/utils/convert_torch_to_tf_utils.py index ba7e629b..e9e1e8a3 100644 --- a/tf/utils/convert_torch_to_tf_utils.py +++ 
b/tf/utils/convert_torch_to_tf_utils.py @@ -72,6 +72,7 @@ def transfer_weights_torch_to_tf(tf_vars, var_map_dict, state_dict): numpy_weight = torch_weight.detach().cpu().numpy() assert np.all(tf_var.shape == numpy_weight.shape), f" [!] weight shapes does not match: {tf_var.name} vs {torch_var_name} --> {tf_var.shape} vs {numpy_weight.shape}" tf.keras.backend.set_value(tf_var, numpy_weight) + return tf_vars def load_tf_vars(model_tf, tf_vars): From ca359727bcc326b7008f559b796b857dbeb68ba7 Mon Sep 17 00:00:00 2001 From: erogol Date: Wed, 20 May 2020 12:30:06 +0200 Subject: [PATCH 089/104] config update and change default debug mode --- config.json | 6 +++--- train.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/config.json b/config.json index 1180c12a..03907fb0 100644 --- a/config.json +++ b/config.json @@ -118,10 +118,10 @@ "max_seq_len": 153, // DATASET-RELATED: maximum text length // PATHS - "output_path": "/data/rw/home/Trainings/", + "output_path": "/home/erogol/Models/LJSpeech/", // PHONEMES - "phoneme_cache_path": "/root/mozilla_us_phonemes_3", // phoneme computation is slow, therefore, it caches results in the given folder. + "phoneme_cache_path": "/media/erogol/data_ssd2/mozilla_us_phonemes_3", // phoneme computation is slow, therefore, it caches results in the given folder. "use_phonemes": false, // use phonemes instead of raw characters. It is suggested for better pronounciation. "phoneme_language": "en-us", // depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages @@ -135,7 +135,7 @@ [ { "name": "ljspeech", - "path": "/root/LJSpeech-1.1/", + "path": "/home/erogol/Data/LJSpeech-1.1/", "meta_file_train": "metadata.csv", "meta_file_val": null } diff --git a/train.py b/train.py index 8ff768f0..e4963ee7 100644 --- a/train.py +++ b/train.py @@ -619,7 +619,7 @@ if __name__ == '__main__': ) parser.add_argument('--debug', type=bool, - default=True, + default=False, help='Do not verify commit integrity to run training.') # DISTRUBUTED From 6ccf32c2b96ded7832223ea0a918b509ec1d1cee Mon Sep 17 00:00:00 2001 From: erogol Date: Wed, 20 May 2020 14:00:31 +0200 Subject: [PATCH 090/104] update tests --- server/synthesizer.py | 3 ++- tests/test_audio.py | 4 ++-- tests/test_demo_server.py | 2 +- tests/test_tacotron2_tf_model.py | 2 +- 4 files changed, 6 insertions(+), 5 deletions(-) diff --git a/server/synthesizer.py b/server/synthesizer.py index c0069e33..5eb7f0d4 100644 --- a/server/synthesizer.py +++ b/server/synthesizer.py @@ -7,7 +7,8 @@ import torch import yaml from TTS.utils.audio import AudioProcessor -from TTS.utils.generic_utils import load_config, setup_model +from TTS.utils.io import load_config +from TTS.utils.generic_utils import setup_model from TTS.utils.speakers import load_speaker_mapping # pylint: disable=unused-wildcard-import # pylint: disable=wildcard-import diff --git a/tests/test_audio.py b/tests/test_audio.py index 2ede77ce..4b8ee276 100644 --- a/tests/test_audio.py +++ b/tests/test_audio.py @@ -3,7 +3,7 @@ import unittest from TTS.tests import get_tests_path, get_tests_input_path, get_tests_output_path from TTS.utils.audio import AudioProcessor -from TTS.utils.generic_utils import load_config +from TTS.utils.io import load_config TESTS_PATH = get_tests_path() OUT_PATH = os.path.join(get_tests_output_path(), "audio_tests") @@ -172,4 +172,4 @@ class TestAudio(unittest.TestCase): mel_reference = self.ap.melspectrogram(wav) mel_norm = ap.melspectrogram(wav) mel_denorm = ap._denormalize(mel_norm) - assert 
abs(mel_reference - mel_denorm).max() < 1e-4 \ No newline at end of file + assert abs(mel_reference - mel_denorm).max() < 1e-4 diff --git a/tests/test_demo_server.py b/tests/test_demo_server.py index 11d16a45..65653997 100644 --- a/tests/test_demo_server.py +++ b/tests/test_demo_server.py @@ -22,7 +22,7 @@ class DemoServerTest(unittest.TestCase): num_chars = len(phonemes) if config.use_phonemes else len(symbols) model = setup_model(num_chars, 0, config) output_path = os.path.join(get_tests_output_path()) - save_checkpoint(model, None, None, None, output_path, 10, 10) + save_checkpoint(model, None, None, None, 1, output_path) def test_in_out(self): self._create_random_model() diff --git a/tests/test_tacotron2_tf_model.py b/tests/test_tacotron2_tf_model.py index bc8f0407..aca363a8 100644 --- a/tests/test_tacotron2_tf_model.py +++ b/tests/test_tacotron2_tf_model.py @@ -60,4 +60,4 @@ class TacotronTFTrainTest(unittest.TestCase): assert output[3].shape[1] == (mel_spec.shape[1] // model.decoder.r) # inference pass - output = model(input, training=False) + output = model(chars_seq, training=False) From cb9ac27b6536cf9e11e73907444b057058c2eb63 Mon Sep 17 00:00:00 2001 From: erogol Date: Wed, 20 May 2020 14:26:47 +0200 Subject: [PATCH 091/104] TTS_tf notebook update --- tf/notebooks/Benchmark-TTS_tf.ipynb | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/tf/notebooks/Benchmark-TTS_tf.ipynb b/tf/notebooks/Benchmark-TTS_tf.ipynb index c2b634e6..0464209d 100644 --- a/tf/notebooks/Benchmark-TTS_tf.ipynb +++ b/tf/notebooks/Benchmark-TTS_tf.ipynb @@ -6,18 +6,24 @@ "Collapsed": "false" }, "source": [ - "This is to test TTS models with benchmark sentences for speech synthesis.\n", + "This is to test TTS tensorflow models with benchmark sentences.\n", "\n", "Before running this script please DON'T FORGET: \n", "- to set file paths.\n", - "- to download related model files.\n", + "- to download related models.\n", + " - Sample TF model: https://www.dropbox.com/sh/3b1fat5oxqab6yn/AADDlNs-9-r7ASbVnFYx3RHHa?dl=0\n", "- download or clone related repos, linked below.\n", "- setup the repositories. ```python setup.py install```\n", "- to checkout right commit versions (given next to the model in the models page).\n", "- to set the file paths below.\n", "\n", "Repositories:\n", - "- TTS: https://github.com/mozilla/TTS" + "- TTS: https://github.com/mozilla/TTS\n", + "- PWGAN: https://github.com/erogol/ParallelWaveGAN (if you like to use a vocoder model)\n", + "\n", + "Known Issues:\n", + "- To load the model a second time you need to restart the notebook kernel. \n", + "- Some of the advanced methods are not yet implemented for Tensorflow."
] }, { @@ -63,7 +69,7 @@ "import IPython\n", "from IPython.display import Audio\n", "\n", - "%matplotlib agg" + "%matplotlib inline" ] }, { @@ -112,7 +118,7 @@ "outputs": [], "source": [ "# Set constants\n", - "ROOT_PATH = '../tf_model/'\n", + "ROOT_PATH = '../torch_model/'\n", "MODEL_PATH = ROOT_PATH + '/tts_tf_checkpoint_360000.pkl'\n", "CONFIG_PATH = ROOT_PATH + '/config.json'\n", "OUT_FOLDER = '/home/erogol/Dropbox/AudioSamples/benchmark_samples/'\n", From a8934057393b6908de32c6adcd41d3d8c860a04b Mon Sep 17 00:00:00 2001 From: erogol Date: Wed, 20 May 2020 14:26:56 +0200 Subject: [PATCH 092/104] raise not implemented for multispeaker TTS_tf inference --- utils/synthesis.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/utils/synthesis.py b/utils/synthesis.py index ef3c1105..a53c12dc 100644 --- a/utils/synthesis.py +++ b/utils/synthesis.py @@ -62,9 +62,11 @@ def run_model_tf(model, inputs, CONFIG, truncated, speaker_id=None, style_mel=No raise NotImplementedError(' [!] GST inference not implemented for TF') if truncated: raise NotImplementedError(' [!] Truncated inference not implemented for TF') + if speaker_id is not None: + raise NotImplementedError(' [!] Multi-Speaker not implemented for TF') # TODO: handle multispeaker case decoder_output, postnet_output, alignments, stop_tokens = model( - inputs, speaker_ids=speaker_id) + inputs, training=False) return decoder_output, postnet_output, alignments, stop_tokens From ed67cadf986390013fafe48998c83ac4b7bce438 Mon Sep 17 00:00:00 2001 From: erogol Date: Wed, 20 May 2020 15:06:41 +0200 Subject: [PATCH 093/104] requirements for testing --- .travis.yml | 2 +- requirements_tests.txt | 15 +++++++++++++++ 2 files changed, 16 insertions(+), 1 deletion(-) create mode 100644 requirements_tests.txt diff --git a/.travis.yml b/.travis.yml index e2f77491..645f9861 100644 --- a/.travis.yml +++ b/.travis.yml @@ -11,7 +11,7 @@ matrix: env: TEST_SUITE="lint" - name: "Unit tests" python: "3.6" - install: pip install --quiet -r requirements.txt + install: pip install --quiet -r requirements_tests.txt env: TEST_SUITE="unittest" script: ./.travis/script diff --git a/requirements_tests.txt b/requirements_tests.txt new file mode 100644 index 00000000..59c5f1b0 --- /dev/null +++ b/requirements_tests.txt @@ -0,0 +1,15 @@ +numpy>=1.14.3 +torch>=0.4.1 +tensorflow>=2.2 +librosa>=0.5.1 +Unidecode>=0.4.20 +tensorboard +tensorboardX +matplotlib +Pillow +flask +scipy +tqdm +soundfile +phonemizer +bokeh==1.4.0 From 4a6949632b120c47582384eb965a88df9742cff1 Mon Sep 17 00:00:00 2001 From: erogol Date: Wed, 20 May 2020 16:05:57 +0200 Subject: [PATCH 094/104] bug fixes and benchmark notebook update --- notebooks/Benchmark-PWGAN.ipynb | 585 ---------------------------- notebooks/Benchmark.ipynb | 546 -------------------------- speaker_encoder/tests.py | 2 +- tf/notebooks/Benchmark-TTS_tf.ipynb | 6 +- 4 files changed, 4 insertions(+), 1135 deletions(-) delete mode 100644 notebooks/Benchmark-PWGAN.ipynb delete mode 100644 notebooks/Benchmark.ipynb diff --git a/notebooks/Benchmark-PWGAN.ipynb b/notebooks/Benchmark-PWGAN.ipynb deleted file mode 100644 index 082ffa60..00000000 --- a/notebooks/Benchmark-PWGAN.ipynb +++ /dev/null @@ -1,585 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This is to test TTS models with benchmark sentences for speech synthesis.\n", - "\n", - "Before running this script please DON'T FORGET: \n", - "- to set file paths.\n", - "- to download related model files from TTS and PWGAN.\n", - "- 
download or clone related repos, linked below.\n", - "- setup the repositories. ```python setup.py install```\n", - "- to checkout right commit versions (given next to the model) of TTS and PWGAN.\n", - "- to set the right paths in the cell below.\n", - "\n", - "Repositories:\n", - "- TTS: https://github.com/mozilla/TTS\n", - "- PWGAN: https://github.com/erogol/ParallelWaveGAN" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "%load_ext autoreload\n", - "%autoreload 2\n", - "import os\n", - "import sys\n", - "import io\n", - "import torch \n", - "import time\n", - "import json\n", - "import yaml\n", - "import numpy as np\n", - "from collections import OrderedDict\n", - "import matplotlib.pyplot as plt\n", - "plt.rcParams[\"figure.figsize\"] = (16,5)\n", - "\n", - "import librosa\n", - "import librosa.display\n", - "\n", - "from TTS.models.tacotron import Tacotron \n", - "from TTS.layers import *\n", - "from TTS.utils.data import *\n", - "from TTS.utils.audio import AudioProcessor\n", - "from TTS.utils.generic_utils import load_config, setup_model\n", - "from TTS.utils.text import text_to_sequence\n", - "from TTS.utils.synthesis import synthesis\n", - "from TTS.utils.visual import visualize\n", - "\n", - "import IPython\n", - "from IPython.display import Audio\n", - "\n", - "import os\n", - "\n", - "# you may need to change this depending on your system\n", - "os.environ['CUDA_VISIBLE_DEVICES']='1'\n", - "%matplotlib inline" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def tts(model, text, CONFIG, use_cuda, ap, use_gl, figures=True):\n", - " t_1 = time.time()\n", - " waveform, alignment, mel_spec, mel_postnet_spec, stop_tokens = synthesis(model, text, CONFIG, use_cuda, ap, speaker_id, False, CONFIG.enable_eos_bos_chars)\n", - " if CONFIG.model == \"Tacotron\" and not use_gl:\n", - " # coorect the normalization differences b/w TTS and the Vocoder.\n", - " mel_postnet_spec = ap.out_linear_to_mel(mel_postnet_spec.T).T\n", - " mel_postnet_spec = ap._denormalize(mel_postnet_spec)\n", - "# mel_postnet_spec = np.pad(mel_postnet_spec, pad_width=((2, 2), (0, 0)))\n", - " print(mel_postnet_spec.shape)\n", - " print(\"max- \", mel_postnet_spec.max(), \" -- min- \", mel_postnet_spec.min())\n", - " if not use_gl:\n", - " waveform = vocoder_model.inference(torch.FloatTensor(ap_vocoder._normalize(mel_postnet_spec).T).unsqueeze(0), hop_size=ap_vocoder.hop_length)\n", - "# waveform = waveform / abs(waveform).max() * 0.9\n", - " if use_cuda:\n", - " waveform = waveform.cpu()\n", - " waveform = waveform.numpy()\n", - " rtf = (time.time() - t_1) / (len(waveform) / ap.sample_rate)\n", - " print(waveform.shape)\n", - " print(\" > Run-time: {}\".format(time.time() - t_1))\n", - " print(\" > Real-time factor: {}\".format(rtf))\n", - " if figures: \n", - " visualize(alignment, mel_postnet_spec, stop_tokens, text, ap.hop_length, CONFIG, ap._denormalize(mel_spec)) \n", - " IPython.display.display(Audio(waveform, rate=CONFIG.audio['sample_rate'], normalize=False)) \n", - " os.makedirs(OUT_FOLDER, exist_ok=True)\n", - " file_name = text.replace(\" \", \"_\").replace(\".\",\"\") + \".wav\"\n", - " out_path = os.path.join(OUT_FOLDER, file_name)\n", - " ap.save_wav(waveform, out_path)\n", - " return alignment, mel_postnet_spec, stop_tokens, waveform" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# 
Set constants\n", - "ROOT_PATH = '/home/erogol/Models/LJSpeech/ljspeech-bn-December-23-2019_08+34AM-ffea133/'\n", - "MODEL_PATH = ROOT_PATH + '/checkpoint_670000.pth.tar'\n", - "CONFIG_PATH = ROOT_PATH + '/config.json'\n", - "OUT_FOLDER = '/home/erogol/Dropbox/AudioSamples/benchmark_samples/'\n", - "CONFIG = load_config(CONFIG_PATH)\n", - "VOCODER_MODEL_PATH = \"/home/erogol/Models/LJSpeech/pwgan-ljspeech/checkpoint-400000steps.pkl\"\n", - "VOCODER_CONFIG_PATH = \"/home/erogol/Models/LJSpeech/pwgan-ljspeech/config.yml\"\n", - "\n", - "# load PWGAN config\n", - "with open(VOCODER_CONFIG_PATH) as f:\n", - " VOCODER_CONFIG = yaml.load(f, Loader=yaml.Loader)\n", - " \n", - "# Run FLAGs\n", - "use_cuda = False\n", - "# Set some config fields manually for testing\n", - "CONFIG.windowing = True\n", - "CONFIG.use_forward_attn = True \n", - "# Set the vocoder\n", - "use_gl = False # use GL if True\n", - "batched_wavernn = True # use batched wavernn inference if True" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# LOAD TTS MODEL\n", - "from TTS.utils.text.symbols import make_symbols, symbols, phonemes\n", - "\n", - "# multi speaker \n", - "if CONFIG.use_speaker_embedding:\n", - " speakers = json.load(open(f\"{ROOT_PATH}/speakers.json\", 'r'))\n", - " speakers_idx_to_id = {v: k for k, v in speakers.items()}\n", - "else:\n", - " speakers = []\n", - " speaker_id = None\n", - "\n", - "# if the vocabulary was passed, replace the default\n", - "if 'characters' in CONFIG.keys():\n", - " symbols, phonemes = make_symbols(**CONFIG.characters)\n", - "\n", - "# load the model\n", - "num_chars = len(phonemes) if CONFIG.use_phonemes else len(symbols)\n", - "model = setup_model(num_chars, len(speakers), CONFIG)\n", - "\n", - "# load the audio processor\n", - "ap = AudioProcessor(**CONFIG.audio) \n", - "\n", - "\n", - "# load model state\n", - "cp = torch.load(MODEL_PATH, map_location=torch.device('cpu'))\n", - "\n", - "# load the model\n", - "model.load_state_dict(cp['model'])\n", - "if use_cuda:\n", - " model.cuda()\n", - "model.eval()\n", - "print(cp['step'])\n", - "print(cp['r'])\n", - "\n", - "# set model stepsize\n", - "if 'r' in cp:\n", - " model.decoder.set_r(cp['r'])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# LOAD WAVERNN\n", - "if use_gl == False:\n", - " from parallel_wavegan.models import ParallelWaveGANGenerator\n", - " from parallel_wavegan.utils.audio import AudioProcessor as AudioProcessorVocoder\n", - " \n", - " vocoder_model = ParallelWaveGANGenerator(**VOCODER_CONFIG[\"generator_params\"])\n", - " vocoder_model.load_state_dict(torch.load(VOCODER_MODEL_PATH, map_location=\"cpu\")[\"model\"][\"generator\"])\n", - " vocoder_model.remove_weight_norm()\n", - " ap_vocoder = AudioProcessorVocoder(**VOCODER_CONFIG['audio']) \n", - " if use_cuda:\n", - " vocoder_model.cuda()\n", - " vocoder_model.eval();" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Comparision with https://mycroft.ai/blog/available-voices/" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "model.eval()\n", - "model.decoder.max_decoder_steps = 2000\n", - "model.decoder.prenet.eval()\n", - "speaker_id = None\n", - "sentence = '''A breeding jennet, lusty, young, and proud,'''\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - 
"cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sentence = \"Bill got in the habit of asking himself “Is that thought true?” and if he wasn’t absolutely certain it was, he just let it go.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### https://espnet.github.io/icassp2020-tts/" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sentence = \"The Commission also recommends\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sentence = \"As a result of these studies, the planning document submitted by the Secretary of the Treasury to the Bureau of the Budget on August thirty-one.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sentence = \"The FBI now transmits information on all defectors, a category which would, of course, have included Oswald.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sentence = \"they seem unduly restrictive in continuing to require some manifestation of animus against a Government official.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sentence = \"and each agency given clear understanding of the assistance which the Secret Service expects.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Other examples" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sentence = \"Be a voice, not an echo.\" # 'echo' is not in training set. \n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sentence = \"The human voice is the most perfect instrument of all.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sentence = \"I'm sorry Dave. I'm afraid I can't do that.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sentence = \"This cake is great. 
It's so delicious and moist.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Comparison with https://keithito.github.io/audio-samples/" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sentence = \"Generative adversarial network or variational auto-encoder.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sentence = \"Scientists at the CERN laboratory say they have discovered a new particle.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sentence = \"Here’s a way to measure the acute emotional intelligence that has never gone out of style.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sentence = \"President Trump met with other leaders at the Group of 20 conference.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sentence = \"The buses aren't the problem, they actually provide a solution.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Comparison with https://google.github.io/tacotron/publications/tacotron/index.html" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sentence = \"Generative adversarial network or variational auto-encoder.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sentence = \"Basilar membrane and otolaryngology are not auto-correlations.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sentence = \" He has read the whole thing.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sentence = \"He reads books.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sentence = \"Thisss isrealy awhsome.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sentence = \"This is 
your internet browser, Firefox.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sentence = \"This is your internet browser Firefox.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sentence = \"The quick brown fox jumps over the lazy dog.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sentence = \"Does the quick brown fox jump over the lazy dog?\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sentence = \"Eren, how are you?\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Hard Sentences" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sentence = \"Encouraged, he started with a minute a day.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sentence = \"His meditation consisted of “body scanning” which involved focusing his mind and energy on each section of the body from head to toe .\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sentence = \"Recent research at Harvard has shown meditating for as little as 8 weeks can actually increase the grey matter in the parts of the brain responsible for emotional regulation and learning . 
\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sentence = \"If he decided to watch TV he really watched it.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sentence = \"Often we try to bring about change through sheer effort and we put all of our energy into a new initiative .\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# for twb dataset\n", - "sentence = \"In our preparation for Easter, God in his providence offers us each year the season of Lent as a sacramental sign of our conversion.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.4" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/notebooks/Benchmark.ipynb b/notebooks/Benchmark.ipynb deleted file mode 100644 index 7d3a45cf..00000000 --- a/notebooks/Benchmark.ipynb +++ /dev/null @@ -1,546 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This is to test TTS models with benchmark sentences for speech synthesis.\n", - "\n", - "Before running this script please DON'T FORGET: \n", - "- to set file paths.\n", - "- to download related model files from TTS and WaveRNN.\n", - "- to checkout right commit versions (given next to the model) of TTS and WaveRNN.\n", - "- to set the right paths in the cell below.\n", - "\n", - "Repositories:\n", - "- TTS: https://github.com/mozilla/TTS\n", - "- WaveRNN: https://github.com/erogol/WaveRNN" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "TTS_PATH = \"/home/erogol/projects/\"\n", - "WAVERNN_PATH =\"/home/erogol/projects/\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "%load_ext autoreload\n", - "%autoreload 2\n", - "import os\n", - "import sys\n", - "import io\n", - "import torch \n", - "import time\n", - "import json\n", - "import numpy as np\n", - "from collections import OrderedDict\n", - "from matplotlib import pylab as plt\n", - "\n", - "%pylab inline\n", - "rcParams[\"figure.figsize\"] = (16,5)\n", - "\n", - "# add libraries into environment\n", - "sys.path.append(TTS_PATH) # set this if TTS is not installed globally\n", - "sys.path.append(WAVERNN_PATH) # set this if TTS is not installed globally\n", - "\n", - "import librosa\n", - "import librosa.display\n", - "\n", - "from TTS.models.tacotron import Tacotron \n", - "from TTS.layers import *\n", - "from TTS.utils.data import *\n", - "from TTS.utils.audio import AudioProcessor\n", - "from TTS.utils.generic_utils import load_config, setup_model\n", - "from 
TTS.utils.text import text_to_sequence\n", - "from TTS.utils.synthesis import synthesis\n", - "from TTS.utils.visual import visualize\n", - "from TTS.utils.text.symbols import make_symbols, symbols, phonemes\n", - "\n", - "import IPython\n", - "from IPython.display import Audio\n", - "\n", - "import os\n", - "os.environ['CUDA_VISIBLE_DEVICES']='1'" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def tts(model, text, CONFIG, use_cuda, ap, use_gl, figures=True):\n", - " t_1 = time.time()\n", - " waveform, alignment, mel_spec, mel_postnet_spec, stop_tokens = synthesis(model, text, CONFIG, use_cuda, ap, speaker_id, style_wav=None, \n", - " truncated=False, enable_eos_bos_chars=CONFIG.enable_eos_bos_chars,\n", - " use_griffin_lim=use_gl)\n", - " if CONFIG.model == \"Tacotron\" and not use_gl:\n", - " # coorect the normalization differences b/w TTS and the Vocoder.\n", - " mel_postnet_spec = ap.out_linear_to_mel(mel_postnet_spec.T).T\n", - " if not use_gl:\n", - " mel_postnet_spec = ap._denormalize(mel_postnet_spec)\n", - " mel_postnet_spec = ap_vocoder._normalize(mel_postnet_spec)\n", - " waveform = wavernn.generate(torch.FloatTensor(mel_postnet_spec.T).unsqueeze(0).cuda(), batched=batched_wavernn, target=8000, overlap=400)\n", - "\n", - " print(\" > Run-time: {}\".format(time.time() - t_1))\n", - " if figures: \n", - " visualize(alignment, mel_postnet_spec, stop_tokens, text, ap.hop_length, CONFIG, mel_spec) \n", - " IPython.display.display(Audio(waveform, rate=CONFIG.audio['sample_rate'])) \n", - " os.makedirs(OUT_FOLDER, exist_ok=True)\n", - " file_name = text.replace(\" \", \"_\").replace(\".\",\"\") + \".wav\"\n", - " out_path = os.path.join(OUT_FOLDER, file_name)\n", - " ap.save_wav(waveform, out_path)\n", - " return alignment, mel_postnet_spec, stop_tokens, waveform" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Set constants\n", - "ROOT_PATH = '/home/erogol/Models/LJSpeech/ljspeech-bn-December-23-2019_08+34AM-ffea133/'\n", - "MODEL_PATH = ROOT_PATH + '/best_model.pth.tar'\n", - "CONFIG_PATH = ROOT_PATH + '/config.json'\n", - "OUT_FOLDER = '/home/erogol/Dropbox/AudioSamples/benchmark_samples/'\n", - "CONFIG = load_config(CONFIG_PATH)\n", - "VOCODER_MODEL_PATH = \"/media/erogol/data_ssd/Models/wavernn/ljspeech/mold_ljspeech_best_model/checkpoint_433000.pth.tar\"\n", - "VOCODER_CONFIG_PATH = \"/media/erogol/data_ssd/Models/wavernn/ljspeech/mold_ljspeech_best_model/config.json\"\n", - "VOCODER_CONFIG = load_config(VOCODER_CONFIG_PATH)\n", - "use_cuda = True\n", - "\n", - "# Set some config fields manually for testing\n", - "# CONFIG.windowing = False\n", - "# CONFIG.prenet_dropout = False\n", - "# CONFIG.separate_stopnet = True\n", - "CONFIG.use_forward_attn = True\n", - "# CONFIG.forward_attn_mask = True\n", - "# CONFIG.stopnet = True\n", - "\n", - "# Set the vocoder\n", - "use_gl = True # use GL if True\n", - "batched_wavernn = True # use batched wavernn inference if True" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# LOAD TTS MODEL\n", - "# multi speaker \n", - "if CONFIG.use_speaker_embedding:\n", - " speakers = json.load(open(f\"{ROOT_PATH}/speakers.json\", 'r'))\n", - " speakers_idx_to_id = {v: k for k, v in speakers.items()}\n", - "else:\n", - " speakers = []\n", - " speaker_id = None\n", - "\n", - "# if the vocabulary was passed, replace the default\n", - "if 'characters' in 
CONFIG.keys():\n", - " symbols, phonemes = make_symbols(**CONFIG.characters)\n", - "\n", - "# load the model\n", - "num_chars = len(phonemes) if CONFIG.use_phonemes else len(symbols)\n", - "model = setup_model(num_chars, len(speakers), CONFIG)\n", - "\n", - "# load the audio processor\n", - "ap = AudioProcessor(**CONFIG.audio) \n", - "\n", - "\n", - "# load model state\n", - "if use_cuda:\n", - " cp = torch.load(MODEL_PATH)\n", - "else:\n", - " cp = torch.load(MODEL_PATH, map_location=lambda storage, loc: storage)\n", - "\n", - "# load the model\n", - "model.load_state_dict(cp['model'])\n", - "if use_cuda:\n", - " model.cuda()\n", - "model.eval()\n", - "print(cp['step'])\n", - "print(cp['r'])\n", - "\n", - "# set model stepsize\n", - "if 'r' in cp:\n", - " model.decoder.set_r(cp['r'])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# LOAD WAVERNN - Make sure you downloaded the model and installed the module\n", - "if use_gl == False:\n", - " from WaveRNN.models.wavernn import Model\n", - " from WaveRNN.utils.audio import AudioProcessor as AudioProcessorVocoder\n", - " bits = 10\n", - " ap_vocoder = AudioProcessorVocoder(**VOCODER_CONFIG.audio) \n", - " wavernn = Model(\n", - " rnn_dims=512,\n", - " fc_dims=512,\n", - " mode=VOCODER_CONFIG.mode,\n", - " mulaw=VOCODER_CONFIG.mulaw,\n", - " pad=VOCODER_CONFIG.pad,\n", - " upsample_factors=VOCODER_CONFIG.upsample_factors,\n", - " feat_dims=VOCODER_CONFIG.audio[\"num_mels\"],\n", - " compute_dims=128,\n", - " res_out_dims=128,\n", - " res_blocks=10,\n", - " hop_length=ap_vocoder.hop_length,\n", - " sample_rate=ap_vocoder.sample_rate,\n", - " use_upsample_net = True,\n", - " use_aux_net = True\n", - " ).cuda()\n", - "\n", - " check = torch.load(VOCODER_MODEL_PATH)\n", - " wavernn.load_state_dict(check['model'], strict=False)\n", - " if use_cuda:\n", - " wavernn.cuda()\n", - " wavernn.eval();\n", - " print(check['step'])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Comparision with https://mycroft.ai/blog/available-voices/" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "model.eval()\n", - "model.decoder.max_decoder_steps = 2000\n", - "speaker_id = None\n", - "sentence = \"Bill got in the habit of asking himself “Is that thought true?” and if he wasn’t absolutely certain it was, he just let it go.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "model.eval()\n", - "model.decoder.max_decoder_steps = 2000\n", - "sentence = \"Bill got in the habit of asking himself “Is that thought true?” and if he wasn’t absolutely certain it was, he just let it go.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sentence = \"Be a voice, not an echo.\" # 'echo' is not in training set. 
\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sentence = \"The human voice is the most perfect instrument of all.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sentence = \"I'm sorry Dave. I'm afraid I can't do that.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sentence = \"This cake is great. It's so delicious and moist.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Comparison with https://keithito.github.io/audio-samples/" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sentence = \"Generative adversarial network or variational auto-encoder.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sentence = \"Scientists at the CERN laboratory say they have discovered a new particle.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sentence = \"Here’s a way to measure the acute emotional intelligence that has never gone out of style.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sentence = \"President Trump met with other leaders at the Group of 20 conference.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sentence = \"The buses aren't the problem, they actually provide a solution.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Comparison with https://google.github.io/tacotron/publications/tacotron/index.html" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sentence = \"Generative adversarial network or variational auto-encoder.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sentence = \"Basilar membrane and otolaryngology are not auto-correlations.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - 
"outputs": [], - "source": [ - "sentence = \" He has read the whole thing.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sentence = \"He reads books.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sentence = \"Thisss isrealy awhsome.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sentence = \"This is your internet browser, Firefox.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sentence = \"This is your internet browser Firefox.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sentence = \"The quick brown fox jumps over the lazy dog.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sentence = \"Does the quick brown fox jump over the lazy dog?\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sentence = \"Eren, how are you?\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Hard Sentences" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sentence = \"Encouraged, he started with a minute a day.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sentence = \"His meditation consisted of “body scanning” which involved focusing his mind and energy on each section of the body from head to toe .\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sentence = \"Recent research at Harvard has shown meditating for as little as 8 weeks can actually increase the grey matter in the parts of the brain responsible for emotional regulation and learning . 
\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sentence = \"If he decided to watch TV he really watched it.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sentence = \"Often we try to bring about change through sheer effort and we put all of our energy into a new initiative .\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# for twb dataset\n", - "sentence = \"In our preparation for Easter, God in his providence offers us each year the season of Lent as a sacramental sign of our conversion.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.4" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/speaker_encoder/tests.py b/speaker_encoder/tests.py index 220ba360..039833fc 100644 --- a/speaker_encoder/tests.py +++ b/speaker_encoder/tests.py @@ -4,7 +4,7 @@ import torch as T from TTS.speaker_encoder.model import SpeakerEncoder from TTS.speaker_encoder.loss import GE2ELoss -from TTS.utils.generic_utils import load_config +from TTS.utils.io import load_config file_path = os.path.dirname(os.path.realpath(__file__)) + "/../tests/" diff --git a/tf/notebooks/Benchmark-TTS_tf.ipynb b/tf/notebooks/Benchmark-TTS_tf.ipynb index 0464209d..4a21ae17 100644 --- a/tf/notebooks/Benchmark-TTS_tf.ipynb +++ b/tf/notebooks/Benchmark-TTS_tf.ipynb @@ -124,10 +124,10 @@ "OUT_FOLDER = '/home/erogol/Dropbox/AudioSamples/benchmark_samples/'\n", "CONFIG = load_config(CONFIG_PATH)\n", "# Run FLAGs\n", - "use_cuda = True\n", + "use_cuda = True # use the available GPU (only for torch)\n", "# Set the vocoder\n", - "use_gl = True # use GL if True\n", - "BACKEND = 'tf'" + "use_gl = True # use GL if True\n", + "BACKEND = 'tf' # set the backend for inference " ] }, { From ddd7de643975d2c5bd63d87595fa37fc608bfbff Mon Sep 17 00:00:00 2001 From: erogol Date: Wed, 20 May 2020 16:12:10 +0200 Subject: [PATCH 095/104] update notebook --- notebooks/TestAttention.ipynb | 30 ++++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/notebooks/TestAttention.ipynb b/notebooks/TestAttention.ipynb index b350b070..92b1d6c4 100644 --- a/notebooks/TestAttention.ipynb +++ b/notebooks/TestAttention.ipynb @@ -2,15 +2,22 @@ "cells": [ { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "Collapsed": "false" + }, "source": [ - "This notebook is to test attention performance on hard sentences taken from DeepVoice paper." 
+ "This notebook is to test attention performance of a TTS model on a list of sentences taken from DeepVoice paper.\n", + "### Features of this notebook\n", + "- You can see visually how your model performs on each sentence and try to dicern common problems.\n", + "- At the end, final attention score would be printed showing the ultimate performace of your model. You can use this value to perform model selection.\n", + "- You can change the list of sentences byt providing a different sentence file." ] }, { "cell_type": "code", "execution_count": null, "metadata": { + "Collapsed": "false", "scrolled": true }, "outputs": [], @@ -31,7 +38,8 @@ "\n", "from TTS.layers import *\n", "from TTS.utils.audio import AudioProcessor\n", - "from TTS.utils.generic_utils import load_config, setup_model\n", + "from TTS.utils.generic_utils import setup_model\n", + "from TTS.utils.io import load_config\n", "from TTS.utils.text import text_to_sequence\n", "from TTS.utils.synthesis import synthesis\n", "from TTS.utils.visual import plot_alignment\n", @@ -45,7 +53,7 @@ "def tts(model, text, CONFIG, use_cuda, ap):\n", " t_1 = time.time()\n", " # run the model\n", - " waveform, alignment, mel_spec, mel_postnet_spec, stop_tokens = synthesis(model, text, CONFIG, use_cuda, ap, speaker_id, None, False, CONFIG.enable_eos_bos_chars, True)\n", + " waveform, alignment, mel_spec, mel_postnet_spec, stop_tokens, inputs = synthesis(model, text, CONFIG, use_cuda, ap, speaker_id, None, False, CONFIG.enable_eos_bos_chars, True)\n", " if CONFIG.model == \"Tacotron\" and not use_gl:\n", " mel_postnet_spec = ap.out_linear_to_mel(mel_postnet_spec.T).T\n", " # plotting\n", @@ -62,7 +70,7 @@ " return attn_score\n", "\n", "# Set constants\n", - "ROOT_PATH = '/home/erogol/Models/LJSpeech/ljspeech-March-17-2020_01+16AM-871588c/'\n", + "ROOT_PATH = '/home/erogol/Models/LJSpeech/ljspeech-May-20-2020_12+29PM-1835628/'\n", "MODEL_PATH = ROOT_PATH + '/best_model.pth.tar'\n", "CONFIG_PATH = ROOT_PATH + '/config.json'\n", "OUT_FOLDER = './hard_sentences/'\n", @@ -82,7 +90,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "Collapsed": "false" + }, "outputs": [], "source": [ "# LOAD TTS MODEL\n", @@ -130,7 +140,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "Collapsed": "false" + }, "outputs": [], "source": [ "model.decoder.max_decoder_steps=3000\n", @@ -144,7 +156,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "Collapsed": "false" + }, "outputs": [], "source": [ "np.mean(attn_scores)" From 263dc2f7ce067ad5535f779f9ff2d0bad2cb4a2f Mon Sep 17 00:00:00 2001 From: erogol Date: Wed, 20 May 2020 16:27:48 +0200 Subject: [PATCH 096/104] fix weird lint problem but not caring --- tests/test_demo_server.py | 2 +- utils/audio.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_demo_server.py b/tests/test_demo_server.py index 65653997..51cbf341 100644 --- a/tests/test_demo_server.py +++ b/tests/test_demo_server.py @@ -22,7 +22,7 @@ class DemoServerTest(unittest.TestCase): num_chars = len(phonemes) if config.use_phonemes else len(symbols) model = setup_model(num_chars, 0, config) output_path = os.path.join(get_tests_output_path()) - save_checkpoint(model, None, None, None, 1, output_path) + save_checkpoint(model, None, 10, 10, 1, output_path) def test_in_out(self): self._create_random_model() diff --git a/utils/audio.py b/utils/audio.py index 413b6163..13eab3d6 100644 --- a/utils/audio.py +++ 
b/utils/audio.py @@ -157,7 +157,7 @@ class AudioProcessor(object): ### Mean-STD scaling ### def load_stats(self, stats_path): - stats = np.load(stats_path, allow_pickle=True).item() + stats = np.load(stats_path, allow_pickle=True).item() #pylint: disable=unexpected-keyword-arg mel_mean = stats['mel_mean'] mel_std = stats['mel_std'] linear_mean = stats['linear_mean'] From c4e688e06782545f1153aaa6a88b911c67edbe27 Mon Sep 17 00:00:00 2001 From: erogol Date: Wed, 20 May 2020 16:44:52 +0200 Subject: [PATCH 097/104] update README --- README.md | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 19d7fa24..4adae507 100644 --- a/README.md +++ b/README.md @@ -14,11 +14,18 @@ If you are new, you can also find [here](http://www.erogol.com/text-speech-deep- [Details...](https://github.com/mozilla/TTS/wiki/Mean-Opinion-Score-Results) -## Utilities under this Project -- Deep Learning based Text2Speech model. -- ```dataset_analysis```: Tools to curate a Text2Speech dataset. -- ```speaker_encoder```: Speaker Encoder model computing embedding vectors for voice files. -- ```server```: Basic server implementation with packaging. +## Features +- High-performance Text2Speech models on Torch and Tensorflow 2.0. +- High-performance Speaker Encoder to compute speaker embeddings efficiently. +- Integration with various Neural Vocoders (PWGAN, MelGAN, WaveRNN). +- Released trained models. +- Efficient training code for PyTorch (soon for Tensorflow 2.0). +- Code to convert Torch models to Tensorflow 2.0. +- Detailed training analysis on console and Tensorboard. +- Tools to curate Text2Speech datasets under ```dataset_analysis```. +- Demo server for model testing. +- Notebooks for extensive model benchmarking. +- Modular (but not too much) code base enabling easy testing for new ideas. ## Requirements and Installation Highly recommended to use [miniconda](https://conda.io/miniconda.html) for easier installation.
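A note on the `load_stats` change in PATCH 096 above: `np.load` returns a 0-d object array when it unpickles a dict, so `.item()` is needed to unwrap it before indexing. Below is a minimal sketch of how such a stats file can drive mean-variance spectrogram scaling. The key names follow the ones visible in the diff (`mel_mean`, `mel_std`, `linear_mean`); the helper functions themselves are illustrative assumptions, not the project's API.

```python
import numpy as np

def load_mel_stats(stats_path):
    # np.load yields a 0-d object array for pickled dicts; .item() unwraps it
    stats = np.load(stats_path, allow_pickle=True).item()  # pylint: disable=unexpected-keyword-arg
    return stats['mel_mean'], stats['mel_std']

def normalize_mel(mel, mel_mean, mel_std):
    # mel: (num_mels, T); stats: (num_mels,) -> broadcast across time frames
    return (mel - mel_mean[:, None]) / (mel_std[:, None] + 1e-8)

def denormalize_mel(mel_norm, mel_mean, mel_std):
    # exact inverse of normalize_mel (up to the epsilon guard)
    return mel_norm * (mel_std[:, None] + 1e-8) + mel_mean[:, None]
```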
From fb89b6b4ff95f8d01900173e3779b7fb0fe8a4bd Mon Sep 17 00:00:00 2001
From: erogol
Date: Wed, 20 May 2020 16:55:57 +0200
Subject: [PATCH 098/104] server fix

---
 server/synthesizer.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/server/synthesizer.py b/server/synthesizer.py
index 5eb7f0d4..765aab8a 100644
--- a/server/synthesizer.py
+++ b/server/synthesizer.py
@@ -175,13 +175,15 @@ class Synthesizer(object):
 
         for sen in sens:
             # preprocess the given text
-            inputs = text_to_seqvec(sen, self.tts_config, self.use_cuda)
+            inputs = text_to_seqvec(sen, self.tts_config)
+            inputs = numpy_to_torch(inputs, torch.long, cuda=self.use_cuda)
+            inputs = inputs.unsqueeze(0)
             # synthesize voice
-            decoder_output, postnet_output, alignments, _ = run_model_torch(
+            decoder_output, postnet_output, alignments, stop_tokens = run_model_torch(
                 self.tts_model, inputs, self.tts_config, False, speaker_id, None)
             # convert outputs to numpy
-            postnet_output, decoder_output, _ = parse_outputs(
-                postnet_output, decoder_output, alignments)
+            postnet_output, decoder_output, _, _ = parse_outputs_torch(
+                postnet_output, decoder_output, alignments, stop_tokens)
 
             if self.pwgan:
                 vocoder_input = torch.FloatTensor(postnet_output.T).unsqueeze(0)

From 6b6ba4d420f8cc49f9254938b7b8835dcf2351e8 Mon Sep 17 00:00:00 2001
From: erogol
Date: Wed, 20 May 2020 17:06:54 +0200
Subject: [PATCH 099/104] update requirements

---
 requirements.txt       | 2 +-
 requirements_tests.txt | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index 47fa1ec0..5f31db70 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,4 @@
-numpy>=1.14.3
+numpy>=1.16.0
 torch>=0.4.1
 librosa>=0.5.1
 Unidecode>=0.4.20
diff --git a/requirements_tests.txt b/requirements_tests.txt
index 59c5f1b0..1e0615b2 100644
--- a/requirements_tests.txt
+++ b/requirements_tests.txt
@@ -1,4 +1,4 @@
-numpy>=1.14.3
+numpy>=1.16.0
 torch>=0.4.1
 tensorflow>=2.2
 librosa>=0.5.1

From 1a0d8c155854abed6b26069b89b35d7b5087a47b Mon Sep 17 00:00:00 2001
From: thllwg
Date: Wed, 20 May 2020 17:28:50 +0200
Subject: [PATCH 100/104] fix: wrong script referenced

---
 server/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/server/README.md b/server/README.md
index 0563ef94..3c65c961 100644
--- a/server/README.md
+++ b/server/README.md
@@ -7,7 +7,7 @@ Instructions below are based on a Ubuntu 18.04 machine, but it should be simple
 
 #### Development server:
 ##### Using server.py
-If you have the environment set already for TTS, then you can directly call ```setup.py```.
+If you have the environment set already for TTS, then you can directly call ```server.py```.
 
 ##### Using .whl
 1. apt-get install -y espeak libsndfile1 python3-venv

From 5a99986e86a7b7dddeef7692cb2ca83a237cc2e3 Mon Sep 17 00:00:00 2001
From: erogol
Date: Wed, 20 May 2020 17:30:25 +0200
Subject: [PATCH 101/104] update numpy setup.py

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index f92dac8a..a1987a60 100644
--- a/setup.py
+++ b/setup.py
@@ -93,7 +93,7 @@ setup(
     install_requires=[
         "scipy>=0.19.0",
         "torch>=0.4.1",
-        "numpy==1.15.4",
+        "numpy>=1.16.0",
         "librosa==0.6.2",
         "unidecode==0.4.20",
         "attrdict",

From 7931a106e234430c449a42c4e2032b9f0e5098ea Mon Sep 17 00:00:00 2001
From: erogol
Date: Wed, 20 May 2020 17:41:55 +0200
Subject: [PATCH 102/104] update version

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index a1987a60..5e89723b 100644
--- a/setup.py
+++ b/setup.py
@@ -19,7 +19,7 @@ args, unknown_args = parser.parse_known_args()
 # Remove our arguments from argv so that setuptools doesn't see them
 sys.argv = [sys.argv[0]] + unknown_args
 
-version = '0.0.1'
+version = '0.0.2'
 
 # Adapted from https://github.com/pytorch/pytorch
 cwd = os.path.dirname(os.path.abspath(__file__))

From 2a071a75055b5b266fe982602585be35d2667ad0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eren=20G=C3=B6lge?=
Date: Wed, 20 May 2020 17:51:42 +0200
Subject: [PATCH 103/104] Update README.md

---
 README.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/README.md b/README.md
index 4adae507..6a90ffbf 100644
--- a/README.md
+++ b/README.md
@@ -9,6 +9,8 @@ TTS includes two different model implementations which are based on [Tacotron](h
 
 If you are new, you can also find [here](http://www.erogol.com/text-speech-deep-learning-architectures/) a brief post about TTS architectures and their comparisons.
 
+[![](https://sourcerer.io/fame/erogol/erogol/TTS/images/0)](https://sourcerer.io/fame/erogol/erogol/TTS/links/0)[![](https://sourcerer.io/fame/erogol/erogol/TTS/images/1)](https://sourcerer.io/fame/erogol/erogol/TTS/links/1)[![](https://sourcerer.io/fame/erogol/erogol/TTS/images/2)](https://sourcerer.io/fame/erogol/erogol/TTS/links/2)[![](https://sourcerer.io/fame/erogol/erogol/TTS/images/3)](https://sourcerer.io/fame/erogol/erogol/TTS/links/3)[![](https://sourcerer.io/fame/erogol/erogol/TTS/images/4)](https://sourcerer.io/fame/erogol/erogol/TTS/links/4)[![](https://sourcerer.io/fame/erogol/erogol/TTS/images/5)](https://sourcerer.io/fame/erogol/erogol/TTS/links/5)[![](https://sourcerer.io/fame/erogol/erogol/TTS/images/6)](https://sourcerer.io/fame/erogol/erogol/TTS/links/6)[![](https://sourcerer.io/fame/erogol/erogol/TTS/images/7)](https://sourcerer.io/fame/erogol/erogol/TTS/links/7)
+
 ## TTS Performance
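Aside on the server fix in PATCH 098: it replaces one opaque call with three explicit steps (build the id sequence, convert to a torch tensor, add a batch dimension). A rough, self-contained sketch of that flow; the two helpers below are simplified stand-ins for the real ones in `TTS.utils.synthesis`, whose internals are not shown in the diff, so only the call order should be taken at face value:

```python
import numpy as np
import torch

# Illustrative stand-in: the real text_to_seqvec maps characters/phonemes to
# ids via the config's symbol table; here we fake it with byte values.
def text_to_seqvec(text, config):
    return np.asarray([ord(c) % 256 for c in text], dtype=np.int32)

# Illustrative stand-in matching the call in the patched synthesizer.
def numpy_to_torch(np_array, dtype, cuda=False):
    tensor = torch.as_tensor(np_array, dtype=dtype)
    return tensor.cuda() if cuda else tensor

inputs = text_to_seqvec("Hello world.", config=None)
inputs = numpy_to_torch(inputs, torch.long, cuda=False)
inputs = inputs.unsqueeze(0)  # add batch dimension: shape (1, T_in)
# `inputs` now has the shape run_model_torch() expects in the patched code.
```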

From cedc22847beff4a3f8f0b088463b37355717b36d Mon Sep 17 00:00:00 2001
From: erogol
Date: Wed, 20 May 2020 18:16:24 +0200
Subject: [PATCH 104/104] fix synthesize.py

---
 config_template.json | 134 ------------------------------------------
 synthesize.py        |   3 +-
 2 files changed, 2 insertions(+), 135 deletions(-)
 delete mode 100644 config_template.json

diff --git a/config_template.json b/config_template.json
deleted file mode 100644
index e525ec31..00000000
--- a/config_template.json
+++ /dev/null
@@ -1,134 +0,0 @@
-{
-    "model": "Tacotron2",          // one of the model in models/
-    "run_name": "ljspeech-stft_params",
-    "run_description": "tacotron2 cosntant stf parameters",
-
-    // AUDIO PARAMETERS
-    "audio":{
-        // Audio processing parameters
-        "num_mels": 80,         // size of the mel spec frame.
-        "num_freq": 513,        // number of stft frequency levels. Size of the linear spectogram frame.
-        "sample_rate": 22050,   // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled.
-        "win_length": 1024,     // stft window length in ms.
-        "hop_length": 256,      // stft window hop-lengh in ms.
-        "preemphasis": 0.0,     // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis.
-        "frame_length_ms": null,  // stft window length in ms. If null, 'win_length' is used.
-        "frame_shift_ms": null,   // stft window hop-lengh in ms. If null, 'hop_length' is used.
-        "min_level_db": -100,   // normalization range
-        "ref_level_db": 20,     // reference level db, theoretically 20db is the sound of air.
-        "power": 1.5,           // value to sharpen wav signals after GL algorithm.
-        "griffin_lim_iters": 60,// #griffin-lim iterations. 30-60 is a good range. Larger the value, slower the generation.
-        // Normalization parameters
-        "signal_norm": true,    // normalize the spec values in range [0, 1]
-        "symmetric_norm": true, // move normalization to range [-1, 1]
-        "max_norm": 1.0,        // scale normalization to range [-max_norm, max_norm] or [0, max_norm]
-        "clip_norm": true,      // clip normalized values into the range.
-        "mel_fmin": 0.0,        // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!!
-        "mel_fmax": 8000.0,     // maximum freq level for mel-spec. Tune for dataset!!
-        "do_trim_silence": true,// enable trimming of slience of audio as you load it. LJspeech (false), TWEB (false), Nancy (true)
-        "trim_db": 60           // threshold for timming silence. Set this according to your dataset.
-    },
-
-    // VOCABULARY PARAMETERS
-    // if custom character set is not defined,
-    // default set in symbols.py is used
-    "characters":{
-        "pad": "_",
-        "eos": "~",
-        "bos": "^",
-        "characters": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!'(),-.:;? ",
-        "punctuations":"!'(),-.:;? ",
-        "phonemes":"iyɨʉɯuɪʏʊeøɘəɵɤoɛœɜɞʌɔæɐaɶɑɒᵻʘɓǀɗǃʄǂɠǁʛpbtdʈɖcɟkɡqɢʔɴŋɲɳnɱmʙrʀⱱɾɽɸβfvθðszʃʒʂʐçʝxɣχʁħʕhɦɬɮʋɹɻjɰlɭʎʟˈˌːˑʍwɥʜʢʡɕʑɺɧɚ˞ɫ"
-    },
-
-    // DISTRIBUTED TRAINING
-    "distributed":{
-        "backend": "nccl",
-        "url": "tcp:\/\/localhost:54321"
-    },
-
-    "reinit_layers": [],    // give a list of layer names to restore from the given checkpoint. If not defined, it reloads all heuristically matching layers.
-
-    // TRAINING
-    "batch_size": 32,       // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'.
-    "eval_batch_size":16,
-    "r": 7,                 // Number of decoder frames to predict per iteration. Set the initial values if gradual training is enabled.
-    "gradual_training": [[0, 7, 64], [1, 5, 64], [50000, 3, 32], [130000, 2, 32], [290000, 1, 32]], //set gradual training steps [first_step, r, batch_size]. If it is null, gradual training is disabled. For Tacotron, you might need to reduce the 'batch_size' as you proceeed.
-    "loss_masking": true,   // enable / disable loss masking against the sequence padding.
-    "ga_alpha": 10.0,       // weight for guided attention loss. If > 0, guided attention is enabled.
-
-    // VALIDATION
-    "run_eval": true,
-    "test_delay_epochs": 10,  //Until attention is aligned, testing only wastes computation time.
-    "test_sentences_file": null,  // set a file to load sentences to be used for testing. If it is null then we use default english sentences.
-
-    // OPTIMIZER
-    "noam_schedule": false,   // use noam warmup and lr schedule.
-    "grad_clip": 1.0,         // upper limit for gradients for clipping.
-    "epochs": 1000,           // total number of epochs to train.
-    "lr": 0.0001,             // Initial learning rate. If Noam decay is active, maximum learning rate.
-    "wd": 0.000001,           // Weight decay weight.
-    "warmup_steps": 4000,     // Noam decay steps to increase the learning rate from 0 to "lr"
-    "seq_len_norm": false,    // Normalize eash sample loss with its length to alleviate imbalanced datasets. Use it if your dataset is small or has skewed distribution of sequence lengths.
-
-    // TACOTRON PRENET
-    "memory_size": -1,            // ONLY TACOTRON - size of the memory queue used fro storing last decoder predictions for auto-regression. If < 0, memory queue is disabled and decoder only uses the last prediction frame.
-    "prenet_type": "original",    // "original" or "bn".
-    "prenet_dropout": true,       // enable/disable dropout at prenet.
-
-    // ATTENTION
-    "attention_type": "original",  // 'original' or 'graves'
-    "attention_heads": 4,          // number of attention heads (only for 'graves')
-    "attention_norm": "sigmoid",   // softmax or sigmoid. Suggested to use softmax for Tacotron2 and sigmoid for Tacotron.
-    "windowing": false,            // Enables attention windowing. Used only in eval mode.
-    "use_forward_attn": false,     // if it uses forward attention. In general, it aligns faster.
-    "forward_attn_mask": false,    // Additional masking forcing monotonicity only in eval mode.
-    "transition_agent": false,     // enable/disable transition agent of forward attention.
-    "location_attn": true,         // enable_disable location sensitive attention. It is enabled for TACOTRON by default.
-    "bidirectional_decoder": false,  // use https://arxiv.org/abs/1907.09006. Use it, if attention does not work well with your dataset.
-
-    // STOPNET
-    "stopnet": true,            // Train stopnet predicting the end of synthesis.
-    "separate_stopnet": true,   // Train stopnet seperately if 'stopnet==true'. It prevents stopnet loss to influence the rest of the model. It causes a better model, but it trains SLOWER.
-
-    // TENSORBOARD and LOGGING
-    "print_step": 25,       // Number of steps to log traning on console.
-    "save_step": 10000,     // Number of training steps expected to save traninpg stats and checkpoints.
-    "checkpoint": true,     // If true, it saves checkpoints per "save_step"
-    "tb_model_param_stats": false,  // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging.
-
-    // DATA LOADING
-    "text_cleaner": "phoneme_cleaners",
-    "enable_eos_bos_chars": false,  // enable/disable beginning of sentence and end of sentence chars.
-    "num_loader_workers": 4,        // number of training data loader processes. Don't set it too big. 4-8 are good values.
-    "num_val_loader_workers": 4,    // number of evaluation data loader processes.
-    "batch_group_size": 0,  //Number of batches to shuffle after bucketing.
-    "min_seq_len": 6,       // DATASET-RELATED: minimum text length to use in training
-    "max_seq_len": 153,     // DATASET-RELATED: maximum text length
-
-    // PATHS
-    "output_path": "/data4/rw/home/Trainings/",
-
-    // PHONEMES
-    "phoneme_cache_path": "mozilla_us_phonemes_3",  // phoneme computation is slow, therefore, it caches results in the given folder.
-    "use_phonemes": true,           // use phonemes instead of raw characters. It is suggested for better pronounciation.
-    "phoneme_language": "en-us",    // depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages
-
-    // MULTI-SPEAKER and GST
-    "use_speaker_embedding": false,  // use speaker embedding to enable multi-speaker learning.
-    "style_wav_for_test": null,      // path to style wav file to be used in TacotronGST inference.
-    "use_gst": false,                // TACOTRON ONLY: use global style tokens
-
-    // DATASETS
-    "datasets":  // List of datasets. They all merged and they get different speaker_ids.
-        [
-            {
-                "name": "ljspeech",
-                "path": "/root/LJSpeech-1.1/",
-                "meta_file_train": "metadata.csv",
-                "meta_file_val": null
-            }
-        ]
-
-}
-
diff --git a/synthesize.py b/synthesize.py
index 1a760268..18048c2f 100644
--- a/synthesize.py
+++ b/synthesize.py
@@ -7,7 +7,8 @@ import json
 import string
 
 from TTS.utils.synthesis import synthesis
-from TTS.utils.generic_utils import load_config, setup_model
+from TTS.utils.generic_utils import setup_model
+from TTS.utils.io import load_config
 from TTS.utils.text.symbols import make_symbols, symbols, phonemes
 from TTS.utils.audio import AudioProcessor