From 0e43bfa54ff56cab6c87f7d3562199bc745bf698 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Mon, 5 Oct 2020 14:50:43 +0200 Subject: [PATCH 01/16] Update README.md update MOS figure --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 136b2ac5..9973b04f 100644 --- a/README.md +++ b/README.md @@ -22,7 +22,7 @@ If you are new, you can also find [here](http://www.erogol.com/text-speech-deep- [![](https://sourcerer.io/fame/erogol/erogol/TTS/images/0)](https://sourcerer.io/fame/erogol/erogol/TTS/links/0)[![](https://sourcerer.io/fame/erogol/erogol/TTS/images/1)](https://sourcerer.io/fame/erogol/erogol/TTS/links/1)[![](https://sourcerer.io/fame/erogol/erogol/TTS/images/2)](https://sourcerer.io/fame/erogol/erogol/TTS/links/2)[![](https://sourcerer.io/fame/erogol/erogol/TTS/images/3)](https://sourcerer.io/fame/erogol/erogol/TTS/links/3)[![](https://sourcerer.io/fame/erogol/erogol/TTS/images/4)](https://sourcerer.io/fame/erogol/erogol/TTS/links/4)[![](https://sourcerer.io/fame/erogol/erogol/TTS/images/5)](https://sourcerer.io/fame/erogol/erogol/TTS/links/5)[![](https://sourcerer.io/fame/erogol/erogol/TTS/images/6)](https://sourcerer.io/fame/erogol/erogol/TTS/links/6)[![](https://sourcerer.io/fame/erogol/erogol/TTS/images/7)](https://sourcerer.io/fame/erogol/erogol/TTS/links/7) ## TTS Performance -

+[updated MOS comparison figure]

[Details...](https://github.com/mozilla/TTS/wiki/Mean-Opinion-Score-Results) From acda8a6e338e1838056dab4aa3e82378181b9875 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Mon, 5 Oct 2020 14:52:52 +0200 Subject: [PATCH 02/16] Update README.md --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 9973b04f..1d357b87 100644 --- a/README.md +++ b/README.md @@ -24,6 +24,7 @@ If you are new, you can also find [here](http://www.erogol.com/text-speech-deep- ## TTS Performance

+"Mozilla*" and "Judy*" are our models. [Details...](https://github.com/mozilla/TTS/wiki/Mean-Opinion-Score-Results) ## Provided Models and Methods From 2f5a08d04716025c936dfc75b6803c038307e65d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Thu, 15 Oct 2020 03:49:39 +0200 Subject: [PATCH 03/16] Update README.md --- README.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 64629993..1413ad6a 100644 --- a/README.md +++ b/README.md @@ -15,9 +15,11 @@ This project is a part of [Mozilla Common Voice](https://voice.mozilla.org/en). Mozilla TTS aims a deep learning based Text2Speech engine, low in cost and high in quality. -You can check some of synthesized voice samples from [here](https://erogol.github.io/ddc-samples/). +English Voice Samples: https://erogol.github.io/ddc-samples/ -If you are new, you can also find [here](http://www.erogol.com/text-speech-deep-learning-architectures/) a brief post about some of TTS architectures and [here](https://github.com/erogol/TTS-papers) list of up-to-date research papers. +TTS training recipes: https://github.com/erogol/TTS_recipes + +TTS paper collection: https://github.com/erogol/TTS-papers [![](https://sourcerer.io/fame/erogol/erogol/TTS/images/0)](https://sourcerer.io/fame/erogol/erogol/TTS/links/0)[![](https://sourcerer.io/fame/erogol/erogol/TTS/images/1)](https://sourcerer.io/fame/erogol/erogol/TTS/links/1)[![](https://sourcerer.io/fame/erogol/erogol/TTS/images/2)](https://sourcerer.io/fame/erogol/erogol/TTS/links/2)[![](https://sourcerer.io/fame/erogol/erogol/TTS/images/3)](https://sourcerer.io/fame/erogol/erogol/TTS/links/3)[![](https://sourcerer.io/fame/erogol/erogol/TTS/images/4)](https://sourcerer.io/fame/erogol/erogol/TTS/links/4)[![](https://sourcerer.io/fame/erogol/erogol/TTS/images/5)](https://sourcerer.io/fame/erogol/erogol/TTS/links/5)[![](https://sourcerer.io/fame/erogol/erogol/TTS/images/6)](https://sourcerer.io/fame/erogol/erogol/TTS/links/6)[![](https://sourcerer.io/fame/erogol/erogol/TTS/images/7)](https://sourcerer.io/fame/erogol/erogol/TTS/links/7) From 98f15e1154324533fd75de12e2fbd610fc306d04 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Thu, 15 Oct 2020 03:50:53 +0200 Subject: [PATCH 04/16] Update README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 1413ad6a..53ec1aad 100644 --- a/README.md +++ b/README.md @@ -11,9 +11,9 @@
-This project is a part of [Mozilla Common Voice](https://voice.mozilla.org/en). +Mozilla TTS is a deep learning based Text2Speech project, low in cost and high in quality. -Mozilla TTS aims a deep learning based Text2Speech engine, low in cost and high in quality. +This project is a part of [Mozilla Common Voice](https://voice.mozilla.org/en). English Voice Samples: https://erogol.github.io/ddc-samples/ From d6bd3cd8b8981add60fa727dd6385b2f1d99822a Mon Sep 17 00:00:00 2001 From: Alex K Date: Thu, 15 Oct 2020 19:14:50 +0200 Subject: [PATCH 05/16] add initial wavernn support --- TTS/bin/compute_statistics.py | 53 +- ...{train_vocoder.py => train_gan_vocoder.py} | 344 ++++++------ TTS/bin/train_wavernn_vocoder.py | 493 ++++++++++++++++++ TTS/vocoder/configs/wavernn_config.json | 95 ++++ TTS/vocoder/datasets/preprocess.py | 8 +- TTS/vocoder/utils/generic_utils.py | 20 + 6 files changed, 838 insertions(+), 175 deletions(-) rename TTS/bin/{train_vocoder.py => train_gan_vocoder.py} (68%) create mode 100644 TTS/bin/train_wavernn_vocoder.py create mode 100644 TTS/vocoder/configs/wavernn_config.json diff --git a/TTS/bin/compute_statistics.py b/TTS/bin/compute_statistics.py index 1c6ef94d..9177c75b 100755 --- a/TTS/bin/compute_statistics.py +++ b/TTS/bin/compute_statistics.py @@ -11,20 +11,27 @@ from TTS.tts.datasets.preprocess import load_meta_data from TTS.utils.io import load_config from TTS.utils.audio import AudioProcessor + def main(): """Run preprocessing process.""" parser = argparse.ArgumentParser( - description="Compute mean and variance of spectrogtram features.") - parser.add_argument("--config_path", type=str, required=True, - help="TTS config file path to define audio processin parameters.") - parser.add_argument("--out_path", default=None, type=str, - help="directory to save the output file.") + description="Compute mean and variance of spectrogtram features." + ) + parser.add_argument( + "--config_path", + type=str, + required=True, + help="TTS config file path to define audio processin parameters.", + ) + parser.add_argument( + "--out_path", default=None, type=str, help="directory to save the output file." 
+ ) args = parser.parse_args() # load config CONFIG = load_config(args.config_path) - CONFIG.audio['signal_norm'] = False # do not apply earlier normalization - CONFIG.audio['stats_path'] = None # discard pre-defined stats + CONFIG.audio["signal_norm"] = False # do not apply earlier normalization + CONFIG.audio["stats_path"] = None # discard pre-defined stats # load audio processor ap = AudioProcessor(**CONFIG.audio) @@ -58,27 +65,27 @@ def main(): output_file_path = os.path.join(args.out_path, "scale_stats.npy") stats = {} - stats['mel_mean'] = mel_mean - stats['mel_std'] = mel_scale - stats['linear_mean'] = linear_mean - stats['linear_std'] = linear_scale + stats["mel_mean"] = mel_mean + stats["mel_std"] = mel_scale + stats["linear_mean"] = linear_mean + stats["linear_std"] = linear_scale - print(f' > Avg mel spec mean: {mel_mean.mean()}') - print(f' > Avg mel spec scale: {mel_scale.mean()}') - print(f' > Avg linear spec mean: {linear_mean.mean()}') - print(f' > Avg lienar spec scale: {linear_scale.mean()}') + print(f" > Avg mel spec mean: {mel_mean.mean()}") + print(f" > Avg mel spec scale: {mel_scale.mean()}") + print(f" > Avg linear spec mean: {linear_mean.mean()}") + print(f" > Avg lienar spec scale: {linear_scale.mean()}") # set default config values for mean-var scaling - CONFIG.audio['stats_path'] = output_file_path - CONFIG.audio['signal_norm'] = True + CONFIG.audio["stats_path"] = output_file_path + CONFIG.audio["signal_norm"] = True # remove redundant values - del CONFIG.audio['max_norm'] - del CONFIG.audio['min_level_db'] - del CONFIG.audio['symmetric_norm'] - del CONFIG.audio['clip_norm'] - stats['audio_config'] = CONFIG.audio + del CONFIG.audio["max_norm"] + del CONFIG.audio["min_level_db"] + del CONFIG.audio["symmetric_norm"] + del CONFIG.audio["clip_norm"] + stats["audio_config"] = CONFIG.audio np.save(output_file_path, stats, allow_pickle=True) - print(f' > scale_stats.npy is saved to {output_file_path}') + print(f" > scale_stats.npy is saved to {output_file_path}") if __name__ == "__main__": diff --git a/TTS/bin/train_vocoder.py b/TTS/bin/train_gan_vocoder.py similarity index 68% rename from TTS/bin/train_vocoder.py rename to TTS/bin/train_gan_vocoder.py index b51a55a3..7689c930 100644 --- a/TTS/bin/train_vocoder.py +++ b/TTS/bin/train_gan_vocoder.py @@ -10,20 +10,29 @@ import torch from torch.utils.data import DataLoader from TTS.utils.audio import AudioProcessor from TTS.utils.console_logger import ConsoleLogger -from TTS.utils.generic_utils import (KeepAverage, count_parameters, - create_experiment_folder, get_git_branch, - remove_experiment_folder, set_init_dict) +from TTS.utils.generic_utils import ( + KeepAverage, + count_parameters, + create_experiment_folder, + get_git_branch, + remove_experiment_folder, + set_init_dict, +) from TTS.utils.io import copy_config_file, load_config from TTS.utils.radam import RAdam from TTS.utils.tensorboard_logger import TensorboardLogger from TTS.utils.training import setup_torch_training_env from TTS.vocoder.datasets.gan_dataset import GANDataset from TTS.vocoder.datasets.preprocess import load_wav_data, load_wav_feat_data + # from distribute import (DistributedSampler, apply_gradient_allreduce, # init_distributed, reduce_tensor) from TTS.vocoder.layers.losses import DiscriminatorLoss, GeneratorLoss -from TTS.vocoder.utils.generic_utils import (plot_results, setup_discriminator, - setup_generator) +from TTS.vocoder.utils.generic_utils import ( + plot_results, + setup_discriminator, + setup_generator, +) from TTS.vocoder.utils.io 
import save_best_model, save_checkpoint use_cuda, num_gpus = setup_torch_training_env(True, True) @@ -33,27 +42,30 @@ def setup_loader(ap, is_val=False, verbose=False): if is_val and not c.run_eval: loader = None else: - dataset = GANDataset(ap=ap, - items=eval_data if is_val else train_data, - seq_len=c.seq_len, - hop_len=ap.hop_length, - pad_short=c.pad_short, - conv_pad=c.conv_pad, - is_training=not is_val, - return_segments=not is_val, - use_noise_augment=c.use_noise_augment, - use_cache=c.use_cache, - verbose=verbose) + dataset = GANDataset( + ap=ap, + items=eval_data if is_val else train_data, + seq_len=c.seq_len, + hop_len=ap.hop_length, + pad_short=c.pad_short, + conv_pad=c.conv_pad, + is_training=not is_val, + return_segments=not is_val, + use_noise_augment=c.use_noise_augment, + use_cache=c.use_cache, + verbose=verbose, + ) dataset.shuffle_mapping() # sampler = DistributedSampler(dataset) if num_gpus > 1 else None - loader = DataLoader(dataset, - batch_size=1 if is_val else c.batch_size, - shuffle=True, - drop_last=False, - sampler=None, - num_workers=c.num_val_loader_workers - if is_val else c.num_loader_workers, - pin_memory=False) + loader = DataLoader( + dataset, + batch_size=1 if is_val else c.batch_size, + shuffle=True, + drop_last=False, + sampler=None, + num_workers=c.num_val_loader_workers if is_val else c.num_loader_workers, + pin_memory=False, + ) return loader @@ -80,16 +92,26 @@ def format_data(data): return co, x, None, None -def train(model_G, criterion_G, optimizer_G, model_D, criterion_D, optimizer_D, - scheduler_G, scheduler_D, ap, global_step, epoch): +def train( + model_G, + criterion_G, + optimizer_G, + model_D, + criterion_D, + optimizer_D, + scheduler_G, + scheduler_D, + ap, + global_step, + epoch, +): data_loader = setup_loader(ap, is_val=False, verbose=(epoch == 0)) model_G.train() model_D.train() epoch_time = 0 keep_avg = KeepAverage() if use_cuda: - batch_n_iter = int( - len(data_loader.dataset) / (c.batch_size * num_gpus)) + batch_n_iter = int(len(data_loader.dataset) / (c.batch_size * num_gpus)) else: batch_n_iter = int(len(data_loader.dataset) / c.batch_size) end_time = time.time() @@ -145,16 +167,16 @@ def train(model_G, criterion_G, optimizer_G, model_D, criterion_D, optimizer_D, scores_fake = D_out_fake # compute losses - loss_G_dict = criterion_G(y_hat, y_G, scores_fake, feats_fake, - feats_real, y_hat_sub, y_G_sub) - loss_G = loss_G_dict['G_loss'] + loss_G_dict = criterion_G( + y_hat, y_G, scores_fake, feats_fake, feats_real, y_hat_sub, y_G_sub + ) + loss_G = loss_G_dict["G_loss"] # optimizer generator optimizer_G.zero_grad() loss_G.backward() if c.gen_clip_grad > 0: - torch.nn.utils.clip_grad_norm_(model_G.parameters(), - c.gen_clip_grad) + torch.nn.utils.clip_grad_norm_(model_G.parameters(), c.gen_clip_grad) optimizer_G.step() if scheduler_G is not None: scheduler_G.step() @@ -199,14 +221,13 @@ def train(model_G, criterion_G, optimizer_G, model_D, criterion_D, optimizer_D, # compute losses loss_D_dict = criterion_D(scores_fake, scores_real) - loss_D = loss_D_dict['D_loss'] + loss_D = loss_D_dict["D_loss"] # optimizer discriminator optimizer_D.zero_grad() loss_D.backward() if c.disc_clip_grad > 0: - torch.nn.utils.clip_grad_norm_(model_D.parameters(), - c.disc_clip_grad) + torch.nn.utils.clip_grad_norm_(model_D.parameters(), c.disc_clip_grad) optimizer_D.step() if scheduler_D is not None: scheduler_D.step() @@ -221,34 +242,40 @@ def train(model_G, criterion_G, optimizer_G, model_D, criterion_D, optimizer_D, epoch_time += step_time # get current 
learning rates - current_lr_G = list(optimizer_G.param_groups)[0]['lr'] - current_lr_D = list(optimizer_D.param_groups)[0]['lr'] + current_lr_G = list(optimizer_G.param_groups)[0]["lr"] + current_lr_D = list(optimizer_D.param_groups)[0]["lr"] # update avg stats update_train_values = dict() for key, value in loss_dict.items(): - update_train_values['avg_' + key] = value - update_train_values['avg_loader_time'] = loader_time - update_train_values['avg_step_time'] = step_time + update_train_values["avg_" + key] = value + update_train_values["avg_loader_time"] = loader_time + update_train_values["avg_step_time"] = step_time keep_avg.update_values(update_train_values) # print training stats if global_step % c.print_step == 0: log_dict = { - 'step_time': [step_time, 2], - 'loader_time': [loader_time, 4], + "step_time": [step_time, 2], + "loader_time": [loader_time, 4], "current_lr_G": current_lr_G, - "current_lr_D": current_lr_D + "current_lr_D": current_lr_D, } - c_logger.print_train_step(batch_n_iter, num_iter, global_step, - log_dict, loss_dict, keep_avg.avg_values) + c_logger.print_train_step( + batch_n_iter, + num_iter, + global_step, + log_dict, + loss_dict, + keep_avg.avg_values, + ) # plot step stats if global_step % 10 == 0: iter_stats = { "lr_G": current_lr_G, "lr_D": current_lr_D, - "step_time": step_time + "step_time": step_time, } iter_stats.update(loss_dict) tb_logger.tb_train_iter_stats(global_step, iter_stats) @@ -257,27 +284,28 @@ def train(model_G, criterion_G, optimizer_G, model_D, criterion_D, optimizer_D, if global_step % c.save_step == 0: if c.checkpoint: # save model - save_checkpoint(model_G, - optimizer_G, - scheduler_G, - model_D, - optimizer_D, - scheduler_D, - global_step, - epoch, - OUT_PATH, - model_losses=loss_dict) + save_checkpoint( + model_G, + optimizer_G, + scheduler_G, + model_D, + optimizer_D, + scheduler_D, + global_step, + epoch, + OUT_PATH, + model_losses=loss_dict, + ) # compute spectrograms - figures = plot_results(y_hat_vis, y_G, ap, global_step, - 'train') + figures = plot_results(y_hat_vis, y_G, ap, global_step, "train") tb_logger.tb_train_figures(global_step, figures) # Sample audio sample_voice = y_hat_vis[0].squeeze(0).detach().cpu().numpy() - tb_logger.tb_train_audios(global_step, - {'train/audio': sample_voice}, - c.audio["sample_rate"]) + tb_logger.tb_train_audios( + global_step, {"train/audio": sample_voice}, c.audio["sample_rate"] + ) end_time = time.time() # print epoch stats @@ -326,7 +354,6 @@ def evaluate(model_G, criterion_G, model_D, criterion_D, ap, global_step, epoch) y_hat = model_G.pqmf_synthesis(y_hat) y_G_sub = model_G.pqmf_analysis(y_G) - scores_fake, feats_fake, feats_real = None, None, None if global_step > c.steps_to_start_discriminator: @@ -352,8 +379,9 @@ def evaluate(model_G, criterion_G, model_D, criterion_D, ap, global_step, epoch) feats_fake, feats_real = None, None # compute losses - loss_G_dict = criterion_G(y_hat, y_G, scores_fake, feats_fake, - feats_real, y_hat_sub, y_G_sub) + loss_G_dict = criterion_G( + y_hat, y_G, scores_fake, feats_fake, feats_real, y_hat_sub, y_G_sub + ) loss_dict = dict() for key, value in loss_G_dict.items(): @@ -403,16 +431,15 @@ def evaluate(model_G, criterion_G, model_D, criterion_D, ap, global_step, epoch) else: loss_dict[key] = value.item() - step_time = time.time() - start_time epoch_time += step_time # update avg stats update_eval_values = dict() for key, value in loss_dict.items(): - update_eval_values['avg_' + key] = value - update_eval_values['avg_loader_time'] = loader_time - 
update_eval_values['avg_step_time'] = step_time + update_eval_values["avg_" + key] = value + update_eval_values["avg_loader_time"] = loader_time + update_eval_values["avg_step_time"] = step_time keep_avg.update_values(update_eval_values) # print eval stats @@ -420,13 +447,14 @@ def evaluate(model_G, criterion_G, model_D, criterion_D, ap, global_step, epoch) c_logger.print_eval_step(num_iter, loss_dict, keep_avg.avg_values) # compute spectrograms - figures = plot_results(y_hat, y_G, ap, global_step, 'eval') + figures = plot_results(y_hat, y_G, ap, global_step, "eval") tb_logger.tb_eval_figures(global_step, figures) # Sample audio sample_voice = y_hat[0].squeeze(0).detach().cpu().numpy() - tb_logger.tb_eval_audios(global_step, {'eval/audio': sample_voice}, - c.audio["sample_rate"]) + tb_logger.tb_eval_audios( + global_step, {"eval/audio": sample_voice}, c.audio["sample_rate"] + ) # synthesize a full voice data_loader.return_segments = False @@ -443,7 +471,9 @@ def main(args): # pylint: disable=redefined-outer-name print(f" > Loading wavs from: {c.data_path}") if c.feature_path is not None: print(f" > Loading features from: {c.feature_path}") - eval_data, train_data = load_wav_feat_data(c.data_path, c.feature_path, c.eval_split_size) + eval_data, train_data = load_wav_feat_data( + c.data_path, c.feature_path, c.eval_split_size + ) else: eval_data, train_data = load_wav_data(c.data_path, c.eval_split_size) @@ -461,17 +491,15 @@ def main(args): # pylint: disable=redefined-outer-name # setup optimizers optimizer_gen = RAdam(model_gen.parameters(), lr=c.lr_gen, weight_decay=0) - optimizer_disc = RAdam(model_disc.parameters(), - lr=c.lr_disc, - weight_decay=0) + optimizer_disc = RAdam(model_disc.parameters(), lr=c.lr_disc, weight_decay=0) # schedulers scheduler_gen = None scheduler_disc = None - if 'lr_scheduler_gen' in c: + if "lr_scheduler_gen" in c: scheduler_gen = getattr(torch.optim.lr_scheduler, c.lr_scheduler_gen) scheduler_gen = scheduler_gen(optimizer_gen, **c.lr_scheduler_gen_params) - if 'lr_scheduler_disc' in c: + if "lr_scheduler_disc" in c: scheduler_disc = getattr(torch.optim.lr_scheduler, c.lr_scheduler_disc) scheduler_disc = scheduler_disc(optimizer_disc, **c.lr_scheduler_disc_params) @@ -480,47 +508,46 @@ def main(args): # pylint: disable=redefined-outer-name criterion_disc = DiscriminatorLoss(c) if args.restore_path: - checkpoint = torch.load(args.restore_path, map_location='cpu') + checkpoint = torch.load(args.restore_path, map_location="cpu") try: print(" > Restoring Generator Model...") - model_gen.load_state_dict(checkpoint['model']) + model_gen.load_state_dict(checkpoint["model"]) print(" > Restoring Generator Optimizer...") - optimizer_gen.load_state_dict(checkpoint['optimizer']) + optimizer_gen.load_state_dict(checkpoint["optimizer"]) print(" > Restoring Discriminator Model...") - model_disc.load_state_dict(checkpoint['model_disc']) + model_disc.load_state_dict(checkpoint["model_disc"]) print(" > Restoring Discriminator Optimizer...") - optimizer_disc.load_state_dict(checkpoint['optimizer_disc']) - if 'scheduler' in checkpoint: + optimizer_disc.load_state_dict(checkpoint["optimizer_disc"]) + if "scheduler" in checkpoint: print(" > Restoring Generator LR Scheduler...") - scheduler_gen.load_state_dict(checkpoint['scheduler']) + scheduler_gen.load_state_dict(checkpoint["scheduler"]) # NOTE: Not sure if necessary scheduler_gen.optimizer = optimizer_gen - if 'scheduler_disc' in checkpoint: + if "scheduler_disc" in checkpoint: print(" > Restoring Discriminator LR Scheduler...") 
- scheduler_disc.load_state_dict(checkpoint['scheduler_disc']) + scheduler_disc.load_state_dict(checkpoint["scheduler_disc"]) scheduler_disc.optimizer = optimizer_disc except RuntimeError: # retore only matching layers. print(" > Partial model initialization...") model_dict = model_gen.state_dict() - model_dict = set_init_dict(model_dict, checkpoint['model'], c) + model_dict = set_init_dict(model_dict, checkpoint["model"], c) model_gen.load_state_dict(model_dict) model_dict = model_disc.state_dict() - model_dict = set_init_dict(model_dict, checkpoint['model_disc'], c) + model_dict = set_init_dict(model_dict, checkpoint["model_disc"], c) model_disc.load_state_dict(model_dict) del model_dict # reset lr if not countinuining training. for group in optimizer_gen.param_groups: - group['lr'] = c.lr_gen + group["lr"] = c.lr_gen for group in optimizer_disc.param_groups: - group['lr'] = c.lr_disc + group["lr"] = c.lr_disc - print(" > Model restored from step %d" % checkpoint['step'], - flush=True) - args.restore_step = checkpoint['step'] + print(" > Model restored from step %d" % checkpoint["step"], flush=True) + args.restore_step = checkpoint["step"] else: args.restore_step = 0 @@ -539,75 +566,92 @@ def main(args): # pylint: disable=redefined-outer-name num_params = count_parameters(model_disc) print(" > Discriminator has {} parameters".format(num_params), flush=True) - if 'best_loss' not in locals(): - best_loss = float('inf') + if "best_loss" not in locals(): + best_loss = float("inf") global_step = args.restore_step for epoch in range(0, c.epochs): c_logger.print_epoch_start(epoch, c.epochs) - _, global_step = train(model_gen, criterion_gen, optimizer_gen, - model_disc, criterion_disc, optimizer_disc, - scheduler_gen, scheduler_disc, ap, global_step, - epoch) - eval_avg_loss_dict = evaluate(model_gen, criterion_gen, model_disc, criterion_disc, ap, - global_step, epoch) + _, global_step = train( + model_gen, + criterion_gen, + optimizer_gen, + model_disc, + criterion_disc, + optimizer_disc, + scheduler_gen, + scheduler_disc, + ap, + global_step, + epoch, + ) + eval_avg_loss_dict = evaluate( + model_gen, criterion_gen, model_disc, criterion_disc, ap, global_step, epoch + ) c_logger.print_epoch_end(epoch, eval_avg_loss_dict) target_loss = eval_avg_loss_dict[c.target_loss] - best_loss = save_best_model(target_loss, - best_loss, - model_gen, - optimizer_gen, - scheduler_gen, - model_disc, - optimizer_disc, - scheduler_disc, - global_step, - epoch, - OUT_PATH, - model_losses=eval_avg_loss_dict) + best_loss = save_best_model( + target_loss, + best_loss, + model_gen, + optimizer_gen, + scheduler_gen, + model_disc, + optimizer_disc, + scheduler_disc, + global_step, + epoch, + OUT_PATH, + model_losses=eval_avg_loss_dict, + ) -if __name__ == '__main__': +if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument( - '--continue_path', + "--continue_path", type=str, - help= - 'Training output folder to continue training. Use to continue a training. If it is used, "config_path" is ignored.', - default='', - required='--config_path' not in sys.argv) + help='Training output folder to continue training. Use to continue a training. If it is used, "config_path" is ignored.', + default="", + required="--config_path" not in sys.argv, + ) parser.add_argument( - '--restore_path', + "--restore_path", type=str, - help='Model file to be restored. 
Use to finetune a model.', - default='') - parser.add_argument('--config_path', - type=str, - help='Path to config file for training.', - required='--continue_path' not in sys.argv) - parser.add_argument('--debug', - type=bool, - default=False, - help='Do not verify commit integrity to run training.') + help="Model file to be restored. Use to finetune a model.", + default="", + ) + parser.add_argument( + "--config_path", + type=str, + help="Path to config file for training.", + required="--continue_path" not in sys.argv, + ) + parser.add_argument( + "--debug", + type=bool, + default=False, + help="Do not verify commit integrity to run training.", + ) # DISTRUBUTED parser.add_argument( - '--rank', + "--rank", type=int, default=0, - help='DISTRIBUTED: process rank for distributed training.') - parser.add_argument('--group_id', - type=str, - default="", - help='DISTRIBUTED: process group id.') + help="DISTRIBUTED: process rank for distributed training.", + ) + parser.add_argument( + "--group_id", type=str, default="", help="DISTRIBUTED: process group id." + ) args = parser.parse_args() - if args.continue_path != '': + if args.continue_path != "": args.output_path = args.continue_path - args.config_path = os.path.join(args.continue_path, 'config.json') + args.config_path = os.path.join(args.continue_path, "config.json") list_of_files = glob.glob( - args.continue_path + - "/*.pth.tar") # * means all if need specific format then *.csv + args.continue_path + "/*.pth.tar" + ) # * means all if need specific format then *.csv latest_model_file = max(list_of_files, key=os.path.getctime) args.restore_path = latest_model_file print(f" > Training continues for {args.restore_path}") @@ -618,11 +662,10 @@ if __name__ == '__main__': _ = os.path.dirname(os.path.realpath(__file__)) OUT_PATH = args.continue_path - if args.continue_path == '': - OUT_PATH = create_experiment_folder(c.output_path, c.run_name, - args.debug) + if args.continue_path == "": + OUT_PATH = create_experiment_folder(c.output_path, c.run_name, args.debug) - AUDIO_PATH = os.path.join(OUT_PATH, 'test_audios') + AUDIO_PATH = os.path.join(OUT_PATH, "test_audios") c_logger = ConsoleLogger() @@ -632,16 +675,17 @@ if __name__ == '__main__': if args.restore_path: new_fields["restore_path"] = args.restore_path new_fields["github_branch"] = get_git_branch() - copy_config_file(args.config_path, - os.path.join(OUT_PATH, 'config.json'), new_fields) + copy_config_file( + args.config_path, os.path.join(OUT_PATH, "config.json"), new_fields + ) os.chmod(AUDIO_PATH, 0o775) os.chmod(OUT_PATH, 0o775) LOG_DIR = OUT_PATH - tb_logger = TensorboardLogger(LOG_DIR, model_name='VOCODER') + tb_logger = TensorboardLogger(LOG_DIR, model_name="VOCODER") # write model desc to tensorboard - tb_logger.tb_add_text('model-description', c['run_description'], 0) + tb_logger.tb_add_text("model-description", c["run_description"], 0) try: main(args) @@ -654,4 +698,4 @@ if __name__ == '__main__': except Exception: # pylint: disable=broad-except remove_experiment_folder(OUT_PATH) traceback.print_exc() - sys.exit(1) + sys.exit(1) \ No newline at end of file diff --git a/TTS/bin/train_wavernn_vocoder.py b/TTS/bin/train_wavernn_vocoder.py new file mode 100644 index 00000000..2f77ab57 --- /dev/null +++ b/TTS/bin/train_wavernn_vocoder.py @@ -0,0 +1,493 @@ +import argparse +import math +import os +import pickle +import shutil +import sys +import traceback +import time +import glob +import random + +import torch +from torch.utils.data import DataLoader +from torch.utils.data.distributed 
import DistributedSampler + + +from TTS.utils.audio import AudioProcessor +from TTS.tts.utils.visual import plot_spectrogram +from TTS.utils.io import copy_config_file, load_config +from TTS.vocoder.datasets.wavernn_dataset import WaveRNNDataset +from TTS.utils.tensorboard_logger import TensorboardLogger +from TTS.vocoder.datasets.preprocess import load_wav_data, load_wav_feat_data +from TTS.vocoder.utils.distribution import discretized_mix_logistic_loss, gaussian_loss +from TTS.vocoder.utils.generic_utils import setup_wavernn +from TTS.utils.training import setup_torch_training_env +from TTS.utils.console_logger import ConsoleLogger +from TTS.utils.generic_utils import ( + KeepAverage, + count_parameters, + create_experiment_folder, + get_git_branch, + remove_experiment_folder, + set_init_dict, +) +from TTS.vocoder.utils.io import save_best_model, save_checkpoint + + +use_cuda, num_gpus = setup_torch_training_env(True, True) + + +def setup_loader(ap, is_val=False, verbose=False): + if is_val and not CONFIG.run_eval: + loader = None + else: + dataset = WaveRNNDataset( + ap=ap, + items=eval_data if is_val else train_data, + seq_len=CONFIG.seq_len, + hop_len=ap.hop_length, + pad=CONFIG.padding, + mode=CONFIG.mode, + is_training=not is_val, + verbose=verbose, + ) + # sampler = DistributedSampler(dataset) if num_gpus > 1 else None + loader = DataLoader( + dataset, + shuffle=True, + collate_fn=dataset.collate, + batch_size=CONFIG.batch_size, + num_workers=CONFIG.num_val_loader_workers + if is_val + else CONFIG.num_loader_workers, + pin_memory=True, + ) + return loader + + +def format_data(data): + # setup input data + x = data[0] + m = data[1] + y = data[2] + + # dispatch data to GPU + if use_cuda: + x = x.cuda(non_blocking=True) + m = m.cuda(non_blocking=True) + y = y.cuda(non_blocking=True) + + return x, m, y + + +def train(model, optimizer, criterion, scheduler, ap, global_step, epoch): + # create train loader + data_loader = setup_loader(ap, is_val=False, verbose=(epoch == 0)) + model.train() + epoch_time = 0 + keep_avg = KeepAverage() + if use_cuda: + batch_n_iter = int(len(data_loader.dataset) / (CONFIG.batch_size * num_gpus)) + else: + batch_n_iter = int(len(data_loader.dataset) / CONFIG.batch_size) + end_time = time.time() + c_logger.print_train_start() + # train loop + print(" > Training", flush=True) + for num_iter, data in enumerate(data_loader): + start_time = time.time() + x, m, y = format_data(data) + loader_time = time.time() - end_time + global_step += 1 + + ################## + # MODEL TRAINING # + ################## + y_hat = model(x, m) + y_hat_vis = y_hat # for visualization + + # y_hat = y_hat.transpose(1, 2) + if isinstance(model.mode, int): + y_hat = y_hat.transpose(1, 2).unsqueeze(-1) + else: + y = y.float() + y = y.unsqueeze(-1) + # m_scaled, _ = model.upsample(m) + + # compute losses + loss = criterion(y_hat, y) + if loss.item() is None: + raise RuntimeError(" [!] None loss. 
Exiting ...") + optimizer.zero_grad() + loss.backward() + if CONFIG.grad_clip > 0: + torch.nn.utils.clip_grad_norm_(model.parameters(), CONFIG.grad_clip) + + optimizer.step() + if scheduler is not None: + scheduler.step() + + # get the current learning rate + cur_lr = list(optimizer.param_groups)[0]["lr"] + + step_time = time.time() - start_time + epoch_time += step_time + + update_train_values = dict() + loss_dict = dict() + loss_dict["model_loss"] = loss.item() + for key, value in loss_dict.items(): + update_train_values["avg_" + key] = value + update_train_values["avg_loader_time"] = loader_time + update_train_values["avg_step_time"] = step_time + keep_avg.update_values(update_train_values) + + # print training stats + if global_step % CONFIG.print_step == 0: + log_dict = { + "step_time": [step_time, 2], + "loader_time": [loader_time, 4], + "current_lr": cur_lr, + } + c_logger.print_train_step( + batch_n_iter, + num_iter, + global_step, + log_dict, + loss_dict, + keep_avg.avg_values, + ) + + # plot step stats + if global_step % 10 == 0: + iter_stats = {"lr": cur_lr, "step_time": step_time} + iter_stats.update(loss_dict) + tb_logger.tb_train_iter_stats(global_step, iter_stats) + + # save checkpoint + if global_step % CONFIG.save_step == 0: + if CONFIG.checkpoint: + # save model + save_checkpoint( + model, + optimizer, + scheduler, + None, + None, + None, + global_step, + epoch, + OUT_PATH, + model_losses=loss_dict, + ) + + # synthesize a full voice + wav_path = train_data[random.randrange(0, len(train_data))][0] + wav = ap.load_wav(wav_path) + ground_mel = ap.melspectrogram(wav) + sample_wav = model.generate( + ground_mel, + CONFIG.batched, + CONFIG.target_samples, + CONFIG.overlap_samples, + ) + predict_mel = ap.melspectrogram(sample_wav) + + # Sample audio + tb_logger.tb_train_audios( + global_step, {"eval/audio": sample_wav}, CONFIG.audio["sample_rate"] + ) + # compute spectrograms + figures = { + "prediction": plot_spectrogram(predict_mel.T, ap, output_fig=False), + "ground_truth": plot_spectrogram(ground_mel.T, ap, output_fig=False), + } + tb_logger.tb_train_figures(global_step, figures) + end_time = time.time() + + # print epoch stats + c_logger.print_train_epoch_end(global_step, epoch, epoch_time, keep_avg) + + # Plot Training Epoch Stats + epoch_stats = {"epoch_time": epoch_time} + epoch_stats.update(keep_avg.avg_values) + tb_logger.tb_train_epoch_stats(global_step, epoch_stats) + # TODO: plot model stats + # if c.tb_model_param_stats: + # tb_logger.tb_model_weights(model, global_step) + return keep_avg.avg_values, global_step + + +@torch.no_grad() +def evaluate(model, criterion, ap, global_step, epoch): + # create train loader + data_loader = setup_loader(ap, is_val=True, verbose=(epoch == 0)) + model.eval() + epoch_time = 0 + keep_avg = KeepAverage() + end_time = time.time() + c_logger.print_eval_start() + with torch.no_grad(): + for num_iter, data in enumerate(data_loader): + start_time = time.time() + # format data + x, m, y = format_data(data) + loader_time = time.time() - end_time + global_step += 1 + + y_hat = model(x, m) + if isinstance(model.mode, int): + y_hat = y_hat.transpose(1, 2).unsqueeze(-1) + else: + y = y.float() + y = y.unsqueeze(-1) + loss = criterion(y_hat, y) + # Compute avg loss + # if num_gpus > 1: + # loss = reduce_tensor(loss.data, num_gpus) + loss_dict = dict() + loss_dict["model_loss"] = loss.item() + + step_time = time.time() - start_time + epoch_time += step_time + + # update avg stats + update_eval_values = dict() + for key, value in 
loss_dict.items(): + update_eval_values["avg_" + key] = value + update_eval_values["avg_loader_time"] = loader_time + update_eval_values["avg_step_time"] = step_time + keep_avg.update_values(update_eval_values) + + # print eval stats + if CONFIG.print_eval: + c_logger.print_eval_step(num_iter, loss_dict, keep_avg.avg_values) + + if epoch > CONFIG.test_delay_epochs: + # synthesize a full voice + wav_path = eval_data[random.randrange(0, len(eval_data))][0] + wav = ap.load_wav(wav_path) + ground_mel = ap.melspectrogram(wav) + sample_wav = model.generate( + ground_mel, + CONFIG.batched, + CONFIG.target_samples, + CONFIG.overlap_samples, + ) + predict_mel = ap.melspectrogram(sample_wav) + + # Sample audio + tb_logger.tb_eval_audios( + global_step, {"eval/audio": sample_wav}, CONFIG.audio["sample_rate"] + ) + # compute spectrograms + figures = { + "prediction": plot_spectrogram(predict_mel.T, ap, output_fig=False), + "ground_truth": plot_spectrogram(ground_mel.T, ap, output_fig=False), + } + tb_logger.tb_eval_figures(global_step, figures) + + tb_logger.tb_eval_stats(global_step, keep_avg.avg_values) + return keep_avg.avg_values + + +# FIXME: move args definition/parsing inside of main? +def main(args): # pylint: disable=redefined-outer-name + # pylint: disable=global-variable-undefined + global train_data, eval_data + + print(f" > Loading wavs from: {CONFIG.data_path}") + if CONFIG.feature_path is not None: + print(f" > Loading features from: {CONFIG.feature_path}") + eval_data, train_data = load_wav_feat_data( + CONFIG.data_path, CONFIG.feature_path, CONFIG.eval_split_size + ) + eval_data, train_data = eval_data, train_data + else: + eval_data, train_data = load_wav_data(CONFIG.data_path, CONFIG.eval_split_size) + + # setup audio processor + ap = AudioProcessor(**CONFIG.audio) + + # setup model + model_wavernn = setup_wavernn(CONFIG) + + # define train functions + if CONFIG.mode == "mold": + criterion = discretized_mix_logistic_loss + elif CONFIG.mode == "gauss": + criterion = gaussian_loss + elif isinstance(CONFIG.mode, int): + criterion = torch.nn.CrossEntropyLoss() + + if use_cuda: + model_wavernn.cuda() + if isinstance(CONFIG.mode, int): + criterion.cuda() + + optimizer = optim.Adam(model_wavernn.parameters(), lr=CONFIG.lr, weight_decay=0) + scheduler = None + if "lr_scheduler" in CONFIG: + scheduler = getattr(torch.optim.lr_scheduler, CONFIG.lr_scheduler) + scheduler = scheduler(optimizer, **CONFIG.lr_scheduler_params) + # slow start for the first 5 epochs + # lr_lambda = lambda epoch: min(epoch / CONFIG.warmup_steps, 1) + # scheduler = optim.lr_scheduler.LambdaLR(optimizer, lr_lambda) + + # restore any checkpoint + if args.restore_path: + checkpoint = torch.load(args.restore_path, map_location="cpu") + try: + print(" > Restoring Model...") + model_wavernn.load_state_dict(checkpoint["model"]) + print(" > Restoring Optimizer...") + optimizer.load_state_dict(checkpoint["optimizer"]) + if "scheduler" in checkpoint: + print(" > Restoring Generator LR Scheduler...") + scheduler.load_state_dict(checkpoint["scheduler"]) + scheduler.optimizer = optimizer + # TODO: fix resetting restored optimizer lr + # optimizer.load_state_dict(checkpoint["optimizer"]) + except RuntimeError: + # retore only matching layers. 
+ print(" > Partial model initialization...") + model_dict = model_wavernn.state_dict() + model_dict = set_init_dict(model_dict, checkpoint["model"], CONFIG) + model_wavernn.load_state_dict(model_dict) + + print(" > Model restored from step %d" % checkpoint["step"], flush=True) + args.restore_step = checkpoint["step"] + else: + args.restore_step = 0 + + # DISTRIBUTED + # if num_gpus > 1: + # model = apply_gradient_allreduce(model) + + num_parameters = count_parameters(model_wavernn) + print(" > Model has {} parameters".format(num_parameters), flush=True) + + if "best_loss" not in locals(): + best_loss = float("inf") + + global_step = args.restore_step + for epoch in range(0, CONFIG.epochs): + c_logger.print_epoch_start(epoch, CONFIG.epochs) + _, global_step = train( + model_wavernn, optimizer, criterion, scheduler, ap, global_step, epoch + ) + eval_avg_loss_dict = evaluate(model_wavernn, criterion, ap, global_step, epoch) + c_logger.print_epoch_end(epoch, eval_avg_loss_dict) + target_loss = eval_avg_loss_dict["avg_model_loss"] + best_loss = save_best_model( + target_loss, + best_loss, + model_wavernn, + optimizer, + scheduler, + None, + None, + None, + global_step, + epoch, + OUT_PATH, + model_losses=eval_avg_loss_dict, + ) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--continue_path", + type=str, + help='Training output folder to continue training. Use to continue a training. If it is used, "config_path" is ignored.', + default="", + required="--config_path" not in sys.argv, + ) + parser.add_argument( + "--restore_path", + type=str, + help="Model file to be restored. Use to finetune a model.", + default="", + ) + parser.add_argument( + "--config_path", + type=str, + help="Path to config file for training.", + required="--continue_path" not in sys.argv, + ) + parser.add_argument( + "--debug", + type=bool, + default=False, + help="Do not verify commit integrity to run training.", + ) + + # DISTRUBUTED + parser.add_argument( + "--rank", + type=int, + default=0, + help="DISTRIBUTED: process rank for distributed training.", + ) + parser.add_argument( + "--group_id", type=str, default="", help="DISTRIBUTED: process group id." 
+ ) + args = parser.parse_args() + + if args.continue_path != "": + args.output_path = args.continue_path + args.config_path = os.path.join(args.continue_path, "config.json") + list_of_files = glob.glob( + args.continue_path + "/*.pth.tar" + ) # * means all if need specific format then *.csv + latest_model_file = max(list_of_files, key=os.path.getctime) + args.restore_path = latest_model_file + print(f" > Training continues for {args.restore_path}") + + # setup output paths and read configs + CONFIG = load_config(args.config_path) + # check_config(c) + _ = os.path.dirname(os.path.realpath(__file__)) + + OUT_PATH = args.continue_path + if args.continue_path == "": + OUT_PATH = create_experiment_folder( + CONFIG.output_path, CONFIG.run_name, args.debug + ) + + AUDIO_PATH = os.path.join(OUT_PATH, "test_audios") + + c_logger = ConsoleLogger() + + if args.rank == 0: + os.makedirs(AUDIO_PATH, exist_ok=True) + new_fields = {} + if args.restore_path: + new_fields["restore_path"] = args.restore_path + new_fields["github_branch"] = get_git_branch() + copy_config_file( + args.config_path, os.path.join(OUT_PATH, "config.json"), new_fields + ) + os.chmod(AUDIO_PATH, 0o775) + os.chmod(OUT_PATH, 0o775) + + LOG_DIR = OUT_PATH + tb_logger = TensorboardLogger(LOG_DIR, model_name="VOCODER") + + # write model desc to tensorboard + tb_logger.tb_add_text("model-description", CONFIG["run_description"], 0) + + try: + main(args) + except KeyboardInterrupt: + remove_experiment_folder(OUT_PATH) + try: + sys.exit(0) + except SystemExit: + os._exit(0) # pylint: disable=protected-access + except Exception: # pylint: disable=broad-except + remove_experiment_folder(OUT_PATH) + traceback.print_exc() + sys.exit(1) diff --git a/TTS/vocoder/configs/wavernn_config.json b/TTS/vocoder/configs/wavernn_config.json new file mode 100644 index 00000000..f7e5d99f --- /dev/null +++ b/TTS/vocoder/configs/wavernn_config.json @@ -0,0 +1,95 @@ +{ + "model": "wavernn", + "run_name": "wavernn_test", + "run_description": "wavernn_test training", + + // AUDIO PARAMETERS + "audio":{ + "fft_size": 1024, // number of stft frequency levels. Size of the linear spectogram frame. + "win_length": 1024, // stft window length in ms. + "hop_length": 256, // stft window hop-lengh in ms. + "frame_length_ms": null, // stft window length in ms.If null, 'win_length' is used. + "frame_shift_ms": null, // stft window hop-lengh in ms. If null, 'hop_length' is used. + + // Audio processing parameters + "sample_rate": 22050, // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled. + "preemphasis": 0.98, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis. + "ref_level_db": 20, // reference level db, theoretically 20db is the sound of air. + + // Silence trimming + "do_trim_silence": false,// enable trimming of slience of audio as you load it. LJspeech (false), TWEB (false), Nancy (true) + "trim_db": 60, // threshold for timming silence. Set this according to your dataset. + + // MelSpectrogram parameters + "num_mels": 80, // size of the mel spec frame. + "mel_fmin": 40.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!! + "mel_fmax": 8000.0, // maximum freq level for mel-spec. Tune for dataset!! + "spec_gain": 20.0, // scaler value appplied after log transform of spectrogram. + + // Normalization parameters + "signal_norm": true, // normalize spec values. 
Mean-Var normalization if 'stats_path' is defined otherwise range normalization defined by the other params. + "min_level_db": -100, // lower bound for normalization + "symmetric_norm": true, // move normalization to range [-1, 1] + "max_norm": 4.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm] + "clip_norm": true, // clip normalized values into the range. + "stats_path": null // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based notmalization is used and other normalization params are ignored + }, + + // Generating / Synthesizing + "batched": true, + "target_samples": 11000, // target number of samples to be generated in each batch entry + "overlap_samples": 550, // number of samples for crossfading between batches + + // DISTRIBUTED TRAINING + // "distributed":{ + // "backend": "nccl", + // "url": "tcp:\/\/localhost:54321" + // }, + + // MODEL PARAMETERS + "use_aux_net": true, + "use_upsample_net": true, + "upsample_factors": [4, 8, 8], // this needs to correctly factorise hop_length + "seq_len": 1280, // has to be devideable by hop_length + "mode": "mold", // mold [string], gauss [string], bits [int] + "mulaw": false, // apply mulaw if mode is bits + "padding": 2, // pad the input for resnet to see wider input length + + // DATASET + "data_path": "/media/alexander/LinuxFS/SpeechData/GothicSpeech/NPC_Speech/", // path containing training wav files + "feature_path": "/media/alexander/LinuxFS/SpeechData/GothicSpeech/NPC_Speech_Computed/mel/", // path containing extracted features .npy (mels / quant) + + // TRAINING + "batch_size": 32, // Batch size for training. Lower values than 32 might cause hard to learn attention. + "epochs": 10000, // total number of epochs to train. + "warmup_steps": 10, + + // VALIDATION + "run_eval": true, + "test_delay_epochs": 10, // early testing only wastes computation time. + + // OPTIMIZER + "grad_clip": 4, // apply gradient clipping if > 0 + "lr_scheduler": "MultiStepLR", // one of the schedulers from https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate + "lr_scheduler_params": { + "gamma": 0.5, + "milestones": [200000, 400000, 600000] + }, + "lr": 1e-4, // initial learning rate + + // TENSORBOARD and LOGGING + "print_step": 25, // Number of steps to log traning on console. + "print_eval": false, // If True, it prints loss values for each step in eval run. + "save_step": 25000, // Number of training steps expected to plot training stats on TB and save model checkpoints. + "checkpoint": true, // If true, it saves checkpoints per "save_step" + "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging. + + // DATA LOADING + "num_loader_workers": 4, // number of training data loader processes. Don't set it too big. 4-8 are good values. + "num_val_loader_workers": 4, // number of evaluation data loader processes. 
+ "eval_split_size": 50, // number of samples for testing + + // PATHS + "output_path": "/media/alexander/LinuxFS/Projects/wavernn/Trainings/" +} + diff --git a/TTS/vocoder/datasets/preprocess.py b/TTS/vocoder/datasets/preprocess.py index be60c13a..a5365686 100644 --- a/TTS/vocoder/datasets/preprocess.py +++ b/TTS/vocoder/datasets/preprocess.py @@ -23,8 +23,12 @@ def load_wav_data(data_path, eval_split_size): def load_wav_feat_data(data_path, feat_path, eval_split_size): - wav_paths = sorted(find_wav_files(data_path)) - feat_paths = sorted(find_feat_files(feat_path)) + wav_paths = find_wav_files(data_path) + feat_paths = find_feat_files(feat_path) + + wav_paths.sort(key=lambda x: Path(x).stem) + feat_paths.sort(key=lambda x: Path(x).stem) + assert len(wav_paths) == len(feat_paths) for wav, feat in zip(wav_paths, feat_paths): wav_name = Path(wav).stem diff --git a/TTS/vocoder/utils/generic_utils.py b/TTS/vocoder/utils/generic_utils.py index 89dc68fb..365d0e11 100644 --- a/TTS/vocoder/utils/generic_utils.py +++ b/TTS/vocoder/utils/generic_utils.py @@ -41,6 +41,26 @@ def to_camel(text): text = text.capitalize() return re.sub(r'(?!^)_([a-zA-Z])', lambda m: m.group(1).upper(), text) +def setup_wavernn(c): + print(" > Model: {}".format(c.model)) + MyModel = importlib.import_module('TTS.vocoder.models.wavernn') + MyModel = getattr(MyModel, "WaveRNN") + model = MyModel( + rnn_dims=512, + fc_dims=512, + mode=c.mode, + mulaw=c.mulaw, + pad=c.padding, + use_aux_net=c.use_aux_net, + use_upsample_net=c.use_upsample_net, + upsample_factors=c.upsample_factors, + feat_dims=80, + compute_dims=128, + res_out_dims=128, + res_blocks=10, + hop_length=c.audio['hop_length'], + sample_rate=c.audio['sample_rate']) + return model def setup_generator(c): print(" > Generator Model: {}".format(c.generator_model)) From 72bd90b497f4e406b3e93ab4a0e77afa2e890e31 Mon Sep 17 00:00:00 2001 From: Alex K Date: Thu, 15 Oct 2020 19:15:53 +0200 Subject: [PATCH 06/16] wavernn stuff... --- TTS/vocoder/datasets/wavernn_dataset.py | 96 +++++ TTS/vocoder/models/wavernn.py | 485 ++++++++++++++++++++++++ TTS/vocoder/utils/distribution.py | 155 ++++++++ 3 files changed, 736 insertions(+) create mode 100644 TTS/vocoder/datasets/wavernn_dataset.py create mode 100644 TTS/vocoder/models/wavernn.py create mode 100644 TTS/vocoder/utils/distribution.py diff --git a/TTS/vocoder/datasets/wavernn_dataset.py b/TTS/vocoder/datasets/wavernn_dataset.py new file mode 100644 index 00000000..b5a7fdad --- /dev/null +++ b/TTS/vocoder/datasets/wavernn_dataset.py @@ -0,0 +1,96 @@ +import os +import glob +import torch +import numpy as np +from torch.utils.data import Dataset + + +class WaveRNNDataset(Dataset): + """ + WaveRNN Dataset searchs for all the wav files under root path + and converts them to acoustic features on the fly. 
+ """ + + def __init__( + self, + ap, + items, + seq_len, + hop_len, + pad, + mode, + is_training=True, + return_segments=True, + use_cache=False, + verbose=False, + ): + + self.ap = ap + self.item_list = items + self.seq_len = seq_len + self.hop_len = hop_len + self.pad = pad + self.mode = mode + self.is_training = is_training + self.return_segments = return_segments + self.use_cache = use_cache + self.verbose = verbose + + # wav_files = [f"{self.path}wavs/{file}.wav" for file in self.metadata] + # with Pool(4) as pool: + # self.wav_cache = pool.map(self.ap.load_wav, wav_files) + + def __len__(self): + return len(self.item_list) + + def __getitem__(self, index): + item = self.load_item(index) + return item + + def load_item(self, index): + wavpath, feat_path = self.item_list[index] + m = np.load(feat_path.replace("/quant/", "/mel/")) + # x = self.wav_cache[index] + if 5 > m.shape[-1]: + print(" [!] Instance is too short! : {}".format(wavpath)) + self.item_list[index] = self.item_list[index + 1] + feat_path = self.item_list[index] + m = np.load(feat_path.replace("/quant/", "/mel/")) + if self.mode in ["gauss", "mold"]: + x = self.ap.load_wav(wavpath) + elif isinstance(self.mode, int): + x = np.load(feat_path.replace("/mel/", "/quant/")) + else: + raise RuntimeError("Unknown dataset mode - ", self.mode) + return m, x + + def collate(self, batch): + mel_win = self.seq_len // self.hop_len + 2 * self.pad + max_offsets = [x[0].shape[-1] - (mel_win + 2 * self.pad) for x in batch] + mel_offsets = [np.random.randint(0, offset) for offset in max_offsets] + sig_offsets = [(offset + self.pad) * self.hop_len for offset in mel_offsets] + + mels = [ + x[0][:, mel_offsets[i] : mel_offsets[i] + mel_win] + for i, x in enumerate(batch) + ] + + coarse = [ + x[1][sig_offsets[i] : sig_offsets[i] + self.seq_len + 1] + for i, x in enumerate(batch) + ] + + mels = np.stack(mels).astype(np.float32) + if self.mode in ["gauss", "mold"]: + coarse = np.stack(coarse).astype(np.float32) + coarse = torch.FloatTensor(coarse) + x_input = coarse[:, : self.seq_len] + elif isinstance(self.mode, int): + coarse = np.stack(coarse).astype(np.int64) + coarse = torch.LongTensor(coarse) + x_input = ( + 2 * coarse[:, : self.seq_len].float() / (2 ** self.mode - 1.0) - 1.0 + ) + y_coarse = coarse[:, 1:] + mels = torch.FloatTensor(mels) + return x_input, mels, y_coarse diff --git a/TTS/vocoder/models/wavernn.py b/TTS/vocoder/models/wavernn.py new file mode 100644 index 00000000..e1c4365f --- /dev/null +++ b/TTS/vocoder/models/wavernn.py @@ -0,0 +1,485 @@ +import sys +import torch +import torch.nn as nn +import numpy as np +import torch.nn.functional as F +import time + +# fix this +from TTS.utils.audio import AudioProcessor as ap +from TTS.vocoder.utils.distribution import ( + sample_from_gaussian, + sample_from_discretized_mix_logistic, +) + + +def stream(string, variables): + sys.stdout.write(f"\r{string}" % variables) + + +class ResBlock(nn.Module): + def __init__(self, dims): + super().__init__() + self.conv1 = nn.Conv1d(dims, dims, kernel_size=1, bias=False) + self.conv2 = nn.Conv1d(dims, dims, kernel_size=1, bias=False) + self.batch_norm1 = nn.BatchNorm1d(dims) + self.batch_norm2 = nn.BatchNorm1d(dims) + + def forward(self, x): + residual = x + x = self.conv1(x) + x = self.batch_norm1(x) + x = F.relu(x) + x = self.conv2(x) + x = self.batch_norm2(x) + return x + residual + + +class MelResNet(nn.Module): + def __init__(self, res_blocks, in_dims, compute_dims, res_out_dims, pad): + super().__init__() + k_size = pad * 2 + 1 + 
self.conv_in = nn.Conv1d(in_dims, compute_dims, kernel_size=k_size, bias=False) + self.batch_norm = nn.BatchNorm1d(compute_dims) + self.layers = nn.ModuleList() + for i in range(res_blocks): + self.layers.append(ResBlock(compute_dims)) + self.conv_out = nn.Conv1d(compute_dims, res_out_dims, kernel_size=1) + + def forward(self, x): + x = self.conv_in(x) + x = self.batch_norm(x) + x = F.relu(x) + for f in self.layers: + x = f(x) + x = self.conv_out(x) + return x + + +class Stretch2d(nn.Module): + def __init__(self, x_scale, y_scale): + super().__init__() + self.x_scale = x_scale + self.y_scale = y_scale + + def forward(self, x): + b, c, h, w = x.size() + x = x.unsqueeze(-1).unsqueeze(3) + x = x.repeat(1, 1, 1, self.y_scale, 1, self.x_scale) + return x.view(b, c, h * self.y_scale, w * self.x_scale) + + +class UpsampleNetwork(nn.Module): + def __init__( + self, + feat_dims, + upsample_scales, + compute_dims, + res_blocks, + res_out_dims, + pad, + use_aux_net, + ): + super().__init__() + self.total_scale = np.cumproduct(upsample_scales)[-1] + self.indent = pad * self.total_scale + self.use_aux_net = use_aux_net + if use_aux_net: + self.resnet = MelResNet( + res_blocks, feat_dims, compute_dims, res_out_dims, pad + ) + self.resnet_stretch = Stretch2d(self.total_scale, 1) + self.up_layers = nn.ModuleList() + for scale in upsample_scales: + k_size = (1, scale * 2 + 1) + padding = (0, scale) + stretch = Stretch2d(scale, 1) + conv = nn.Conv2d(1, 1, kernel_size=k_size, padding=padding, bias=False) + conv.weight.data.fill_(1.0 / k_size[1]) + self.up_layers.append(stretch) + self.up_layers.append(conv) + + def forward(self, m): + if self.use_aux_net: + aux = self.resnet(m).unsqueeze(1) + aux = self.resnet_stretch(aux) + aux = aux.squeeze(1) + aux = aux.transpose(1, 2) + else: + aux = None + m = m.unsqueeze(1) + for f in self.up_layers: + m = f(m) + m = m.squeeze(1)[:, :, self.indent : -self.indent] + return m.transpose(1, 2), aux + + +class Upsample(nn.Module): + def __init__( + self, scale, pad, res_blocks, feat_dims, compute_dims, res_out_dims, use_aux_net + ): + super().__init__() + self.scale = scale + self.pad = pad + self.indent = pad * scale + self.use_aux_net = use_aux_net + self.resnet = MelResNet(res_blocks, feat_dims, compute_dims, res_out_dims, pad) + + def forward(self, m): + if self.use_aux_net: + aux = self.resnet(m) + aux = torch.nn.functional.interpolate( + aux, scale_factor=self.scale, mode="linear", align_corners=True + ) + aux = aux.transpose(1, 2) + else: + aux = None + m = torch.nn.functional.interpolate( + m, scale_factor=self.scale, mode="linear", align_corners=True + ) + m = m[:, :, self.indent : -self.indent] + m = m * 0.045 # empirically found + + return m.transpose(1, 2), aux + + +class WaveRNN(nn.Module): + def __init__( + self, + rnn_dims, + fc_dims, + mode, + mulaw, + pad, + use_aux_net, + use_upsample_net, + upsample_factors, + feat_dims, + compute_dims, + res_out_dims, + res_blocks, + hop_length, + sample_rate, + ): + super().__init__() + self.mode = mode + self.mulaw = mulaw + self.pad = pad + self.use_upsample_net = use_upsample_net + self.use_aux_net = use_aux_net + if isinstance(self.mode, int): + self.n_classes = 2 ** self.mode + elif self.mode == "mold": + self.n_classes = 3 * 10 + elif self.mode == "gauss": + self.n_classes = 2 + else: + raise RuntimeError(" > Unknown training mode") + + self.rnn_dims = rnn_dims + self.aux_dims = res_out_dims // 4 + self.hop_length = hop_length + self.sample_rate = sample_rate + + if self.use_upsample_net: + assert ( + 
np.cumproduct(upsample_factors)[-1] == self.hop_length + ), " [!] upsample scales needs to be equal to hop_length" + self.upsample = UpsampleNetwork( + feat_dims, + upsample_factors, + compute_dims, + res_blocks, + res_out_dims, + pad, + use_aux_net, + ) + else: + self.upsample = Upsample( + hop_length, + pad, + res_blocks, + feat_dims, + compute_dims, + res_out_dims, + use_aux_net, + ) + if self.use_aux_net: + self.I = nn.Linear(feat_dims + self.aux_dims + 1, rnn_dims) + self.rnn1 = nn.GRU(rnn_dims, rnn_dims, batch_first=True) + self.rnn2 = nn.GRU(rnn_dims + self.aux_dims, rnn_dims, batch_first=True) + self.fc1 = nn.Linear(rnn_dims + self.aux_dims, fc_dims) + self.fc2 = nn.Linear(fc_dims + self.aux_dims, fc_dims) + self.fc3 = nn.Linear(fc_dims, self.n_classes) + else: + self.I = nn.Linear(feat_dims + 1, rnn_dims) + self.rnn1 = nn.GRU(rnn_dims, rnn_dims, batch_first=True) + self.rnn2 = nn.GRU(rnn_dims, rnn_dims, batch_first=True) + self.fc1 = nn.Linear(rnn_dims, fc_dims) + self.fc2 = nn.Linear(fc_dims, fc_dims) + self.fc3 = nn.Linear(fc_dims, self.n_classes) + + def forward(self, x, mels): + bsize = x.size(0) + h1 = torch.zeros(1, bsize, self.rnn_dims).cuda() + h2 = torch.zeros(1, bsize, self.rnn_dims).cuda() + mels, aux = self.upsample(mels) + + if self.use_aux_net: + aux_idx = [self.aux_dims * i for i in range(5)] + a1 = aux[:, :, aux_idx[0] : aux_idx[1]] + a2 = aux[:, :, aux_idx[1] : aux_idx[2]] + a3 = aux[:, :, aux_idx[2] : aux_idx[3]] + a4 = aux[:, :, aux_idx[3] : aux_idx[4]] + + x = ( + torch.cat([x.unsqueeze(-1), mels, a1], dim=2) + if self.use_aux_net + else torch.cat([x.unsqueeze(-1), mels], dim=2) + ) + x = self.I(x) + res = x + self.rnn1.flatten_parameters() + x, _ = self.rnn1(x, h1) + + x = x + res + res = x + x = torch.cat([x, a2], dim=2) if self.use_aux_net else x + self.rnn2.flatten_parameters() + x, _ = self.rnn2(x, h2) + + x = x + res + x = torch.cat([x, a3], dim=2) if self.use_aux_net else x + x = F.relu(self.fc1(x)) + + x = torch.cat([x, a4], dim=2) if self.use_aux_net else x + x = F.relu(self.fc2(x)) + return self.fc3(x) + + def generate(self, mels, batched, target, overlap): + + self.eval() + output = [] + start = time.time() + rnn1 = self.get_gru_cell(self.rnn1) + rnn2 = self.get_gru_cell(self.rnn2) + + with torch.no_grad(): + + mels = torch.FloatTensor(mels).cuda().unsqueeze(0) + wave_len = (mels.size(-1) - 1) * self.hop_length + mels = self.pad_tensor(mels.transpose(1, 2), pad=self.pad, side="both") + mels, aux = self.upsample(mels.transpose(1, 2)) + + if batched: + mels = self.fold_with_overlap(mels, target, overlap) + if aux is not None: + aux = self.fold_with_overlap(aux, target, overlap) + + b_size, seq_len, _ = mels.size() + + h1 = torch.zeros(b_size, self.rnn_dims).cuda() + h2 = torch.zeros(b_size, self.rnn_dims).cuda() + x = torch.zeros(b_size, 1).cuda() + + if self.use_aux_net: + d = self.aux_dims + aux_split = [aux[:, :, d * i : d * (i + 1)] for i in range(4)] + + for i in range(seq_len): + + m_t = mels[:, i, :] + + if self.use_aux_net: + a1_t, a2_t, a3_t, a4_t = (a[:, i, :] for a in aux_split) + + x = ( + torch.cat([x, m_t, a1_t], dim=1) + if self.use_aux_net + else torch.cat([x, m_t], dim=1) + ) + x = self.I(x) + h1 = rnn1(x, h1) + + x = x + h1 + inp = torch.cat([x, a2_t], dim=1) if self.use_aux_net else x + h2 = rnn2(inp, h2) + + x = x + h2 + x = torch.cat([x, a3_t], dim=1) if self.use_aux_net else x + x = F.relu(self.fc1(x)) + + x = torch.cat([x, a4_t], dim=1) if self.use_aux_net else x + x = F.relu(self.fc2(x)) + + logits = self.fc3(x) + + if self.mode 
== "mold": + sample = sample_from_discretized_mix_logistic( + logits.unsqueeze(0).transpose(1, 2) + ) + output.append(sample.view(-1)) + x = sample.transpose(0, 1).cuda() + elif self.mode == "gauss": + sample = sample_from_gaussian(logits.unsqueeze(0).transpose(1, 2)) + output.append(sample.view(-1)) + x = sample.transpose(0, 1).cuda() + elif isinstance(self.mode, int): + posterior = F.softmax(logits, dim=1) + distrib = torch.distributions.Categorical(posterior) + + sample = 2 * distrib.sample().float() / (self.n_classes - 1.0) - 1.0 + output.append(sample) + x = sample.unsqueeze(-1) + else: + raise RuntimeError("Unknown model mode value - ", self.mode) + + if i % 100 == 0: + self.gen_display(i, seq_len, b_size, start) + + output = torch.stack(output).transpose(0, 1) + output = output.cpu().numpy() + output = output.astype(np.float64) + + if batched: + output = self.xfade_and_unfold(output, target, overlap) + else: + output = output[0] + + if self.mulaw and isinstance(self.mode, int): + output = ap.mulaw_decode(output, self.mode) + + # Fade-out at the end to avoid signal cutting out suddenly + fade_out = np.linspace(1, 0, 20 * self.hop_length) + output = output[:wave_len] + output[-20 * self.hop_length :] *= fade_out + + self.train() + return output + + def gen_display(self, i, seq_len, b_size, start): + gen_rate = (i + 1) / (time.time() - start) * b_size / 1000 + realtime_ratio = gen_rate * 1000 / self.sample_rate + stream( + "%i/%i -- batch_size: %i -- gen_rate: %.1f kHz -- x_realtime: %.1f ", + (i * b_size, seq_len * b_size, b_size, gen_rate, realtime_ratio), + ) + + def get_gru_cell(self, gru): + gru_cell = nn.GRUCell(gru.input_size, gru.hidden_size) + gru_cell.weight_hh.data = gru.weight_hh_l0.data + gru_cell.weight_ih.data = gru.weight_ih_l0.data + gru_cell.bias_hh.data = gru.bias_hh_l0.data + gru_cell.bias_ih.data = gru.bias_ih_l0.data + return gru_cell + + def pad_tensor(self, x, pad, side="both"): + # NB - this is just a quick method i need right now + # i.e., it won't generalise to other shapes/dims + b, t, c = x.size() + total = t + 2 * pad if side == "both" else t + pad + padded = torch.zeros(b, total, c).cuda() + if side == "before" or side == "both": + padded[:, pad : pad + t, :] = x + elif side == "after": + padded[:, :t, :] = x + return padded + + def fold_with_overlap(self, x, target, overlap): + + """Fold the tensor with overlap for quick batched inference. + Overlap will be used for crossfading in xfade_and_unfold() + Args: + x (tensor) : Upsampled conditioning features. + shape=(1, timesteps, features) + target (int) : Target timesteps for each index of batch + overlap (int) : Timesteps for both xfade and rnn warmup + Return: + (tensor) : shape=(num_folds, target + 2 * overlap, features) + Details: + x = [[h1, h2, ... 
hn]] + Where each h is a vector of conditioning features + Eg: target=2, overlap=1 with x.size(1)=10 + folded = [[h1, h2, h3, h4], + [h4, h5, h6, h7], + [h7, h8, h9, h10]] + """ + + _, total_len, features = x.size() + + # Calculate variables needed + num_folds = (total_len - overlap) // (target + overlap) + extended_len = num_folds * (overlap + target) + overlap + remaining = total_len - extended_len + + # Pad if some time steps poking out + if remaining != 0: + num_folds += 1 + padding = target + 2 * overlap - remaining + x = self.pad_tensor(x, padding, side="after") + + folded = torch.zeros(num_folds, target + 2 * overlap, features).cuda() + + # Get the values for the folded tensor + for i in range(num_folds): + start = i * (target + overlap) + end = start + target + 2 * overlap + folded[i] = x[:, start:end, :] + + return folded + + def xfade_and_unfold(self, y, target, overlap): + + """Applies a crossfade and unfolds into a 1d array. + Args: + y (ndarry) : Batched sequences of audio samples + shape=(num_folds, target + 2 * overlap) + dtype=np.float64 + overlap (int) : Timesteps for both xfade and rnn warmup + Return: + (ndarry) : audio samples in a 1d array + shape=(total_len) + dtype=np.float64 + Details: + y = [[seq1], + [seq2], + [seq3]] + Apply a gain envelope at both ends of the sequences + y = [[seq1_in, seq1_target, seq1_out], + [seq2_in, seq2_target, seq2_out], + [seq3_in, seq3_target, seq3_out]] + Stagger and add up the groups of samples: + [seq1_in, seq1_target, (seq1_out + seq2_in), seq2_target, ...] + """ + + num_folds, length = y.shape + target = length - 2 * overlap + total_len = num_folds * (target + overlap) + overlap + + # Need some silence for the rnn warmup + silence_len = overlap // 2 + fade_len = overlap - silence_len + silence = np.zeros((silence_len), dtype=np.float64) + + # Equal power crossfade + t = np.linspace(-1, 1, fade_len, dtype=np.float64) + fade_in = np.sqrt(0.5 * (1 + t)) + fade_out = np.sqrt(0.5 * (1 - t)) + + # Concat the silence to the fades + fade_in = np.concatenate([silence, fade_in]) + fade_out = np.concatenate([fade_out, silence]) + + # Apply the gain to the overlap samples + y[:, :overlap] *= fade_in + y[:, -overlap:] *= fade_out + + unfolded = np.zeros((total_len), dtype=np.float64) + + # Loop to add up all the samples + for i in range(num_folds): + start = i * (target + overlap) + end = start + target + 2 * overlap + unfolded[start:end] += y[i] + + return unfolded diff --git a/TTS/vocoder/utils/distribution.py b/TTS/vocoder/utils/distribution.py new file mode 100644 index 00000000..bfcbdd3f --- /dev/null +++ b/TTS/vocoder/utils/distribution.py @@ -0,0 +1,155 @@ +import numpy as np +import math +import torch +from torch.distributions.normal import Normal +import torch.nn.functional as F + + +def gaussian_loss(y_hat, y, log_std_min=-7.0): + assert y_hat.dim() == 3 + assert y_hat.size(2) == 2 + mean = y_hat[:, :, :1] + log_std = torch.clamp(y_hat[:, :, 1:], min=log_std_min) + # TODO: replace with pytorch dist + log_probs = -0.5 * (- math.log(2.0 * math.pi) - 2. 
* log_std - torch.pow(y - mean, 2) * torch.exp((-2.0 * log_std))) + return log_probs.squeeze().mean() + + +def sample_from_gaussian(y_hat, log_std_min=-7.0, scale_factor=1.0): + assert y_hat.size(2) == 2 + mean = y_hat[:, :, :1] + log_std = torch.clamp(y_hat[:, :, 1:], min=log_std_min) + dist = Normal(mean, torch.exp(log_std), ) + sample = dist.sample() + sample = torch.clamp(torch.clamp(sample, min=-scale_factor), max=scale_factor) + del dist + return sample + + +def log_sum_exp(x): + """ numerically stable log_sum_exp implementation that prevents overflow """ + # TF ordering + axis = len(x.size()) - 1 + m, _ = torch.max(x, dim=axis) + m2, _ = torch.max(x, dim=axis, keepdim=True) + return m + torch.log(torch.sum(torch.exp(x - m2), dim=axis)) + + +# It is adapted from https://github.com/r9y9/wavenet_vocoder/blob/master/wavenet_vocoder/mixture.py +def discretized_mix_logistic_loss(y_hat, y, num_classes=65536, + log_scale_min=None, reduce=True): + if log_scale_min is None: + log_scale_min = float(np.log(1e-14)) + y_hat = y_hat.permute(0,2,1) + assert y_hat.dim() == 3 + assert y_hat.size(1) % 3 == 0 + nr_mix = y_hat.size(1) // 3 + + # (B x T x C) + y_hat = y_hat.transpose(1, 2) + + # unpack parameters. (B, T, num_mixtures) x 3 + logit_probs = y_hat[:, :, :nr_mix] + means = y_hat[:, :, nr_mix:2 * nr_mix] + log_scales = torch.clamp(y_hat[:, :, 2 * nr_mix:3 * nr_mix], min=log_scale_min) + + # B x T x 1 -> B x T x num_mixtures + y = y.expand_as(means) + + centered_y = y - means + inv_stdv = torch.exp(-log_scales) + plus_in = inv_stdv * (centered_y + 1. / (num_classes - 1)) + cdf_plus = torch.sigmoid(plus_in) + min_in = inv_stdv * (centered_y - 1. / (num_classes - 1)) + cdf_min = torch.sigmoid(min_in) + + # log probability for edge case of 0 (before scaling) + # equivalent: torch.log(F.sigmoid(plus_in)) + log_cdf_plus = plus_in - F.softplus(plus_in) + + # log probability for edge case of 255 (before scaling) + # equivalent: (1 - F.sigmoid(min_in)).log() + log_one_minus_cdf_min = -F.softplus(min_in) + + # probability for all other cases + cdf_delta = cdf_plus - cdf_min + + mid_in = inv_stdv * centered_y + # log probability in the center of the bin, to be used in extreme cases + # (not actually used in our code) + log_pdf_mid = mid_in - log_scales - 2. * F.softplus(mid_in) + + # tf equivalent + """ + log_probs = tf.where(x < -0.999, log_cdf_plus, + tf.where(x > 0.999, log_one_minus_cdf_min, + tf.where(cdf_delta > 1e-5, + tf.log(tf.maximum(cdf_delta, 1e-12)), + log_pdf_mid - np.log(127.5)))) + """ + # TODO: cdf_delta <= 1e-5 actually can happen. How can we choose the value + # for num_classes=65536 case? 1e-7? not sure.. + inner_inner_cond = (cdf_delta > 1e-5).float() + + inner_inner_out = inner_inner_cond * \ + torch.log(torch.clamp(cdf_delta, min=1e-12)) + \ + (1. - inner_inner_cond) * (log_pdf_mid - np.log((num_classes - 1) / 2)) + inner_cond = (y > 0.999).float() + inner_out = inner_cond * log_one_minus_cdf_min + (1. - inner_cond) * inner_inner_out + cond = (y < -0.999).float() + log_probs = cond * log_cdf_plus + (1. - cond) * inner_out + + log_probs = log_probs + F.log_softmax(logit_probs, -1) + + if reduce: + return -torch.mean(log_sum_exp(log_probs)) + else: + return -log_sum_exp(log_probs).unsqueeze(-1) + + +def sample_from_discretized_mix_logistic(y, log_scale_min=None): + """ + Sample from discretized mixture of logistic distributions + Args: + y (Tensor): B x C x T + log_scale_min (float): Log scale minimum value + Returns: + Tensor: sample in range of [-1, 1]. 
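As the TODO in gaussian_loss notes, the expression above is just the mean negative log-likelihood of a Normal distribution. A minimal sanity check, assuming the TTS.vocoder.utils.distribution module added by this patch:

    import torch
    from torch.distributions.normal import Normal
    from TTS.vocoder.utils.distribution import gaussian_loss

    y_hat = torch.randn(2, 10, 2)    # [..., 0] = mean, [..., 1] = log_std
    y = torch.randn(2, 10, 1)
    mean, log_std = y_hat[:, :, :1], y_hat[:, :, 1:]   # log_std stays above the -7 clamp here
    reference = -Normal(mean, log_std.exp()).log_prob(y).mean()
    assert torch.allclose(gaussian_loss(y_hat, y), reference, atol=1e-5)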
+ """ + if log_scale_min is None: + log_scale_min = float(np.log(1e-14)) + assert y.size(1) % 3 == 0 + nr_mix = y.size(1) // 3 + + # B x T x C + y = y.transpose(1, 2) + logit_probs = y[:, :, :nr_mix] + + # sample mixture indicator from softmax + temp = logit_probs.data.new(logit_probs.size()).uniform_(1e-5, 1.0 - 1e-5) + temp = logit_probs.data - torch.log(- torch.log(temp)) + _, argmax = temp.max(dim=-1) + + # (B, T) -> (B, T, nr_mix) + one_hot = to_one_hot(argmax, nr_mix) + # select logistic parameters + means = torch.sum(y[:, :, nr_mix:2 * nr_mix] * one_hot, dim=-1) + log_scales = torch.clamp(torch.sum( + y[:, :, 2 * nr_mix:3 * nr_mix] * one_hot, dim=-1), min=log_scale_min) + # sample from logistic & clip to interval + # we don't actually round to the nearest 8bit value when sampling + u = means.data.new(means.size()).uniform_(1e-5, 1.0 - 1e-5) + x = means + torch.exp(log_scales) * (torch.log(u) - torch.log(1. - u)) + + x = torch.clamp(torch.clamp(x, min=-1.), max=1.) + + return x + + +def to_one_hot(tensor, n, fill_with=1.): + # we perform one hot encore with respect to the last axis + one_hot = torch.FloatTensor(tensor.size() + (n,)).zero_() + if tensor.is_cuda: + one_hot = one_hot.cuda() + one_hot.scatter_(len(tensor.size()), tensor.unsqueeze(-1), fill_with) + return one_hot From 9a120f28edbb47f771db0b9e48be03a504e895d3 Mon Sep 17 00:00:00 2001 From: sanjaesc Date: Fri, 16 Oct 2020 21:19:51 +0200 Subject: [PATCH 07/16] some minor changes to wavernn --- TTS/bin/train_wavernn_vocoder.py | 31 ++++--- TTS/vocoder/configs/wavernn_config.json | 8 +- TTS/vocoder/datasets/wavernn_dataset.py | 11 +-- TTS/vocoder/utils/generic_utils.py | 112 +++++++++++++----------- 4 files changed, 82 insertions(+), 80 deletions(-) diff --git a/TTS/bin/train_wavernn_vocoder.py b/TTS/bin/train_wavernn_vocoder.py index 2f77ab57..e2b8057e 100644 --- a/TTS/bin/train_wavernn_vocoder.py +++ b/TTS/bin/train_wavernn_vocoder.py @@ -13,17 +13,13 @@ import torch from torch.utils.data import DataLoader from torch.utils.data.distributed import DistributedSampler - -from TTS.utils.audio import AudioProcessor from TTS.tts.utils.visual import plot_spectrogram +from TTS.utils.audio import AudioProcessor +from TTS.utils.radam import RAdam from TTS.utils.io import copy_config_file, load_config -from TTS.vocoder.datasets.wavernn_dataset import WaveRNNDataset -from TTS.utils.tensorboard_logger import TensorboardLogger -from TTS.vocoder.datasets.preprocess import load_wav_data, load_wav_feat_data -from TTS.vocoder.utils.distribution import discretized_mix_logistic_loss, gaussian_loss -from TTS.vocoder.utils.generic_utils import setup_wavernn from TTS.utils.training import setup_torch_training_env from TTS.utils.console_logger import ConsoleLogger +from TTS.utils.tensorboard_logger import TensorboardLogger from TTS.utils.generic_utils import ( KeepAverage, count_parameters, @@ -32,6 +28,10 @@ from TTS.utils.generic_utils import ( remove_experiment_folder, set_init_dict, ) +from TTS.vocoder.datasets.wavernn_dataset import WaveRNNDataset +from TTS.vocoder.datasets.preprocess import load_wav_data, load_wav_feat_data +from TTS.vocoder.utils.distribution import discretized_mix_logistic_loss, gaussian_loss +from TTS.vocoder.utils.generic_utils import setup_wavernn from TTS.vocoder.utils.io import save_best_model, save_checkpoint @@ -105,9 +105,7 @@ def train(model, optimizer, criterion, scheduler, ap, global_step, epoch): # MODEL TRAINING # ################## y_hat = model(x, m) - y_hat_vis = y_hat # for visualization - # y_hat = 
y_hat.transpose(1, 2) if isinstance(model.mode, int): y_hat = y_hat.transpose(1, 2).unsqueeze(-1) else: @@ -200,8 +198,8 @@ def train(model, optimizer, criterion, scheduler, ap, global_step, epoch): ) # compute spectrograms figures = { - "prediction": plot_spectrogram(predict_mel.T, ap, output_fig=False), - "ground_truth": plot_spectrogram(ground_mel.T, ap, output_fig=False), + "prediction": plot_spectrogram(predict_mel.T), + "ground_truth": plot_spectrogram(ground_mel.T), } tb_logger.tb_train_figures(global_step, figures) end_time = time.time() @@ -237,6 +235,7 @@ def evaluate(model, criterion, ap, global_step, epoch): global_step += 1 y_hat = model(x, m) + y_hat_viz = y_hat # for vizualization if isinstance(model.mode, int): y_hat = y_hat.transpose(1, 2).unsqueeze(-1) else: @@ -266,7 +265,7 @@ def evaluate(model, criterion, ap, global_step, epoch): if epoch > CONFIG.test_delay_epochs: # synthesize a full voice - wav_path = eval_data[random.randrange(0, len(eval_data))][0] + wav_path = train_data[random.randrange(0, len(train_data))][0] wav = ap.load_wav(wav_path) ground_mel = ap.melspectrogram(wav) sample_wav = model.generate( @@ -283,8 +282,8 @@ def evaluate(model, criterion, ap, global_step, epoch): ) # compute spectrograms figures = { - "prediction": plot_spectrogram(predict_mel.T, ap, output_fig=False), - "ground_truth": plot_spectrogram(ground_mel.T, ap, output_fig=False), + "eval/prediction": plot_spectrogram(predict_mel.T), + "eval/ground_truth": plot_spectrogram(ground_mel.T), } tb_logger.tb_eval_figures(global_step, figures) @@ -303,7 +302,6 @@ def main(args): # pylint: disable=redefined-outer-name eval_data, train_data = load_wav_feat_data( CONFIG.data_path, CONFIG.feature_path, CONFIG.eval_split_size ) - eval_data, train_data = eval_data, train_data else: eval_data, train_data = load_wav_data(CONFIG.data_path, CONFIG.eval_split_size) @@ -326,7 +324,8 @@ def main(args): # pylint: disable=redefined-outer-name if isinstance(CONFIG.mode, int): criterion.cuda() - optimizer = optim.Adam(model_wavernn.parameters(), lr=CONFIG.lr, weight_decay=0) + optimizer = RAdam(model_wavernn.parameters(), lr=CONFIG.lr, weight_decay=0) + scheduler = None if "lr_scheduler" in CONFIG: scheduler = getattr(torch.optim.lr_scheduler, CONFIG.lr_scheduler) diff --git a/TTS/vocoder/configs/wavernn_config.json b/TTS/vocoder/configs/wavernn_config.json index f7e5d99f..67503aef 100644 --- a/TTS/vocoder/configs/wavernn_config.json +++ b/TTS/vocoder/configs/wavernn_config.json @@ -1,5 +1,4 @@ { - "model": "wavernn", "run_name": "wavernn_test", "run_description": "wavernn_test training", @@ -54,13 +53,14 @@ "mode": "mold", // mold [string], gauss [string], bits [int] "mulaw": false, // apply mulaw if mode is bits "padding": 2, // pad the input for resnet to see wider input length - + // DATASET + "use_gta": true, // use computed gta features from the tts model "data_path": "/media/alexander/LinuxFS/SpeechData/GothicSpeech/NPC_Speech/", // path containing training wav files - "feature_path": "/media/alexander/LinuxFS/SpeechData/GothicSpeech/NPC_Speech_Computed/mel/", // path containing extracted features .npy (mels / quant) + "feature_path": "/media/alexander/LinuxFS/SpeechData/GothicSpeech/NPC_Speech_Computed/mel/", // path containing computed features .npy (mels / quant) // TRAINING - "batch_size": 32, // Batch size for training. Lower values than 32 might cause hard to learn attention. + "batch_size": 64, // Batch size for training. Lower values than 32 might cause hard to learn attention. 
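The "mulaw" option above refers to standard mu-law companding of the target waveform when an integer bits mode is used (see ap.mulaw_encode / ap.mulaw_decode elsewhere in this patch). A self-contained numpy sketch of the idea; the helper names and the qc argument here are illustrative, not the TTS AudioProcessor API:

    import numpy as np

    def mulaw_encode(x, qc):
        # compress [-1, 1] audio and quantize it to qc discrete levels
        mu = qc - 1
        y = np.sign(x) * np.log1p(mu * np.abs(x)) / np.log1p(mu)
        return np.floor((y + 1) / 2 * mu + 0.5).astype(np.int64)

    def mulaw_decode(y, qc):
        # map the discrete levels back to [-1, 1] float audio
        mu = qc - 1
        y = 2 * y.astype(np.float64) / mu - 1
        return np.sign(y) / mu * ((1 + mu) ** np.abs(y) - 1)

    wav = 0.8 * np.sin(np.linspace(0, 4 * np.pi, 1000))
    restored = mulaw_decode(mulaw_encode(wav, qc=512), qc=512)
    print(np.abs(wav - restored).max())   # small quantization error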
"epochs": 10000, // total number of epochs to train. "warmup_steps": 10, diff --git a/TTS/vocoder/datasets/wavernn_dataset.py b/TTS/vocoder/datasets/wavernn_dataset.py index b5a7fdad..8faf5f3c 100644 --- a/TTS/vocoder/datasets/wavernn_dataset.py +++ b/TTS/vocoder/datasets/wavernn_dataset.py @@ -7,8 +7,7 @@ from torch.utils.data import Dataset class WaveRNNDataset(Dataset): """ - WaveRNN Dataset searchs for all the wav files under root path - and converts them to acoustic features on the fly. + WaveRNN Dataset searchs for all the wav files under root path. """ def __init__( @@ -20,8 +19,6 @@ class WaveRNNDataset(Dataset): pad, mode, is_training=True, - return_segments=True, - use_cache=False, verbose=False, ): @@ -32,14 +29,8 @@ class WaveRNNDataset(Dataset): self.pad = pad self.mode = mode self.is_training = is_training - self.return_segments = return_segments - self.use_cache = use_cache self.verbose = verbose - # wav_files = [f"{self.path}wavs/{file}.wav" for file in self.metadata] - # with Pool(4) as pool: - # self.wav_cache = pool.map(self.ap.load_wav, wav_files) - def __len__(self): return len(self.item_list) diff --git a/TTS/vocoder/utils/generic_utils.py b/TTS/vocoder/utils/generic_utils.py index 365d0e11..c73c5248 100644 --- a/TTS/vocoder/utils/generic_utils.py +++ b/TTS/vocoder/utils/generic_utils.py @@ -39,11 +39,12 @@ def plot_results(y_hat, y, ap, global_step, name_prefix): def to_camel(text): text = text.capitalize() - return re.sub(r'(?!^)_([a-zA-Z])', lambda m: m.group(1).upper(), text) + return re.sub(r"(?!^)_([a-zA-Z])", lambda m: m.group(1).upper(), text) + def setup_wavernn(c): - print(" > Model: {}".format(c.model)) - MyModel = importlib.import_module('TTS.vocoder.models.wavernn') + print(" > Model: WaveRNN") + MyModel = importlib.import_module("TTS.vocoder.models.wavernn") MyModel = getattr(MyModel, "WaveRNN") model = MyModel( rnn_dims=512, @@ -58,98 +59,109 @@ def setup_wavernn(c): compute_dims=128, res_out_dims=128, res_blocks=10, - hop_length=c.audio['hop_length'], - sample_rate=c.audio['sample_rate']) + hop_length=c.audio["hop_length"], + sample_rate=c.audio["sample_rate"], + ) return model + def setup_generator(c): print(" > Generator Model: {}".format(c.generator_model)) - MyModel = importlib.import_module('TTS.vocoder.models.' + - c.generator_model.lower()) + MyModel = importlib.import_module("TTS.vocoder.models." 
+ c.generator_model.lower()) MyModel = getattr(MyModel, to_camel(c.generator_model)) - if c.generator_model in 'melgan_generator': + if c.generator_model in "melgan_generator": model = MyModel( - in_channels=c.audio['num_mels'], + in_channels=c.audio["num_mels"], out_channels=1, proj_kernel=7, base_channels=512, - upsample_factors=c.generator_model_params['upsample_factors'], + upsample_factors=c.generator_model_params["upsample_factors"], res_kernel=3, - num_res_blocks=c.generator_model_params['num_res_blocks']) - if c.generator_model in 'melgan_fb_generator': + num_res_blocks=c.generator_model_params["num_res_blocks"], + ) + if c.generator_model in "melgan_fb_generator": pass - if c.generator_model in 'multiband_melgan_generator': + if c.generator_model in "multiband_melgan_generator": model = MyModel( - in_channels=c.audio['num_mels'], + in_channels=c.audio["num_mels"], out_channels=4, proj_kernel=7, base_channels=384, - upsample_factors=c.generator_model_params['upsample_factors'], + upsample_factors=c.generator_model_params["upsample_factors"], res_kernel=3, - num_res_blocks=c.generator_model_params['num_res_blocks']) - if c.generator_model in 'fullband_melgan_generator': + num_res_blocks=c.generator_model_params["num_res_blocks"], + ) + if c.generator_model in "fullband_melgan_generator": model = MyModel( - in_channels=c.audio['num_mels'], + in_channels=c.audio["num_mels"], out_channels=1, proj_kernel=7, base_channels=512, - upsample_factors=c.generator_model_params['upsample_factors'], + upsample_factors=c.generator_model_params["upsample_factors"], res_kernel=3, - num_res_blocks=c.generator_model_params['num_res_blocks']) - if c.generator_model in 'parallel_wavegan_generator': + num_res_blocks=c.generator_model_params["num_res_blocks"], + ) + if c.generator_model in "parallel_wavegan_generator": model = MyModel( in_channels=1, out_channels=1, kernel_size=3, - num_res_blocks=c.generator_model_params['num_res_blocks'], - stacks=c.generator_model_params['stacks'], + num_res_blocks=c.generator_model_params["num_res_blocks"], + stacks=c.generator_model_params["stacks"], res_channels=64, gate_channels=128, skip_channels=64, - aux_channels=c.audio['num_mels'], + aux_channels=c.audio["num_mels"], dropout=0.0, bias=True, use_weight_norm=True, - upsample_factors=c.generator_model_params['upsample_factors']) + upsample_factors=c.generator_model_params["upsample_factors"], + ) return model def setup_discriminator(c): print(" > Discriminator Model: {}".format(c.discriminator_model)) - if 'parallel_wavegan' in c.discriminator_model: + if "parallel_wavegan" in c.discriminator_model: MyModel = importlib.import_module( - 'TTS.vocoder.models.parallel_wavegan_discriminator') + "TTS.vocoder.models.parallel_wavegan_discriminator" + ) else: - MyModel = importlib.import_module('TTS.vocoder.models.' + - c.discriminator_model.lower()) + MyModel = importlib.import_module( + "TTS.vocoder.models." + c.discriminator_model.lower() + ) MyModel = getattr(MyModel, to_camel(c.discriminator_model.lower())) - if c.discriminator_model in 'random_window_discriminator': + if c.discriminator_model in "random_window_discriminator": model = MyModel( - cond_channels=c.audio['num_mels'], - hop_length=c.audio['hop_length'], - uncond_disc_donwsample_factors=c. - discriminator_model_params['uncond_disc_donwsample_factors'], - cond_disc_downsample_factors=c. - discriminator_model_params['cond_disc_downsample_factors'], - cond_disc_out_channels=c. 
- discriminator_model_params['cond_disc_out_channels'], - window_sizes=c.discriminator_model_params['window_sizes']) - if c.discriminator_model in 'melgan_multiscale_discriminator': + cond_channels=c.audio["num_mels"], + hop_length=c.audio["hop_length"], + uncond_disc_donwsample_factors=c.discriminator_model_params[ + "uncond_disc_donwsample_factors" + ], + cond_disc_downsample_factors=c.discriminator_model_params[ + "cond_disc_downsample_factors" + ], + cond_disc_out_channels=c.discriminator_model_params[ + "cond_disc_out_channels" + ], + window_sizes=c.discriminator_model_params["window_sizes"], + ) + if c.discriminator_model in "melgan_multiscale_discriminator": model = MyModel( in_channels=1, out_channels=1, kernel_sizes=(5, 3), - base_channels=c.discriminator_model_params['base_channels'], - max_channels=c.discriminator_model_params['max_channels'], - downsample_factors=c. - discriminator_model_params['downsample_factors']) - if c.discriminator_model == 'residual_parallel_wavegan_discriminator': + base_channels=c.discriminator_model_params["base_channels"], + max_channels=c.discriminator_model_params["max_channels"], + downsample_factors=c.discriminator_model_params["downsample_factors"], + ) + if c.discriminator_model == "residual_parallel_wavegan_discriminator": model = MyModel( in_channels=1, out_channels=1, kernel_size=3, - num_layers=c.discriminator_model_params['num_layers'], - stacks=c.discriminator_model_params['stacks'], + num_layers=c.discriminator_model_params["num_layers"], + stacks=c.discriminator_model_params["stacks"], res_channels=64, gate_channels=128, skip_channels=64, @@ -158,17 +170,17 @@ def setup_discriminator(c): nonlinear_activation="LeakyReLU", nonlinear_activation_params={"negative_slope": 0.2}, ) - if c.discriminator_model == 'parallel_wavegan_discriminator': + if c.discriminator_model == "parallel_wavegan_discriminator": model = MyModel( in_channels=1, out_channels=1, kernel_size=3, - num_layers=c.discriminator_model_params['num_layers'], + num_layers=c.discriminator_model_params["num_layers"], conv_channels=64, dilation_factor=1, nonlinear_activation="LeakyReLU", nonlinear_activation_params={"negative_slope": 0.2}, - bias=True + bias=True, ) return model From 995d84f6d74fa3592fcb7cd5b31f9246155191a8 Mon Sep 17 00:00:00 2001 From: sanjaesc Date: Mon, 19 Oct 2020 14:37:30 +0200 Subject: [PATCH 08/16] added feature preprocessing if not set in config --- TTS/bin/train_wavernn_vocoder.py | 64 ++++++++++++++++--------- TTS/vocoder/configs/wavernn_config.json | 11 ++--- TTS/vocoder/datasets/preprocess.py | 25 +++++++++- TTS/vocoder/datasets/wavernn_dataset.py | 1 + 4 files changed, 71 insertions(+), 30 deletions(-) diff --git a/TTS/bin/train_wavernn_vocoder.py b/TTS/bin/train_wavernn_vocoder.py index e2b8057e..533fe0ce 100644 --- a/TTS/bin/train_wavernn_vocoder.py +++ b/TTS/bin/train_wavernn_vocoder.py @@ -29,7 +29,12 @@ from TTS.utils.generic_utils import ( set_init_dict, ) from TTS.vocoder.datasets.wavernn_dataset import WaveRNNDataset -from TTS.vocoder.datasets.preprocess import load_wav_data, load_wav_feat_data +from TTS.vocoder.datasets.preprocess import ( + load_wav_data, + find_feat_files, + load_wav_feat_data, + preprocess_wav_files, +) from TTS.vocoder.utils.distribution import discretized_mix_logistic_loss, gaussian_loss from TTS.vocoder.utils.generic_utils import setup_wavernn from TTS.vocoder.utils.io import save_best_model, save_checkpoint @@ -192,15 +197,17 @@ def train(model, optimizer, criterion, scheduler, ap, global_step, epoch): ) predict_mel = 
ap.melspectrogram(sample_wav) - # Sample audio - tb_logger.tb_train_audios( - global_step, {"eval/audio": sample_wav}, CONFIG.audio["sample_rate"] - ) # compute spectrograms figures = { - "prediction": plot_spectrogram(predict_mel.T), - "ground_truth": plot_spectrogram(ground_mel.T), + "train/ground_truth": plot_spectrogram(ground_mel.T), + "train/prediction": plot_spectrogram(predict_mel.T), } + + # Sample audio + tb_logger.tb_train_audios( + global_step, {"train/audio": sample_wav}, CONFIG.audio["sample_rate"] + ) + tb_logger.tb_train_figures(global_step, figures) end_time = time.time() @@ -235,7 +242,6 @@ def evaluate(model, criterion, ap, global_step, epoch): global_step += 1 y_hat = model(x, m) - y_hat_viz = y_hat # for vizualization if isinstance(model.mode, int): y_hat = y_hat.transpose(1, 2).unsqueeze(-1) else: @@ -263,11 +269,11 @@ def evaluate(model, criterion, ap, global_step, epoch): if CONFIG.print_eval: c_logger.print_eval_step(num_iter, loss_dict, keep_avg.avg_values) - if epoch > CONFIG.test_delay_epochs: - # synthesize a full voice - wav_path = train_data[random.randrange(0, len(train_data))][0] + if epoch % CONFIG.test_every_epochs == 0: + # synthesize a part of data + wav_path = eval_data[random.randrange(0, len(eval_data))][0] wav = ap.load_wav(wav_path) - ground_mel = ap.melspectrogram(wav) + ground_mel = ap.melspectrogram(wav[:22000]) sample_wav = model.generate( ground_mel, CONFIG.batched, @@ -276,15 +282,17 @@ def evaluate(model, criterion, ap, global_step, epoch): ) predict_mel = ap.melspectrogram(sample_wav) + # compute spectrograms + figures = { + "eval/ground_truth": plot_spectrogram(ground_mel.T), + "eval/prediction": plot_spectrogram(predict_mel.T), + } + # Sample audio tb_logger.tb_eval_audios( global_step, {"eval/audio": sample_wav}, CONFIG.audio["sample_rate"] ) - # compute spectrograms - figures = { - "eval/prediction": plot_spectrogram(predict_mel.T), - "eval/ground_truth": plot_spectrogram(ground_mel.T), - } + tb_logger.tb_eval_figures(global_step, figures) tb_logger.tb_eval_stats(global_step, keep_avg.avg_values) @@ -296,6 +304,9 @@ def main(args): # pylint: disable=redefined-outer-name # pylint: disable=global-variable-undefined global train_data, eval_data + # setup audio processor + ap = AudioProcessor(**CONFIG.audio) + print(f" > Loading wavs from: {CONFIG.data_path}") if CONFIG.feature_path is not None: print(f" > Loading features from: {CONFIG.feature_path}") @@ -303,11 +314,20 @@ def main(args): # pylint: disable=redefined-outer-name CONFIG.data_path, CONFIG.feature_path, CONFIG.eval_split_size ) else: - eval_data, train_data = load_wav_data(CONFIG.data_path, CONFIG.eval_split_size) - - # setup audio processor - ap = AudioProcessor(**CONFIG.audio) - + mel_feat_path = os.path.join(OUT_PATH, "mel") + feat_data = find_feat_files(mel_feat_path) + if feat_data: + print(f" > Loading features from: {mel_feat_path}") + eval_data, train_data = load_wav_feat_data( + CONFIG.data_path, mel_feat_path, CONFIG.eval_split_size + ) + else: + print(f" > No feature data found. 
Preprocessing...") + # preprocessing feature data from given wav files + preprocess_wav_files(OUT_PATH, CONFIG, ap) + eval_data, train_data = load_wav_feat_data( + CONFIG.data_path, mel_feat_path, CONFIG.eval_split_size + ) # setup model model_wavernn = setup_wavernn(CONFIG) diff --git a/TTS/vocoder/configs/wavernn_config.json b/TTS/vocoder/configs/wavernn_config.json index 67503aef..8e6a8c32 100644 --- a/TTS/vocoder/configs/wavernn_config.json +++ b/TTS/vocoder/configs/wavernn_config.json @@ -55,18 +55,17 @@ "padding": 2, // pad the input for resnet to see wider input length // DATASET - "use_gta": true, // use computed gta features from the tts model - "data_path": "/media/alexander/LinuxFS/SpeechData/GothicSpeech/NPC_Speech/", // path containing training wav files - "feature_path": "/media/alexander/LinuxFS/SpeechData/GothicSpeech/NPC_Speech_Computed/mel/", // path containing computed features .npy (mels / quant) + //"use_gta": true, // use computed gta features from the tts model + "data_path": "path/to/wav/files", // path containing training wav files + "feature_path": null, // path containing computed features from wav files if null compute them // TRAINING "batch_size": 64, // Batch size for training. Lower values than 32 might cause hard to learn attention. "epochs": 10000, // total number of epochs to train. - "warmup_steps": 10, // VALIDATION "run_eval": true, - "test_delay_epochs": 10, // early testing only wastes computation time. + "test_every_epochs": 10, // Test after set number of epochs (Test every 20 epochs for example) // OPTIMIZER "grad_clip": 4, // apply gradient clipping if > 0 @@ -90,6 +89,6 @@ "eval_split_size": 50, // number of samples for testing // PATHS - "output_path": "/media/alexander/LinuxFS/Projects/wavernn/Trainings/" + "output_path": "output/training/path" } diff --git a/TTS/vocoder/datasets/preprocess.py b/TTS/vocoder/datasets/preprocess.py index a5365686..afea45fd 100644 --- a/TTS/vocoder/datasets/preprocess.py +++ b/TTS/vocoder/datasets/preprocess.py @@ -1,17 +1,38 @@ import glob import os from pathlib import Path +from tqdm import tqdm import numpy as np +def preprocess_wav_files(out_path, config, ap): + os.makedirs(os.path.join(out_path, "quant"), exist_ok=True) + os.makedirs(os.path.join(out_path, "mel"), exist_ok=True) + wav_files = find_wav_files(config.data_path) + for path in tqdm(wav_files): + wav_name = Path(path).stem + quant_path = os.path.join(out_path, "quant", wav_name + ".npy") + mel_path = os.path.join(out_path, "mel", wav_name + ".npy") + y = ap.load_wav(path) + mel = ap.melspectrogram(y) + np.save(mel_path, mel) + if isinstance(config.mode, int): + quant = ( + ap.mulaw_encode(y, qc=config.mode) + if config.mulaw + else ap.quantize(y, bits=config.mode) + ) + np.save(quant_path, quant) + + def find_wav_files(data_path): - wav_paths = glob.glob(os.path.join(data_path, '**', '*.wav'), recursive=True) + wav_paths = glob.glob(os.path.join(data_path, "**", "*.wav"), recursive=True) return wav_paths def find_feat_files(data_path): - feat_paths = glob.glob(os.path.join(data_path, '**', '*.npy'), recursive=True) + feat_paths = glob.glob(os.path.join(data_path, "**", "*.npy"), recursive=True) return feat_paths diff --git a/TTS/vocoder/datasets/wavernn_dataset.py b/TTS/vocoder/datasets/wavernn_dataset.py index 8faf5f3c..1b0a8077 100644 --- a/TTS/vocoder/datasets/wavernn_dataset.py +++ b/TTS/vocoder/datasets/wavernn_dataset.py @@ -48,6 +48,7 @@ class WaveRNNDataset(Dataset): feat_path = self.item_list[index] m = 
np.load(feat_path.replace("/quant/", "/mel/")) if self.mode in ["gauss", "mold"]: + # x = np.load(feat_path.replace("/mel/", "/quant/")) x = self.ap.load_wav(wavpath) elif isinstance(self.mode, int): x = np.load(feat_path.replace("/mel/", "/quant/")) From 64adfbf4a59b9bf0aa21fe3effceed332458bf7b Mon Sep 17 00:00:00 2001 From: sanjaesc Date: Mon, 19 Oct 2020 15:38:32 +0200 Subject: [PATCH 09/16] fixing pylint errors --- TTS/bin/train_wavernn_vocoder.py | 9 ++-- TTS/vocoder/datasets/wavernn_dataset.py | 4 +- TTS/vocoder/models/wavernn.py | 10 ++-- TTS/vocoder/utils/distribution.py | 72 ++++++++++++++----------- 4 files changed, 51 insertions(+), 44 deletions(-) diff --git a/TTS/bin/train_wavernn_vocoder.py b/TTS/bin/train_wavernn_vocoder.py index 533fe0ce..78984510 100644 --- a/TTS/bin/train_wavernn_vocoder.py +++ b/TTS/bin/train_wavernn_vocoder.py @@ -1,8 +1,5 @@ import argparse -import math import os -import pickle -import shutil import sys import traceback import time @@ -11,7 +8,8 @@ import random import torch from torch.utils.data import DataLoader -from torch.utils.data.distributed import DistributedSampler + +# from torch.utils.data.distributed import DistributedSampler from TTS.tts.utils.visual import plot_spectrogram from TTS.utils.audio import AudioProcessor @@ -30,7 +28,6 @@ from TTS.utils.generic_utils import ( ) from TTS.vocoder.datasets.wavernn_dataset import WaveRNNDataset from TTS.vocoder.datasets.preprocess import ( - load_wav_data, find_feat_files, load_wav_feat_data, preprocess_wav_files, @@ -322,7 +319,7 @@ def main(args): # pylint: disable=redefined-outer-name CONFIG.data_path, mel_feat_path, CONFIG.eval_split_size ) else: - print(f" > No feature data found. Preprocessing...") + print(" > No feature data found. Preprocessing...") # preprocessing feature data from given wav files preprocess_wav_files(OUT_PATH, CONFIG, ap) eval_data, train_data = load_wav_feat_data( diff --git a/TTS/vocoder/datasets/wavernn_dataset.py b/TTS/vocoder/datasets/wavernn_dataset.py index 1b0a8077..5d5b9f15 100644 --- a/TTS/vocoder/datasets/wavernn_dataset.py +++ b/TTS/vocoder/datasets/wavernn_dataset.py @@ -1,5 +1,3 @@ -import os -import glob import torch import numpy as np from torch.utils.data import Dataset @@ -42,7 +40,7 @@ class WaveRNNDataset(Dataset): wavpath, feat_path = self.item_list[index] m = np.load(feat_path.replace("/quant/", "/mel/")) # x = self.wav_cache[index] - if 5 > m.shape[-1]: + if m.shape[-1] < 5: print(" [!] Instance is too short! 
: {}".format(wavpath)) self.item_list[index] = self.item_list[index + 1] feat_path = self.item_list[index] diff --git a/TTS/vocoder/models/wavernn.py b/TTS/vocoder/models/wavernn.py index e1c4365f..9b637a6a 100644 --- a/TTS/vocoder/models/wavernn.py +++ b/TTS/vocoder/models/wavernn.py @@ -42,7 +42,7 @@ class MelResNet(nn.Module): self.conv_in = nn.Conv1d(in_dims, compute_dims, kernel_size=k_size, bias=False) self.batch_norm = nn.BatchNorm1d(compute_dims) self.layers = nn.ModuleList() - for i in range(res_blocks): + for _ in range(res_blocks): self.layers.append(ResBlock(compute_dims)) self.conv_out = nn.Conv1d(compute_dims, res_out_dims, kernel_size=1) @@ -365,7 +365,8 @@ class WaveRNN(nn.Module): (i * b_size, seq_len * b_size, b_size, gen_rate, realtime_ratio), ) - def get_gru_cell(self, gru): + @staticmethod + def get_gru_cell(gru): gru_cell = nn.GRUCell(gru.input_size, gru.hidden_size) gru_cell.weight_hh.data = gru.weight_hh_l0.data gru_cell.weight_ih.data = gru.weight_ih_l0.data @@ -373,13 +374,14 @@ class WaveRNN(nn.Module): gru_cell.bias_ih.data = gru.bias_ih_l0.data return gru_cell - def pad_tensor(self, x, pad, side="both"): + @staticmethod + def pad_tensor(x, pad, side="both"): # NB - this is just a quick method i need right now # i.e., it won't generalise to other shapes/dims b, t, c = x.size() total = t + 2 * pad if side == "both" else t + pad padded = torch.zeros(b, total, c).cuda() - if side == "before" or side == "both": + if side in ("before", "both"): padded[:, pad : pad + t, :] = x elif side == "after": padded[:, :t, :] = x diff --git a/TTS/vocoder/utils/distribution.py b/TTS/vocoder/utils/distribution.py index bfcbdd3f..705c14dc 100644 --- a/TTS/vocoder/utils/distribution.py +++ b/TTS/vocoder/utils/distribution.py @@ -11,7 +11,11 @@ def gaussian_loss(y_hat, y, log_std_min=-7.0): mean = y_hat[:, :, :1] log_std = torch.clamp(y_hat[:, :, 1:], min=log_std_min) # TODO: replace with pytorch dist - log_probs = -0.5 * (- math.log(2.0 * math.pi) - 2. * log_std - torch.pow(y - mean, 2) * torch.exp((-2.0 * log_std))) + log_probs = -0.5 * ( + -math.log(2.0 * math.pi) + - 2.0 * log_std + - torch.pow(y - mean, 2) * torch.exp((-2.0 * log_std)) + ) return log_probs.squeeze().mean() @@ -19,7 +23,10 @@ def sample_from_gaussian(y_hat, log_std_min=-7.0, scale_factor=1.0): assert y_hat.size(2) == 2 mean = y_hat[:, :, :1] log_std = torch.clamp(y_hat[:, :, 1:], min=log_std_min) - dist = Normal(mean, torch.exp(log_std), ) + dist = Normal( + mean, + torch.exp(log_std), + ) sample = dist.sample() sample = torch.clamp(torch.clamp(sample, min=-scale_factor), max=scale_factor) del dist @@ -36,11 +43,12 @@ def log_sum_exp(x): # It is adapted from https://github.com/r9y9/wavenet_vocoder/blob/master/wavenet_vocoder/mixture.py -def discretized_mix_logistic_loss(y_hat, y, num_classes=65536, - log_scale_min=None, reduce=True): +def discretized_mix_logistic_loss( + y_hat, y, num_classes=65536, log_scale_min=None, reduce=True +): if log_scale_min is None: log_scale_min = float(np.log(1e-14)) - y_hat = y_hat.permute(0,2,1) + y_hat = y_hat.permute(0, 2, 1) assert y_hat.dim() == 3 assert y_hat.size(1) % 3 == 0 nr_mix = y_hat.size(1) // 3 @@ -50,17 +58,17 @@ def discretized_mix_logistic_loss(y_hat, y, num_classes=65536, # unpack parameters. 
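The log_sum_exp helper in distribution.py uses the usual max-shift trick to stay finite where a naive log(sum(exp(x))) would overflow; a quick illustration, assuming the module path introduced by this patch:

    import torch
    from TTS.vocoder.utils.distribution import log_sum_exp

    x = torch.full((1, 3), 1000.0)
    naive = torch.log(torch.exp(x).sum(dim=-1))   # exp(1000.) overflows to inf
    stable = log_sum_exp(x)                       # 1000 + log(3), about 1001.0986
    print(naive, stable)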
(B, T, num_mixtures) x 3 logit_probs = y_hat[:, :, :nr_mix] - means = y_hat[:, :, nr_mix:2 * nr_mix] - log_scales = torch.clamp(y_hat[:, :, 2 * nr_mix:3 * nr_mix], min=log_scale_min) + means = y_hat[:, :, nr_mix : 2 * nr_mix] + log_scales = torch.clamp(y_hat[:, :, 2 * nr_mix : 3 * nr_mix], min=log_scale_min) # B x T x 1 -> B x T x num_mixtures y = y.expand_as(means) centered_y = y - means inv_stdv = torch.exp(-log_scales) - plus_in = inv_stdv * (centered_y + 1. / (num_classes - 1)) + plus_in = inv_stdv * (centered_y + 1.0 / (num_classes - 1)) cdf_plus = torch.sigmoid(plus_in) - min_in = inv_stdv * (centered_y - 1. / (num_classes - 1)) + min_in = inv_stdv * (centered_y - 1.0 / (num_classes - 1)) cdf_min = torch.sigmoid(min_in) # log probability for edge case of 0 (before scaling) @@ -77,34 +85,35 @@ def discretized_mix_logistic_loss(y_hat, y, num_classes=65536, mid_in = inv_stdv * centered_y # log probability in the center of the bin, to be used in extreme cases # (not actually used in our code) - log_pdf_mid = mid_in - log_scales - 2. * F.softplus(mid_in) + log_pdf_mid = mid_in - log_scales - 2.0 * F.softplus(mid_in) # tf equivalent - """ - log_probs = tf.where(x < -0.999, log_cdf_plus, - tf.where(x > 0.999, log_one_minus_cdf_min, - tf.where(cdf_delta > 1e-5, - tf.log(tf.maximum(cdf_delta, 1e-12)), - log_pdf_mid - np.log(127.5)))) - """ + + # log_probs = tf.where(x < -0.999, log_cdf_plus, + # tf.where(x > 0.999, log_one_minus_cdf_min, + # tf.where(cdf_delta > 1e-5, + # tf.log(tf.maximum(cdf_delta, 1e-12)), + # log_pdf_mid - np.log(127.5)))) + # TODO: cdf_delta <= 1e-5 actually can happen. How can we choose the value # for num_classes=65536 case? 1e-7? not sure.. inner_inner_cond = (cdf_delta > 1e-5).float() - inner_inner_out = inner_inner_cond * \ - torch.log(torch.clamp(cdf_delta, min=1e-12)) + \ - (1. - inner_inner_cond) * (log_pdf_mid - np.log((num_classes - 1) / 2)) + inner_inner_out = inner_inner_cond * torch.log( + torch.clamp(cdf_delta, min=1e-12) + ) + (1.0 - inner_inner_cond) * (log_pdf_mid - np.log((num_classes - 1) / 2)) inner_cond = (y > 0.999).float() - inner_out = inner_cond * log_one_minus_cdf_min + (1. - inner_cond) * inner_inner_out + inner_out = ( + inner_cond * log_one_minus_cdf_min + (1.0 - inner_cond) * inner_inner_out + ) cond = (y < -0.999).float() - log_probs = cond * log_cdf_plus + (1. 
- cond) * inner_out + log_probs = cond * log_cdf_plus + (1.0 - cond) * inner_out log_probs = log_probs + F.log_softmax(logit_probs, -1) if reduce: return -torch.mean(log_sum_exp(log_probs)) - else: - return -log_sum_exp(log_probs).unsqueeze(-1) + return -log_sum_exp(log_probs).unsqueeze(-1) def sample_from_discretized_mix_logistic(y, log_scale_min=None): @@ -127,26 +136,27 @@ def sample_from_discretized_mix_logistic(y, log_scale_min=None): # sample mixture indicator from softmax temp = logit_probs.data.new(logit_probs.size()).uniform_(1e-5, 1.0 - 1e-5) - temp = logit_probs.data - torch.log(- torch.log(temp)) + temp = logit_probs.data - torch.log(-torch.log(temp)) _, argmax = temp.max(dim=-1) # (B, T) -> (B, T, nr_mix) one_hot = to_one_hot(argmax, nr_mix) # select logistic parameters - means = torch.sum(y[:, :, nr_mix:2 * nr_mix] * one_hot, dim=-1) - log_scales = torch.clamp(torch.sum( - y[:, :, 2 * nr_mix:3 * nr_mix] * one_hot, dim=-1), min=log_scale_min) + means = torch.sum(y[:, :, nr_mix : 2 * nr_mix] * one_hot, dim=-1) + log_scales = torch.clamp( + torch.sum(y[:, :, 2 * nr_mix : 3 * nr_mix] * one_hot, dim=-1), min=log_scale_min + ) # sample from logistic & clip to interval # we don't actually round to the nearest 8bit value when sampling u = means.data.new(means.size()).uniform_(1e-5, 1.0 - 1e-5) - x = means + torch.exp(log_scales) * (torch.log(u) - torch.log(1. - u)) + x = means + torch.exp(log_scales) * (torch.log(u) - torch.log(1.0 - u)) - x = torch.clamp(torch.clamp(x, min=-1.), max=1.) + x = torch.clamp(torch.clamp(x, min=-1.0), max=1.0) return x -def to_one_hot(tensor, n, fill_with=1.): +def to_one_hot(tensor, n, fill_with=1.0): # we perform one hot encore with respect to the last axis one_hot = torch.FloatTensor(tensor.size() + (n,)).zero_() if tensor.is_cuda: From 24d18d20e34c331bc0d3a067de66389c98a9b03c Mon Sep 17 00:00:00 2001 From: sanjaesc Date: Mon, 19 Oct 2020 16:20:15 +0200 Subject: [PATCH 10/16] fix formatting + pylint --- TTS/bin/compute_statistics.py | 52 ++--- TTS/bin/train_gan_vocoder.py | 344 +++++++++++++---------------- TTS/vocoder/models/wavernn.py | 47 ++-- TTS/vocoder/utils/distribution.py | 15 +- TTS/vocoder/utils/generic_utils.py | 101 ++++----- 5 files changed, 252 insertions(+), 307 deletions(-) diff --git a/TTS/bin/compute_statistics.py b/TTS/bin/compute_statistics.py index 9177c75b..ca089d3e 100755 --- a/TTS/bin/compute_statistics.py +++ b/TTS/bin/compute_statistics.py @@ -15,23 +15,17 @@ from TTS.utils.audio import AudioProcessor def main(): """Run preprocessing process.""" parser = argparse.ArgumentParser( - description="Compute mean and variance of spectrogtram features." - ) - parser.add_argument( - "--config_path", - type=str, - required=True, - help="TTS config file path to define audio processin parameters.", - ) - parser.add_argument( - "--out_path", default=None, type=str, help="directory to save the output file." 
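The compute_statistics.py script whose argument parser is reformatted here builds per-bin mel and linear spectrogram statistics (the mel_mean / mel_std entries written to scale_stats.npy). A rough, illustrative sketch of that kind of mean-variance scaling; the real script streams over a dataset and stores more fields, so treat the shapes here as assumptions:

    import numpy as np

    # toy stand-ins for (num_mels, T) spectrograms from a dataset
    mels = [np.random.rand(80, length) for length in (120, 90, 150)]

    frames = np.concatenate(mels, axis=1)     # (num_mels, total_frames)
    mel_mean = frames.mean(axis=1)
    mel_std = frames.std(axis=1)

    # mean-variance normalize one spectrogram with the dataset statistics
    mel_norm = (mels[0] - mel_mean[:, None]) / mel_std[:, None]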
- ) + description="Compute mean and variance of spectrogtram features.") + parser.add_argument("--config_path", type=str, required=True, + help="TTS config file path to define audio processin parameters.") + parser.add_argument("--out_path", default=None, type=str, + help="directory to save the output file.") args = parser.parse_args() # load config CONFIG = load_config(args.config_path) - CONFIG.audio["signal_norm"] = False # do not apply earlier normalization - CONFIG.audio["stats_path"] = None # discard pre-defined stats + CONFIG.audio['signal_norm'] = False # do not apply earlier normalization + CONFIG.audio['stats_path'] = None # discard pre-defined stats # load audio processor ap = AudioProcessor(**CONFIG.audio) @@ -65,27 +59,27 @@ def main(): output_file_path = os.path.join(args.out_path, "scale_stats.npy") stats = {} - stats["mel_mean"] = mel_mean - stats["mel_std"] = mel_scale - stats["linear_mean"] = linear_mean - stats["linear_std"] = linear_scale + stats['mel_mean'] = mel_mean + stats['mel_std'] = mel_scale + stats['linear_mean'] = linear_mean + stats['linear_std'] = linear_scale - print(f" > Avg mel spec mean: {mel_mean.mean()}") - print(f" > Avg mel spec scale: {mel_scale.mean()}") - print(f" > Avg linear spec mean: {linear_mean.mean()}") - print(f" > Avg lienar spec scale: {linear_scale.mean()}") + print(f' > Avg mel spec mean: {mel_mean.mean()}') + print(f' > Avg mel spec scale: {mel_scale.mean()}') + print(f' > Avg linear spec mean: {linear_mean.mean()}') + print(f' > Avg lienar spec scale: {linear_scale.mean()}') # set default config values for mean-var scaling - CONFIG.audio["stats_path"] = output_file_path - CONFIG.audio["signal_norm"] = True + CONFIG.audio['stats_path'] = output_file_path + CONFIG.audio['signal_norm'] = True # remove redundant values - del CONFIG.audio["max_norm"] - del CONFIG.audio["min_level_db"] - del CONFIG.audio["symmetric_norm"] - del CONFIG.audio["clip_norm"] - stats["audio_config"] = CONFIG.audio + del CONFIG.audio['max_norm'] + del CONFIG.audio['min_level_db'] + del CONFIG.audio['symmetric_norm'] + del CONFIG.audio['clip_norm'] + stats['audio_config'] = CONFIG.audio np.save(output_file_path, stats, allow_pickle=True) - print(f" > scale_stats.npy is saved to {output_file_path}") + print(f' > scale_stats.npy is saved to {output_file_path}') if __name__ == "__main__": diff --git a/TTS/bin/train_gan_vocoder.py b/TTS/bin/train_gan_vocoder.py index 7689c930..12edf048 100644 --- a/TTS/bin/train_gan_vocoder.py +++ b/TTS/bin/train_gan_vocoder.py @@ -10,29 +10,20 @@ import torch from torch.utils.data import DataLoader from TTS.utils.audio import AudioProcessor from TTS.utils.console_logger import ConsoleLogger -from TTS.utils.generic_utils import ( - KeepAverage, - count_parameters, - create_experiment_folder, - get_git_branch, - remove_experiment_folder, - set_init_dict, -) +from TTS.utils.generic_utils import (KeepAverage, count_parameters, + create_experiment_folder, get_git_branch, + remove_experiment_folder, set_init_dict) from TTS.utils.io import copy_config_file, load_config from TTS.utils.radam import RAdam from TTS.utils.tensorboard_logger import TensorboardLogger from TTS.utils.training import setup_torch_training_env from TTS.vocoder.datasets.gan_dataset import GANDataset from TTS.vocoder.datasets.preprocess import load_wav_data, load_wav_feat_data - # from distribute import (DistributedSampler, apply_gradient_allreduce, # init_distributed, reduce_tensor) from TTS.vocoder.layers.losses import DiscriminatorLoss, GeneratorLoss -from 
TTS.vocoder.utils.generic_utils import ( - plot_results, - setup_discriminator, - setup_generator, -) +from TTS.vocoder.utils.generic_utils import (plot_results, setup_discriminator, + setup_generator) from TTS.vocoder.utils.io import save_best_model, save_checkpoint use_cuda, num_gpus = setup_torch_training_env(True, True) @@ -42,30 +33,27 @@ def setup_loader(ap, is_val=False, verbose=False): if is_val and not c.run_eval: loader = None else: - dataset = GANDataset( - ap=ap, - items=eval_data if is_val else train_data, - seq_len=c.seq_len, - hop_len=ap.hop_length, - pad_short=c.pad_short, - conv_pad=c.conv_pad, - is_training=not is_val, - return_segments=not is_val, - use_noise_augment=c.use_noise_augment, - use_cache=c.use_cache, - verbose=verbose, - ) + dataset = GANDataset(ap=ap, + items=eval_data if is_val else train_data, + seq_len=c.seq_len, + hop_len=ap.hop_length, + pad_short=c.pad_short, + conv_pad=c.conv_pad, + is_training=not is_val, + return_segments=not is_val, + use_noise_augment=c.use_noise_augment, + use_cache=c.use_cache, + verbose=verbose) dataset.shuffle_mapping() # sampler = DistributedSampler(dataset) if num_gpus > 1 else None - loader = DataLoader( - dataset, - batch_size=1 if is_val else c.batch_size, - shuffle=True, - drop_last=False, - sampler=None, - num_workers=c.num_val_loader_workers if is_val else c.num_loader_workers, - pin_memory=False, - ) + loader = DataLoader(dataset, + batch_size=1 if is_val else c.batch_size, + shuffle=True, + drop_last=False, + sampler=None, + num_workers=c.num_val_loader_workers + if is_val else c.num_loader_workers, + pin_memory=False) return loader @@ -92,26 +80,16 @@ def format_data(data): return co, x, None, None -def train( - model_G, - criterion_G, - optimizer_G, - model_D, - criterion_D, - optimizer_D, - scheduler_G, - scheduler_D, - ap, - global_step, - epoch, -): +def train(model_G, criterion_G, optimizer_G, model_D, criterion_D, optimizer_D, + scheduler_G, scheduler_D, ap, global_step, epoch): data_loader = setup_loader(ap, is_val=False, verbose=(epoch == 0)) model_G.train() model_D.train() epoch_time = 0 keep_avg = KeepAverage() if use_cuda: - batch_n_iter = int(len(data_loader.dataset) / (c.batch_size * num_gpus)) + batch_n_iter = int( + len(data_loader.dataset) / (c.batch_size * num_gpus)) else: batch_n_iter = int(len(data_loader.dataset) / c.batch_size) end_time = time.time() @@ -167,16 +145,16 @@ def train( scores_fake = D_out_fake # compute losses - loss_G_dict = criterion_G( - y_hat, y_G, scores_fake, feats_fake, feats_real, y_hat_sub, y_G_sub - ) - loss_G = loss_G_dict["G_loss"] + loss_G_dict = criterion_G(y_hat, y_G, scores_fake, feats_fake, + feats_real, y_hat_sub, y_G_sub) + loss_G = loss_G_dict['G_loss'] # optimizer generator optimizer_G.zero_grad() loss_G.backward() if c.gen_clip_grad > 0: - torch.nn.utils.clip_grad_norm_(model_G.parameters(), c.gen_clip_grad) + torch.nn.utils.clip_grad_norm_(model_G.parameters(), + c.gen_clip_grad) optimizer_G.step() if scheduler_G is not None: scheduler_G.step() @@ -221,13 +199,14 @@ def train( # compute losses loss_D_dict = criterion_D(scores_fake, scores_real) - loss_D = loss_D_dict["D_loss"] + loss_D = loss_D_dict['D_loss'] # optimizer discriminator optimizer_D.zero_grad() loss_D.backward() if c.disc_clip_grad > 0: - torch.nn.utils.clip_grad_norm_(model_D.parameters(), c.disc_clip_grad) + torch.nn.utils.clip_grad_norm_(model_D.parameters(), + c.disc_clip_grad) optimizer_D.step() if scheduler_D is not None: scheduler_D.step() @@ -242,40 +221,34 @@ def train( epoch_time += 
step_time # get current learning rates - current_lr_G = list(optimizer_G.param_groups)[0]["lr"] - current_lr_D = list(optimizer_D.param_groups)[0]["lr"] + current_lr_G = list(optimizer_G.param_groups)[0]['lr'] + current_lr_D = list(optimizer_D.param_groups)[0]['lr'] # update avg stats update_train_values = dict() for key, value in loss_dict.items(): - update_train_values["avg_" + key] = value - update_train_values["avg_loader_time"] = loader_time - update_train_values["avg_step_time"] = step_time + update_train_values['avg_' + key] = value + update_train_values['avg_loader_time'] = loader_time + update_train_values['avg_step_time'] = step_time keep_avg.update_values(update_train_values) # print training stats if global_step % c.print_step == 0: log_dict = { - "step_time": [step_time, 2], - "loader_time": [loader_time, 4], + 'step_time': [step_time, 2], + 'loader_time': [loader_time, 4], "current_lr_G": current_lr_G, - "current_lr_D": current_lr_D, + "current_lr_D": current_lr_D } - c_logger.print_train_step( - batch_n_iter, - num_iter, - global_step, - log_dict, - loss_dict, - keep_avg.avg_values, - ) + c_logger.print_train_step(batch_n_iter, num_iter, global_step, + log_dict, loss_dict, keep_avg.avg_values) # plot step stats if global_step % 10 == 0: iter_stats = { "lr_G": current_lr_G, "lr_D": current_lr_D, - "step_time": step_time, + "step_time": step_time } iter_stats.update(loss_dict) tb_logger.tb_train_iter_stats(global_step, iter_stats) @@ -284,28 +257,27 @@ def train( if global_step % c.save_step == 0: if c.checkpoint: # save model - save_checkpoint( - model_G, - optimizer_G, - scheduler_G, - model_D, - optimizer_D, - scheduler_D, - global_step, - epoch, - OUT_PATH, - model_losses=loss_dict, - ) + save_checkpoint(model_G, + optimizer_G, + scheduler_G, + model_D, + optimizer_D, + scheduler_D, + global_step, + epoch, + OUT_PATH, + model_losses=loss_dict) # compute spectrograms - figures = plot_results(y_hat_vis, y_G, ap, global_step, "train") + figures = plot_results(y_hat_vis, y_G, ap, global_step, + 'train') tb_logger.tb_train_figures(global_step, figures) # Sample audio sample_voice = y_hat_vis[0].squeeze(0).detach().cpu().numpy() - tb_logger.tb_train_audios( - global_step, {"train/audio": sample_voice}, c.audio["sample_rate"] - ) + tb_logger.tb_train_audios(global_step, + {'train/audio': sample_voice}, + c.audio["sample_rate"]) end_time = time.time() # print epoch stats @@ -379,9 +351,8 @@ def evaluate(model_G, criterion_G, model_D, criterion_D, ap, global_step, epoch) feats_fake, feats_real = None, None # compute losses - loss_G_dict = criterion_G( - y_hat, y_G, scores_fake, feats_fake, feats_real, y_hat_sub, y_G_sub - ) + loss_G_dict = criterion_G(y_hat, y_G, scores_fake, feats_fake, + feats_real, y_hat_sub, y_G_sub) loss_dict = dict() for key, value in loss_G_dict.items(): @@ -437,9 +408,9 @@ def evaluate(model_G, criterion_G, model_D, criterion_D, ap, global_step, epoch) # update avg stats update_eval_values = dict() for key, value in loss_dict.items(): - update_eval_values["avg_" + key] = value - update_eval_values["avg_loader_time"] = loader_time - update_eval_values["avg_step_time"] = step_time + update_eval_values['avg_' + key] = value + update_eval_values['avg_loader_time'] = loader_time + update_eval_values['avg_step_time'] = step_time keep_avg.update_values(update_eval_values) # print eval stats @@ -447,14 +418,13 @@ def evaluate(model_G, criterion_G, model_D, criterion_D, ap, global_step, epoch) c_logger.print_eval_step(num_iter, loss_dict, keep_avg.avg_values) # 
compute spectrograms - figures = plot_results(y_hat, y_G, ap, global_step, "eval") + figures = plot_results(y_hat, y_G, ap, global_step, 'eval') tb_logger.tb_eval_figures(global_step, figures) # Sample audio sample_voice = y_hat[0].squeeze(0).detach().cpu().numpy() - tb_logger.tb_eval_audios( - global_step, {"eval/audio": sample_voice}, c.audio["sample_rate"] - ) + tb_logger.tb_eval_audios(global_step, {'eval/audio': sample_voice}, + c.audio["sample_rate"]) # synthesize a full voice data_loader.return_segments = False @@ -472,8 +442,7 @@ def main(args): # pylint: disable=redefined-outer-name if c.feature_path is not None: print(f" > Loading features from: {c.feature_path}") eval_data, train_data = load_wav_feat_data( - c.data_path, c.feature_path, c.eval_split_size - ) + c.data_path, c.feature_path, c.eval_split_size) else: eval_data, train_data = load_wav_data(c.data_path, c.eval_split_size) @@ -491,63 +460,68 @@ def main(args): # pylint: disable=redefined-outer-name # setup optimizers optimizer_gen = RAdam(model_gen.parameters(), lr=c.lr_gen, weight_decay=0) - optimizer_disc = RAdam(model_disc.parameters(), lr=c.lr_disc, weight_decay=0) + optimizer_disc = RAdam(model_disc.parameters(), + lr=c.lr_disc, + weight_decay=0) # schedulers scheduler_gen = None scheduler_disc = None - if "lr_scheduler_gen" in c: + if 'lr_scheduler_gen' in c: scheduler_gen = getattr(torch.optim.lr_scheduler, c.lr_scheduler_gen) - scheduler_gen = scheduler_gen(optimizer_gen, **c.lr_scheduler_gen_params) - if "lr_scheduler_disc" in c: + scheduler_gen = scheduler_gen( + optimizer_gen, **c.lr_scheduler_gen_params) + if 'lr_scheduler_disc' in c: scheduler_disc = getattr(torch.optim.lr_scheduler, c.lr_scheduler_disc) - scheduler_disc = scheduler_disc(optimizer_disc, **c.lr_scheduler_disc_params) + scheduler_disc = scheduler_disc( + optimizer_disc, **c.lr_scheduler_disc_params) # setup criterion criterion_gen = GeneratorLoss(c) criterion_disc = DiscriminatorLoss(c) if args.restore_path: - checkpoint = torch.load(args.restore_path, map_location="cpu") + checkpoint = torch.load(args.restore_path, map_location='cpu') try: print(" > Restoring Generator Model...") - model_gen.load_state_dict(checkpoint["model"]) + model_gen.load_state_dict(checkpoint['model']) print(" > Restoring Generator Optimizer...") - optimizer_gen.load_state_dict(checkpoint["optimizer"]) + optimizer_gen.load_state_dict(checkpoint['optimizer']) print(" > Restoring Discriminator Model...") - model_disc.load_state_dict(checkpoint["model_disc"]) + model_disc.load_state_dict(checkpoint['model_disc']) print(" > Restoring Discriminator Optimizer...") - optimizer_disc.load_state_dict(checkpoint["optimizer_disc"]) - if "scheduler" in checkpoint: + optimizer_disc.load_state_dict(checkpoint['optimizer_disc']) + if 'scheduler' in checkpoint: print(" > Restoring Generator LR Scheduler...") - scheduler_gen.load_state_dict(checkpoint["scheduler"]) + scheduler_gen.load_state_dict(checkpoint['scheduler']) # NOTE: Not sure if necessary scheduler_gen.optimizer = optimizer_gen - if "scheduler_disc" in checkpoint: + if 'scheduler_disc' in checkpoint: print(" > Restoring Discriminator LR Scheduler...") - scheduler_disc.load_state_dict(checkpoint["scheduler_disc"]) + scheduler_disc.load_state_dict(checkpoint['scheduler_disc']) scheduler_disc.optimizer = optimizer_disc except RuntimeError: # retore only matching layers. 
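The RuntimeError fallback just below restores only the checkpoint tensors whose names and shapes still match the current model (via set_init_dict). A simplified, self-contained sketch of that pattern with plain PyTorch state dicts; set_init_dict itself lives in TTS.utils.generic_utils and may do more than this:

    from torch import nn

    model = nn.Sequential(nn.Linear(4, 8), nn.Linear(8, 2))
    checkpoint = {"model": nn.Sequential(nn.Linear(4, 8), nn.Linear(8, 3)).state_dict()}

    model_dict = model.state_dict()
    matched = {k: v for k, v in checkpoint["model"].items()
               if k in model_dict and v.shape == model_dict[k].shape}
    model_dict.update(matched)                 # keep current weights for mismatched layers
    model.load_state_dict(model_dict)
    print(f" > {len(matched)}/{len(model_dict)} tensors restored")   # 2/4 here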
print(" > Partial model initialization...") model_dict = model_gen.state_dict() - model_dict = set_init_dict(model_dict, checkpoint["model"], c) + model_dict = set_init_dict(model_dict, checkpoint['model'], c) model_gen.load_state_dict(model_dict) model_dict = model_disc.state_dict() - model_dict = set_init_dict(model_dict, checkpoint["model_disc"], c) + model_dict = set_init_dict(model_dict, checkpoint['model_disc'], c) model_disc.load_state_dict(model_dict) del model_dict # reset lr if not countinuining training. for group in optimizer_gen.param_groups: - group["lr"] = c.lr_gen + group['lr'] = c.lr_gen for group in optimizer_disc.param_groups: - group["lr"] = c.lr_disc + group['lr'] = c.lr_disc - print(" > Model restored from step %d" % checkpoint["step"], flush=True) - args.restore_step = checkpoint["step"] + print(" > Model restored from step %d" % checkpoint['step'], + flush=True) + args.restore_step = checkpoint['step'] else: args.restore_step = 0 @@ -566,92 +540,74 @@ def main(args): # pylint: disable=redefined-outer-name num_params = count_parameters(model_disc) print(" > Discriminator has {} parameters".format(num_params), flush=True) - if "best_loss" not in locals(): - best_loss = float("inf") + if 'best_loss' not in locals(): + best_loss = float('inf') global_step = args.restore_step for epoch in range(0, c.epochs): c_logger.print_epoch_start(epoch, c.epochs) - _, global_step = train( - model_gen, - criterion_gen, - optimizer_gen, - model_disc, - criterion_disc, - optimizer_disc, - scheduler_gen, - scheduler_disc, - ap, - global_step, - epoch, - ) - eval_avg_loss_dict = evaluate( - model_gen, criterion_gen, model_disc, criterion_disc, ap, global_step, epoch - ) + _, global_step = train(model_gen, criterion_gen, optimizer_gen, + model_disc, criterion_disc, optimizer_disc, + scheduler_gen, scheduler_disc, ap, global_step, + epoch) + eval_avg_loss_dict = evaluate(model_gen, criterion_gen, model_disc, criterion_disc, ap, + global_step, epoch) c_logger.print_epoch_end(epoch, eval_avg_loss_dict) target_loss = eval_avg_loss_dict[c.target_loss] - best_loss = save_best_model( - target_loss, - best_loss, - model_gen, - optimizer_gen, - scheduler_gen, - model_disc, - optimizer_disc, - scheduler_disc, - global_step, - epoch, - OUT_PATH, - model_losses=eval_avg_loss_dict, - ) + best_loss = save_best_model(target_loss, + best_loss, + model_gen, + optimizer_gen, + scheduler_gen, + model_disc, + optimizer_disc, + scheduler_disc, + global_step, + epoch, + OUT_PATH, + model_losses=eval_avg_loss_dict) -if __name__ == "__main__": +if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument( - "--continue_path", + '--continue_path', type=str, help='Training output folder to continue training. Use to continue a training. If it is used, "config_path" is ignored.', - default="", - required="--config_path" not in sys.argv, - ) + default='', + required='--config_path' not in sys.argv) parser.add_argument( - "--restore_path", + '--restore_path', type=str, - help="Model file to be restored. Use to finetune a model.", - default="", - ) - parser.add_argument( - "--config_path", - type=str, - help="Path to config file for training.", - required="--continue_path" not in sys.argv, - ) - parser.add_argument( - "--debug", - type=bool, - default=False, - help="Do not verify commit integrity to run training.", - ) + help='Model file to be restored. 
Use to finetune a model.', + default='') + parser.add_argument('--config_path', + type=str, + help='Path to config file for training.', + required='--continue_path' not in sys.argv) + parser.add_argument('--debug', + type=bool, + default=False, + help='Do not verify commit integrity to run training.') # DISTRUBUTED parser.add_argument( - "--rank", + '--rank', type=int, default=0, - help="DISTRIBUTED: process rank for distributed training.", - ) - parser.add_argument( - "--group_id", type=str, default="", help="DISTRIBUTED: process group id." - ) + help='DISTRIBUTED: process rank for distributed training.') + parser.add_argument('--group_id', + type=str, + default="", + help='DISTRIBUTED: process group id.') args = parser.parse_args() - if args.continue_path != "": + if args.continue_path != '': args.output_path = args.continue_path - args.config_path = os.path.join(args.continue_path, "config.json") + args.config_path = os.path.join(args.continue_path, 'config.json') list_of_files = glob.glob( - args.continue_path + "/*.pth.tar" - ) # * means all if need specific format then *.csv + args.continue_path + + "/*.pth.tar") # * means all if need specific format then *.csv latest_model_file = max(list_of_files, key=os.path.getctime) args.restore_path = latest_model_file print(f" > Training continues for {args.restore_path}") @@ -662,10 +618,11 @@ if __name__ == "__main__": _ = os.path.dirname(os.path.realpath(__file__)) OUT_PATH = args.continue_path - if args.continue_path == "": - OUT_PATH = create_experiment_folder(c.output_path, c.run_name, args.debug) + if args.continue_path == '': + OUT_PATH = create_experiment_folder(c.output_path, c.run_name, + args.debug) - AUDIO_PATH = os.path.join(OUT_PATH, "test_audios") + AUDIO_PATH = os.path.join(OUT_PATH, 'test_audios') c_logger = ConsoleLogger() @@ -675,17 +632,16 @@ if __name__ == "__main__": if args.restore_path: new_fields["restore_path"] = args.restore_path new_fields["github_branch"] = get_git_branch() - copy_config_file( - args.config_path, os.path.join(OUT_PATH, "config.json"), new_fields - ) + copy_config_file(args.config_path, + os.path.join(OUT_PATH, 'config.json'), new_fields) os.chmod(AUDIO_PATH, 0o775) os.chmod(OUT_PATH, 0o775) LOG_DIR = OUT_PATH - tb_logger = TensorboardLogger(LOG_DIR, model_name="VOCODER") + tb_logger = TensorboardLogger(LOG_DIR, model_name='VOCODER') # write model desc to tensorboard - tb_logger.tb_add_text("model-description", c["run_description"], 0) + tb_logger.tb_add_text('model-description', c['run_description'], 0) try: main(args) @@ -698,4 +654,4 @@ if __name__ == "__main__": except Exception: # pylint: disable=broad-except remove_experiment_folder(OUT_PATH) traceback.print_exc() - sys.exit(1) \ No newline at end of file + sys.exit(1) diff --git a/TTS/vocoder/models/wavernn.py b/TTS/vocoder/models/wavernn.py index 9b637a6a..4d1a633c 100644 --- a/TTS/vocoder/models/wavernn.py +++ b/TTS/vocoder/models/wavernn.py @@ -365,28 +365,6 @@ class WaveRNN(nn.Module): (i * b_size, seq_len * b_size, b_size, gen_rate, realtime_ratio), ) - @staticmethod - def get_gru_cell(gru): - gru_cell = nn.GRUCell(gru.input_size, gru.hidden_size) - gru_cell.weight_hh.data = gru.weight_hh_l0.data - gru_cell.weight_ih.data = gru.weight_ih_l0.data - gru_cell.bias_hh.data = gru.bias_hh_l0.data - gru_cell.bias_ih.data = gru.bias_ih_l0.data - return gru_cell - - @staticmethod - def pad_tensor(x, pad, side="both"): - # NB - this is just a quick method i need right now - # i.e., it won't generalise to other shapes/dims - b, t, c = x.size() - 
total = t + 2 * pad if side == "both" else t + pad - padded = torch.zeros(b, total, c).cuda() - if side in ("before", "both"): - padded[:, pad : pad + t, :] = x - elif side == "after": - padded[:, :t, :] = x - return padded - def fold_with_overlap(self, x, target, overlap): """Fold the tensor with overlap for quick batched inference. @@ -430,7 +408,30 @@ class WaveRNN(nn.Module): return folded - def xfade_and_unfold(self, y, target, overlap): + @staticmethod + def get_gru_cell(gru): + gru_cell = nn.GRUCell(gru.input_size, gru.hidden_size) + gru_cell.weight_hh.data = gru.weight_hh_l0.data + gru_cell.weight_ih.data = gru.weight_ih_l0.data + gru_cell.bias_hh.data = gru.bias_hh_l0.data + gru_cell.bias_ih.data = gru.bias_ih_l0.data + return gru_cell + + @staticmethod + def pad_tensor(x, pad, side="both"): + # NB - this is just a quick method i need right now + # i.e., it won't generalise to other shapes/dims + b, t, c = x.size() + total = t + 2 * pad if side == "both" else t + pad + padded = torch.zeros(b, total, c).cuda() + if side in ("before", "both"): + padded[:, pad : pad + t, :] = x + elif side == "after": + padded[:, :t, :] = x + return padded + + @staticmethod + def xfade_and_unfold(y, target, overlap): """Applies a crossfade and unfolds into a 1d array. Args: diff --git a/TTS/vocoder/utils/distribution.py b/TTS/vocoder/utils/distribution.py index 705c14dc..6aba5e34 100644 --- a/TTS/vocoder/utils/distribution.py +++ b/TTS/vocoder/utils/distribution.py @@ -28,7 +28,8 @@ def sample_from_gaussian(y_hat, log_std_min=-7.0, scale_factor=1.0): torch.exp(log_std), ) sample = dist.sample() - sample = torch.clamp(torch.clamp(sample, min=-scale_factor), max=scale_factor) + sample = torch.clamp(torch.clamp( + sample, min=-scale_factor), max=scale_factor) del dist return sample @@ -58,8 +59,9 @@ def discretized_mix_logistic_loss( # unpack parameters. 
(B, T, num_mixtures) x 3 logit_probs = y_hat[:, :, :nr_mix] - means = y_hat[:, :, nr_mix : 2 * nr_mix] - log_scales = torch.clamp(y_hat[:, :, 2 * nr_mix : 3 * nr_mix], min=log_scale_min) + means = y_hat[:, :, nr_mix: 2 * nr_mix] + log_scales = torch.clamp( + y_hat[:, :, 2 * nr_mix: 3 * nr_mix], min=log_scale_min) # B x T x 1 -> B x T x num_mixtures y = y.expand_as(means) @@ -104,7 +106,8 @@ def discretized_mix_logistic_loss( ) + (1.0 - inner_inner_cond) * (log_pdf_mid - np.log((num_classes - 1) / 2)) inner_cond = (y > 0.999).float() inner_out = ( - inner_cond * log_one_minus_cdf_min + (1.0 - inner_cond) * inner_inner_out + inner_cond * log_one_minus_cdf_min + + (1.0 - inner_cond) * inner_inner_out ) cond = (y < -0.999).float() log_probs = cond * log_cdf_plus + (1.0 - cond) * inner_out @@ -142,9 +145,9 @@ def sample_from_discretized_mix_logistic(y, log_scale_min=None): # (B, T) -> (B, T, nr_mix) one_hot = to_one_hot(argmax, nr_mix) # select logistic parameters - means = torch.sum(y[:, :, nr_mix : 2 * nr_mix] * one_hot, dim=-1) + means = torch.sum(y[:, :, nr_mix: 2 * nr_mix] * one_hot, dim=-1) log_scales = torch.clamp( - torch.sum(y[:, :, 2 * nr_mix : 3 * nr_mix] * one_hot, dim=-1), min=log_scale_min + torch.sum(y[:, :, 2 * nr_mix: 3 * nr_mix] * one_hot, dim=-1), min=log_scale_min ) # sample from logistic & clip to interval # we don't actually round to the nearest 8bit value when sampling diff --git a/TTS/vocoder/utils/generic_utils.py b/TTS/vocoder/utils/generic_utils.py index c73c5248..c16fa1ae 100644 --- a/TTS/vocoder/utils/generic_utils.py +++ b/TTS/vocoder/utils/generic_utils.py @@ -39,7 +39,7 @@ def plot_results(y_hat, y, ap, global_step, name_prefix): def to_camel(text): text = text.capitalize() - return re.sub(r"(?!^)_([a-zA-Z])", lambda m: m.group(1).upper(), text) + return re.sub(r'(?!^)_([a-zA-Z])', lambda m: m.group(1).upper(), text) def setup_wavernn(c): @@ -67,101 +67,92 @@ def setup_wavernn(c): def setup_generator(c): print(" > Generator Model: {}".format(c.generator_model)) - MyModel = importlib.import_module("TTS.vocoder.models." + c.generator_model.lower()) + MyModel = importlib.import_module('TTS.vocoder.models.' 
+ + c.generator_model.lower()) MyModel = getattr(MyModel, to_camel(c.generator_model)) - if c.generator_model in "melgan_generator": + if c.generator_model in 'melgan_generator': model = MyModel( - in_channels=c.audio["num_mels"], + in_channels=c.audio['num_mels'], out_channels=1, proj_kernel=7, base_channels=512, - upsample_factors=c.generator_model_params["upsample_factors"], + upsample_factors=c.generator_model_params['upsample_factors'], res_kernel=3, - num_res_blocks=c.generator_model_params["num_res_blocks"], - ) - if c.generator_model in "melgan_fb_generator": + num_res_blocks=c.generator_model_params['num_res_blocks']) + if c.generator_model in 'melgan_fb_generator': pass - if c.generator_model in "multiband_melgan_generator": + if c.generator_model in 'multiband_melgan_generator': model = MyModel( - in_channels=c.audio["num_mels"], + in_channels=c.audio['num_mels'], out_channels=4, proj_kernel=7, base_channels=384, - upsample_factors=c.generator_model_params["upsample_factors"], + upsample_factors=c.generator_model_params['upsample_factors'], res_kernel=3, - num_res_blocks=c.generator_model_params["num_res_blocks"], - ) - if c.generator_model in "fullband_melgan_generator": + num_res_blocks=c.generator_model_params['num_res_blocks']) + if c.generator_model in 'fullband_melgan_generator': model = MyModel( - in_channels=c.audio["num_mels"], + in_channels=c.audio['num_mels'], out_channels=1, proj_kernel=7, base_channels=512, - upsample_factors=c.generator_model_params["upsample_factors"], + upsample_factors=c.generator_model_params['upsample_factors'], res_kernel=3, - num_res_blocks=c.generator_model_params["num_res_blocks"], - ) - if c.generator_model in "parallel_wavegan_generator": + num_res_blocks=c.generator_model_params['num_res_blocks']) + if c.generator_model in 'parallel_wavegan_generator': model = MyModel( in_channels=1, out_channels=1, kernel_size=3, - num_res_blocks=c.generator_model_params["num_res_blocks"], - stacks=c.generator_model_params["stacks"], + num_res_blocks=c.generator_model_params['num_res_blocks'], + stacks=c.generator_model_params['stacks'], res_channels=64, gate_channels=128, skip_channels=64, - aux_channels=c.audio["num_mels"], + aux_channels=c.audio['num_mels'], dropout=0.0, bias=True, use_weight_norm=True, - upsample_factors=c.generator_model_params["upsample_factors"], - ) + upsample_factors=c.generator_model_params['upsample_factors']) return model def setup_discriminator(c): print(" > Discriminator Model: {}".format(c.discriminator_model)) - if "parallel_wavegan" in c.discriminator_model: + if 'parallel_wavegan' in c.discriminator_model: MyModel = importlib.import_module( - "TTS.vocoder.models.parallel_wavegan_discriminator" - ) + 'TTS.vocoder.models.parallel_wavegan_discriminator') else: - MyModel = importlib.import_module( - "TTS.vocoder.models." + c.discriminator_model.lower() - ) + MyModel = importlib.import_module('TTS.vocoder.models.' 
+ + c.discriminator_model.lower()) MyModel = getattr(MyModel, to_camel(c.discriminator_model.lower())) - if c.discriminator_model in "random_window_discriminator": + if c.discriminator_model in 'random_window_discriminator': model = MyModel( - cond_channels=c.audio["num_mels"], - hop_length=c.audio["hop_length"], - uncond_disc_donwsample_factors=c.discriminator_model_params[ - "uncond_disc_donwsample_factors" - ], - cond_disc_downsample_factors=c.discriminator_model_params[ - "cond_disc_downsample_factors" - ], - cond_disc_out_channels=c.discriminator_model_params[ - "cond_disc_out_channels" - ], - window_sizes=c.discriminator_model_params["window_sizes"], - ) - if c.discriminator_model in "melgan_multiscale_discriminator": + cond_channels=c.audio['num_mels'], + hop_length=c.audio['hop_length'], + uncond_disc_donwsample_factors=c. + discriminator_model_params['uncond_disc_donwsample_factors'], + cond_disc_downsample_factors=c. + discriminator_model_params['cond_disc_downsample_factors'], + cond_disc_out_channels=c. + discriminator_model_params['cond_disc_out_channels'], + window_sizes=c.discriminator_model_params['window_sizes']) + if c.discriminator_model in 'melgan_multiscale_discriminator': model = MyModel( in_channels=1, out_channels=1, kernel_sizes=(5, 3), - base_channels=c.discriminator_model_params["base_channels"], - max_channels=c.discriminator_model_params["max_channels"], - downsample_factors=c.discriminator_model_params["downsample_factors"], - ) - if c.discriminator_model == "residual_parallel_wavegan_discriminator": + base_channels=c.discriminator_model_params['base_channels'], + max_channels=c.discriminator_model_params['max_channels'], + downsample_factors=c. + discriminator_model_params['downsample_factors']) + if c.discriminator_model == 'residual_parallel_wavegan_discriminator': model = MyModel( in_channels=1, out_channels=1, kernel_size=3, - num_layers=c.discriminator_model_params["num_layers"], - stacks=c.discriminator_model_params["stacks"], + num_layers=c.discriminator_model_params['num_layers'], + stacks=c.discriminator_model_params['stacks'], res_channels=64, gate_channels=128, skip_channels=64, @@ -170,17 +161,17 @@ def setup_discriminator(c): nonlinear_activation="LeakyReLU", nonlinear_activation_params={"negative_slope": 0.2}, ) - if c.discriminator_model == "parallel_wavegan_discriminator": + if c.discriminator_model == 'parallel_wavegan_discriminator': model = MyModel( in_channels=1, out_channels=1, kernel_size=3, - num_layers=c.discriminator_model_params["num_layers"], + num_layers=c.discriminator_model_params['num_layers'], conv_channels=64, dilation_factor=1, nonlinear_activation="LeakyReLU", nonlinear_activation_params={"negative_slope": 0.2}, - bias=True, + bias=True ) return model From 9270e27cd7df82f5967174da18e0e92967674120 Mon Sep 17 00:00:00 2001 From: sanjaesc Date: Thu, 22 Oct 2020 10:39:20 +0200 Subject: [PATCH 11/16] add wavernn tests + name refactoring --- tests/inputs/test_vocoder_wavernn_config.json | 94 +++++++++++++++++++ ...tasets.py => test_vocoder_gan_datasets.py} | 0 ...der_train.sh => test_vocoder_gan_train.sh} | 4 +- tests/test_vocoder_wavernn.py | 31 ++++++ tests/test_vocoder_wavernn_datasets.py | 91 ++++++++++++++++++ tests/test_vocoder_wavernn_train.sh | 15 +++ 6 files changed, 233 insertions(+), 2 deletions(-) create mode 100644 tests/inputs/test_vocoder_wavernn_config.json rename tests/{test_vocoder_datasets.py => test_vocoder_gan_datasets.py} (100%) rename tests/{test_vocoder_train.sh => test_vocoder_gan_train.sh} (57%) create 
mode 100644 tests/test_vocoder_wavernn.py create mode 100644 tests/test_vocoder_wavernn_datasets.py create mode 100755 tests/test_vocoder_wavernn_train.sh diff --git a/tests/inputs/test_vocoder_wavernn_config.json b/tests/inputs/test_vocoder_wavernn_config.json new file mode 100644 index 00000000..28c0f059 --- /dev/null +++ b/tests/inputs/test_vocoder_wavernn_config.json @@ -0,0 +1,94 @@ +{ + "run_name": "wavernn_test", + "run_description": "wavernn_test training", + + // AUDIO PARAMETERS + "audio":{ + "fft_size": 1024, // number of stft frequency levels. Size of the linear spectogram frame. + "win_length": 1024, // stft window length in ms. + "hop_length": 256, // stft window hop-lengh in ms. + "frame_length_ms": null, // stft window length in ms.If null, 'win_length' is used. + "frame_shift_ms": null, // stft window hop-lengh in ms. If null, 'hop_length' is used. + + // Audio processing parameters + "sample_rate": 22050, // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled. + "preemphasis": 0.0, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis. + "ref_level_db": 0, // reference level db, theoretically 20db is the sound of air. + + // Silence trimming + "do_trim_silence": true,// enable trimming of slience of audio as you load it. LJspeech (false), TWEB (false), Nancy (true) + "trim_db": 60, // threshold for timming silence. Set this according to your dataset. + + // MelSpectrogram parameters + "num_mels": 80, // size of the mel spec frame. + "mel_fmin": 0.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!! + "mel_fmax": 8000.0, // maximum freq level for mel-spec. Tune for dataset!! + "spec_gain": 20.0, // scaler value appplied after log transform of spectrogram. + + // Normalization parameters + "signal_norm": true, // normalize spec values. Mean-Var normalization if 'stats_path' is defined otherwise range normalization defined by the other params. + "min_level_db": -100, // lower bound for normalization + "symmetric_norm": true, // move normalization to range [-1, 1] + "max_norm": 4.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm] + "clip_norm": true, // clip normalized values into the range. + "stats_path": null // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. 
If it is defined, mean-std based notmalization is used and other normalization params are ignored + }, + + // Generating / Synthesizing + "batched": true, + "target_samples": 11000, // target number of samples to be generated in each batch entry + "overlap_samples": 550, // number of samples for crossfading between batches + + // DISTRIBUTED TRAINING + // "distributed":{ + // "backend": "nccl", + // "url": "tcp:\/\/localhost:54321" + // }, + + // MODEL PARAMETERS + "use_aux_net": true, + "use_upsample_net": true, + "upsample_factors": [4, 8, 8], // this needs to correctly factorise hop_length + "seq_len": 1280, // has to be devideable by hop_length + "mode": "mold", // mold [string], gauss [string], bits [int] + "mulaw": false, // apply mulaw if mode is bits + "padding": 2, // pad the input for resnet to see wider input length + + // DATASET + //"use_gta": true, // use computed gta features from the tts model + "data_path": "tests/data/ljspeech/wavs/", // path containing training wav files + "feature_path": null, // path containing computed features from wav files if null compute them + + // TRAINING + "batch_size": 4, // Batch size for training. Lower values than 32 might cause hard to learn attention. + "epochs": 1, // total number of epochs to train. + + // VALIDATION + "run_eval": true, + "test_every_epochs": 10, // Test after set number of epochs (Test every 20 epochs for example) + + // OPTIMIZER + "grad_clip": 4, // apply gradient clipping if > 0 + "lr_scheduler": "MultiStepLR", // one of the schedulers from https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate + "lr_scheduler_params": { + "gamma": 0.5, + "milestones": [200000, 400000, 600000] + }, + "lr": 1e-4, // initial learning rate + + // TENSORBOARD and LOGGING + "print_step": 25, // Number of steps to log traning on console. + "print_eval": false, // If True, it prints loss values for each step in eval run. + "save_step": 25000, // Number of training steps expected to plot training stats on TB and save model checkpoints. + "checkpoint": true, // If true, it saves checkpoints per "save_step" + "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging. + + // DATA LOADING + "num_loader_workers": 4, // number of training data loader processes. Don't set it too big. 4-8 are good values. + "num_val_loader_workers": 4, // number of evaluation data loader processes. 
+ "eval_split_size": 10, // number of samples for testing + + // PATHS + "output_path": "tests/train_outputs/" +} + diff --git a/tests/test_vocoder_datasets.py b/tests/test_vocoder_gan_datasets.py similarity index 100% rename from tests/test_vocoder_datasets.py rename to tests/test_vocoder_gan_datasets.py diff --git a/tests/test_vocoder_train.sh b/tests/test_vocoder_gan_train.sh similarity index 57% rename from tests/test_vocoder_train.sh rename to tests/test_vocoder_gan_train.sh index fa99b4bd..75773cc3 100755 --- a/tests/test_vocoder_train.sh +++ b/tests/test_vocoder_gan_train.sh @@ -5,11 +5,11 @@ echo "$BASEDIR" # create run dir mkdir $BASEDIR/train_outputs # run training -CUDA_VISIBLE_DEVICES="" python TTS/bin/train_vocoder.py --config_path $BASEDIR/inputs/test_vocoder_multiband_melgan_config.json +CUDA_VISIBLE_DEVICES="" python TTS/bin/train_gan_vocoder.py --config_path $BASEDIR/inputs/test_vocoder_multiband_melgan_config.json # find the training folder LATEST_FOLDER=$(ls $BASEDIR/train_outputs/| sort | tail -1) echo $LATEST_FOLDER # continue the previous training -CUDA_VISIBLE_DEVICES="" python TTS/bin/train_vocoder.py --continue_path $BASEDIR/train_outputs/$LATEST_FOLDER +CUDA_VISIBLE_DEVICES="" python TTS/bin/train_gan_vocoder.py --continue_path $BASEDIR/train_outputs/$LATEST_FOLDER # remove all the outputs rm -rf $BASEDIR/train_outputs/$LATEST_FOLDER diff --git a/tests/test_vocoder_wavernn.py b/tests/test_vocoder_wavernn.py new file mode 100644 index 00000000..fdb338f9 --- /dev/null +++ b/tests/test_vocoder_wavernn.py @@ -0,0 +1,31 @@ +import numpy as np +import torch +import random +from TTS.vocoder.models.wavernn import WaveRNN + + +def test_wavernn(): + model = WaveRNN( + rnn_dims=512, + fc_dims=512, + mode=10, + mulaw=False, + pad=2, + use_aux_net=True, + use_upsample_net=True, + upsample_factors=[4, 8, 8], + feat_dims=80, + compute_dims=128, + res_out_dims=128, + res_blocks=10, + hop_length=256, + sample_rate=22050, + ) + dummy_x = torch.rand((2, 1280)) + dummy_m = torch.rand((2, 80, 9)) + y_size = random.randrange(20, 60) + dummy_y = torch.rand((80, y_size)) + output = model(dummy_x, dummy_m) + assert np.all(output.shape == (2, 1280, 4 * 256)), output.shape + output = model.generate(dummy_y, True, 5500, 550, False) + assert np.all(output.shape == (256 * (y_size - 1),)) diff --git a/tests/test_vocoder_wavernn_datasets.py b/tests/test_vocoder_wavernn_datasets.py new file mode 100644 index 00000000..0f4e939a --- /dev/null +++ b/tests/test_vocoder_wavernn_datasets.py @@ -0,0 +1,91 @@ +import os +import shutil + +import numpy as np +from tests import get_tests_path, get_tests_input_path, get_tests_output_path +from torch.utils.data import DataLoader + +from TTS.utils.audio import AudioProcessor +from TTS.utils.io import load_config +from TTS.vocoder.datasets.wavernn_dataset import WaveRNNDataset +from TTS.vocoder.datasets.preprocess import load_wav_feat_data, preprocess_wav_files + +file_path = os.path.dirname(os.path.realpath(__file__)) +OUTPATH = os.path.join(get_tests_output_path(), "loader_tests/") +os.makedirs(OUTPATH, exist_ok=True) + +C = load_config(os.path.join(get_tests_input_path(), + "test_vocoder_wavernn_config.json")) + +test_data_path = os.path.join(get_tests_path(), "data/ljspeech/") +test_mel_feat_path = os.path.join(test_data_path, "mel") +test_quant_feat_path = os.path.join(test_data_path, "quant") +ok_ljspeech = os.path.exists(test_data_path) + + +def wavernn_dataset_case(batch_size, seq_len, hop_len, pad, mode, num_workers): + """ run dataloader with given 
parameters and check conditions """ + ap = AudioProcessor(**C.audio) + + C.batch_size = batch_size + C.mode = mode + C.seq_len = seq_len + C.data_path = test_data_path + + preprocess_wav_files(test_data_path, C, ap) + _, train_items = load_wav_feat_data( + test_data_path, test_mel_feat_path, 5) + + dataset = WaveRNNDataset(ap=ap, + items=train_items, + seq_len=seq_len, + hop_len=hop_len, + pad=pad, + mode=mode, + ) + # sampler = DistributedSampler(dataset) if num_gpus > 1 else None + loader = DataLoader(dataset, + shuffle=True, + collate_fn=dataset.collate, + batch_size=batch_size, + num_workers=num_workers, + pin_memory=True, + ) + + max_iter = 10 + count_iter = 0 + + try: + for data in loader: + x_input, mels, _ = data + expected_feat_shape = (ap.num_mels, + (x_input.shape[-1] // hop_len) + (pad * 2)) + assert np.all( + mels.shape[1:] == expected_feat_shape), f" [!] {mels.shape} vs {expected_feat_shape}" + + assert (mels.shape[2] - pad * 2) * hop_len == x_input.shape[1] + count_iter += 1 + if count_iter == max_iter: + break + # except AssertionError: + # shutil.rmtree(test_mel_feat_path) + # shutil.rmtree(test_quant_feat_path) + finally: + shutil.rmtree(test_mel_feat_path) + shutil.rmtree(test_quant_feat_path) + + +def test_parametrized_wavernn_dataset(): + ''' test dataloader with different parameters ''' + params = [ + [16, C.audio['hop_length'] * 10, C.audio['hop_length'], 2, 10, 0], + [16, C.audio['hop_length'] * 10, C.audio['hop_length'], 2, "mold", 4], + [1, C.audio['hop_length'] * 10, C.audio['hop_length'], 2, 9, 0], + [1, C.audio['hop_length'], C.audio['hop_length'], 2, 10, 0], + [1, C.audio['hop_length'], C.audio['hop_length'], 2, "mold", 0], + [1, C.audio['hop_length'] * 5, C.audio['hop_length'], 4, 10, 2], + [1, C.audio['hop_length'] * 5, C.audio['hop_length'], 2, "mold", 0], + ] + for param in params: + print(param) + wavernn_dataset_case(*param) diff --git a/tests/test_vocoder_wavernn_train.sh b/tests/test_vocoder_wavernn_train.sh new file mode 100755 index 00000000..f2e32116 --- /dev/null +++ b/tests/test_vocoder_wavernn_train.sh @@ -0,0 +1,15 @@ +#!/usr/bin/env bash + +BASEDIR=$(dirname "$0") +echo "$BASEDIR" +# create run dir +mkdir $BASEDIR/train_outputs +# run training +CUDA_VISIBLE_DEVICES="" python TTS/bin/train_wavernn_vocoder.py --config_path $BASEDIR/inputs/test_vocoder_wavernn_config.json +# find the training folder +LATEST_FOLDER=$(ls $BASEDIR/train_outputs/| sort | tail -1) +echo $LATEST_FOLDER +# continue the previous training +CUDA_VISIBLE_DEVICES="" python TTS/bin/train_wavernn_vocoder.py --continue_path $BASEDIR/train_outputs/$LATEST_FOLDER +# remove all the outputs +rm -rf $BASEDIR/train_outputs/$LATEST_FOLDER \ No newline at end of file From 6245dd2b93a1c58215fc73c96274ca99c02ccf33 Mon Sep 17 00:00:00 2001 From: sanjaesc Date: Thu, 22 Oct 2020 10:44:00 +0200 Subject: [PATCH 12/16] added to device cpu/gpu + formatting --- TTS/bin/train_wavernn_vocoder.py | 182 ++++++++++++------------ TTS/vocoder/datasets/wavernn_dataset.py | 34 ++--- TTS/vocoder/models/wavernn.py | 66 +++++---- 3 files changed, 145 insertions(+), 137 deletions(-) diff --git a/TTS/bin/train_wavernn_vocoder.py b/TTS/bin/train_wavernn_vocoder.py index 78984510..66a7c913 100644 --- a/TTS/bin/train_wavernn_vocoder.py +++ b/TTS/bin/train_wavernn_vocoder.py @@ -44,43 +44,41 @@ def setup_loader(ap, is_val=False, verbose=False): if is_val and not CONFIG.run_eval: loader = None else: - dataset = WaveRNNDataset( - ap=ap, - items=eval_data if is_val else train_data, - seq_len=CONFIG.seq_len, - 
hop_len=ap.hop_length, - pad=CONFIG.padding, - mode=CONFIG.mode, - is_training=not is_val, - verbose=verbose, - ) + dataset = WaveRNNDataset(ap=ap, + items=eval_data if is_val else train_data, + seq_len=CONFIG.seq_len, + hop_len=ap.hop_length, + pad=CONFIG.padding, + mode=CONFIG.mode, + is_training=not is_val, + verbose=verbose, + ) # sampler = DistributedSampler(dataset) if num_gpus > 1 else None - loader = DataLoader( - dataset, - shuffle=True, - collate_fn=dataset.collate, - batch_size=CONFIG.batch_size, - num_workers=CONFIG.num_val_loader_workers - if is_val - else CONFIG.num_loader_workers, - pin_memory=True, - ) + loader = DataLoader(dataset, + shuffle=True, + collate_fn=dataset.collate, + batch_size=CONFIG.batch_size, + num_workers=CONFIG.num_val_loader_workers + if is_val + else CONFIG.num_loader_workers, + pin_memory=True, + ) return loader def format_data(data): # setup input data - x = data[0] - m = data[1] - y = data[2] + x_input = data[0] + mels = data[1] + y_coarse = data[2] # dispatch data to GPU if use_cuda: - x = x.cuda(non_blocking=True) - m = m.cuda(non_blocking=True) - y = y.cuda(non_blocking=True) + x_input = x_input.cuda(non_blocking=True) + mels = mels.cuda(non_blocking=True) + y_coarse = y_coarse.cuda(non_blocking=True) - return x, m, y + return x_input, mels, y_coarse def train(model, optimizer, criterion, scheduler, ap, global_step, epoch): @@ -90,7 +88,8 @@ def train(model, optimizer, criterion, scheduler, ap, global_step, epoch): epoch_time = 0 keep_avg = KeepAverage() if use_cuda: - batch_n_iter = int(len(data_loader.dataset) / (CONFIG.batch_size * num_gpus)) + batch_n_iter = int(len(data_loader.dataset) / + (CONFIG.batch_size * num_gpus)) else: batch_n_iter = int(len(data_loader.dataset) / CONFIG.batch_size) end_time = time.time() @@ -99,30 +98,31 @@ def train(model, optimizer, criterion, scheduler, ap, global_step, epoch): print(" > Training", flush=True) for num_iter, data in enumerate(data_loader): start_time = time.time() - x, m, y = format_data(data) + x_input, mels, y_coarse = format_data(data) loader_time = time.time() - end_time global_step += 1 ################## # MODEL TRAINING # ################## - y_hat = model(x, m) + y_hat = model(x_input, mels) if isinstance(model.mode, int): y_hat = y_hat.transpose(1, 2).unsqueeze(-1) else: - y = y.float() - y = y.unsqueeze(-1) + y_coarse = y_coarse.float() + y_coarse = y_coarse.unsqueeze(-1) # m_scaled, _ = model.upsample(m) # compute losses - loss = criterion(y_hat, y) + loss = criterion(y_hat, y_coarse) if loss.item() is None: raise RuntimeError(" [!] None loss. 
Exiting ...") optimizer.zero_grad() loss.backward() if CONFIG.grad_clip > 0: - torch.nn.utils.clip_grad_norm_(model.parameters(), CONFIG.grad_clip) + torch.nn.utils.clip_grad_norm_( + model.parameters(), CONFIG.grad_clip) optimizer.step() if scheduler is not None: @@ -145,19 +145,17 @@ def train(model, optimizer, criterion, scheduler, ap, global_step, epoch): # print training stats if global_step % CONFIG.print_step == 0: - log_dict = { - "step_time": [step_time, 2], - "loader_time": [loader_time, 4], - "current_lr": cur_lr, - } - c_logger.print_train_step( - batch_n_iter, - num_iter, - global_step, - log_dict, - loss_dict, - keep_avg.avg_values, - ) + log_dict = {"step_time": [step_time, 2], + "loader_time": [loader_time, 4], + "current_lr": cur_lr, + } + c_logger.print_train_step(batch_n_iter, + num_iter, + global_step, + log_dict, + loss_dict, + keep_avg.avg_values, + ) # plot step stats if global_step % 10 == 0: @@ -169,40 +167,38 @@ def train(model, optimizer, criterion, scheduler, ap, global_step, epoch): if global_step % CONFIG.save_step == 0: if CONFIG.checkpoint: # save model - save_checkpoint( - model, - optimizer, - scheduler, - None, - None, - None, - global_step, - epoch, - OUT_PATH, - model_losses=loss_dict, - ) + save_checkpoint(model, + optimizer, + scheduler, + None, + None, + None, + global_step, + epoch, + OUT_PATH, + model_losses=loss_dict, + ) # synthesize a full voice wav_path = train_data[random.randrange(0, len(train_data))][0] wav = ap.load_wav(wav_path) ground_mel = ap.melspectrogram(wav) - sample_wav = model.generate( - ground_mel, - CONFIG.batched, - CONFIG.target_samples, - CONFIG.overlap_samples, - ) + sample_wav = model.generate(ground_mel, + CONFIG.batched, + CONFIG.target_samples, + CONFIG.overlap_samples, + ) predict_mel = ap.melspectrogram(sample_wav) # compute spectrograms - figures = { - "train/ground_truth": plot_spectrogram(ground_mel.T), - "train/prediction": plot_spectrogram(predict_mel.T), - } + figures = {"train/ground_truth": plot_spectrogram(ground_mel.T), + "train/prediction": plot_spectrogram(predict_mel.T), + } # Sample audio tb_logger.tb_train_audios( - global_step, {"train/audio": sample_wav}, CONFIG.audio["sample_rate"] + global_step, { + "train/audio": sample_wav}, CONFIG.audio["sample_rate"] ) tb_logger.tb_train_figures(global_step, figures) @@ -234,17 +230,17 @@ def evaluate(model, criterion, ap, global_step, epoch): for num_iter, data in enumerate(data_loader): start_time = time.time() # format data - x, m, y = format_data(data) + x_input, mels, y_coarse = format_data(data) loader_time = time.time() - end_time global_step += 1 - y_hat = model(x, m) + y_hat = model(x_input, mels) if isinstance(model.mode, int): y_hat = y_hat.transpose(1, 2).unsqueeze(-1) else: - y = y.float() - y = y.unsqueeze(-1) - loss = criterion(y_hat, y) + y_coarse = y_coarse.float() + y_coarse = y_coarse.unsqueeze(-1) + loss = criterion(y_hat, y_coarse) # Compute avg loss # if num_gpus > 1: # loss = reduce_tensor(loss.data, num_gpus) @@ -264,30 +260,31 @@ def evaluate(model, criterion, ap, global_step, epoch): # print eval stats if CONFIG.print_eval: - c_logger.print_eval_step(num_iter, loss_dict, keep_avg.avg_values) + c_logger.print_eval_step( + num_iter, loss_dict, keep_avg.avg_values) - if epoch % CONFIG.test_every_epochs == 0: + if epoch % CONFIG.test_every_epochs == 0 and epoch != 0: # synthesize a part of data wav_path = eval_data[random.randrange(0, len(eval_data))][0] wav = ap.load_wav(wav_path) ground_mel = ap.melspectrogram(wav[:22000]) - sample_wav = 
model.generate( - ground_mel, - CONFIG.batched, - CONFIG.target_samples, - CONFIG.overlap_samples, - ) + sample_wav = model.generate(ground_mel, + CONFIG.batched, + CONFIG.target_samples, + CONFIG.overlap_samples, + use_cuda + ) predict_mel = ap.melspectrogram(sample_wav) # compute spectrograms - figures = { - "eval/ground_truth": plot_spectrogram(ground_mel.T), - "eval/prediction": plot_spectrogram(predict_mel.T), - } + figures = {"eval/ground_truth": plot_spectrogram(ground_mel.T), + "eval/prediction": plot_spectrogram(predict_mel.T), + } # Sample audio tb_logger.tb_eval_audios( - global_step, {"eval/audio": sample_wav}, CONFIG.audio["sample_rate"] + global_step, { + "eval/audio": sample_wav}, CONFIG.audio["sample_rate"] ) tb_logger.tb_eval_figures(global_step, figures) @@ -372,7 +369,8 @@ def main(args): # pylint: disable=redefined-outer-name model_dict = set_init_dict(model_dict, checkpoint["model"], CONFIG) model_wavernn.load_state_dict(model_dict) - print(" > Model restored from step %d" % checkpoint["step"], flush=True) + print(" > Model restored from step %d" % + checkpoint["step"], flush=True) args.restore_step = checkpoint["step"] else: args.restore_step = 0 @@ -393,7 +391,8 @@ def main(args): # pylint: disable=redefined-outer-name _, global_step = train( model_wavernn, optimizer, criterion, scheduler, ap, global_step, epoch ) - eval_avg_loss_dict = evaluate(model_wavernn, criterion, ap, global_step, epoch) + eval_avg_loss_dict = evaluate( + model_wavernn, criterion, ap, global_step, epoch) c_logger.print_epoch_end(epoch, eval_avg_loss_dict) target_loss = eval_avg_loss_dict["avg_model_loss"] best_loss = save_best_model( @@ -493,7 +492,8 @@ if __name__ == "__main__": tb_logger = TensorboardLogger(LOG_DIR, model_name="VOCODER") # write model desc to tensorboard - tb_logger.tb_add_text("model-description", CONFIG["run_description"], 0) + tb_logger.tb_add_text("model-description", + CONFIG["run_description"], 0) try: main(args) diff --git a/TTS/vocoder/datasets/wavernn_dataset.py b/TTS/vocoder/datasets/wavernn_dataset.py index 5d5b9f15..194344a9 100644 --- a/TTS/vocoder/datasets/wavernn_dataset.py +++ b/TTS/vocoder/datasets/wavernn_dataset.py @@ -8,17 +8,16 @@ class WaveRNNDataset(Dataset): WaveRNN Dataset searchs for all the wav files under root path. 
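     Roughly speaking, each item pairs a mel spectrogram with its audio samples,
     and collate() cuts aligned windows from them: the model input is the audio
     at step t together with the matching mel frames, and the target y_coarse is
     the same audio shifted one sample ahead. With seq_len=1280, hop_len=256 and
     pad=2 the mel window is 1280 // 256 + 2 * 2 = 9 frames, which matches the
     dummy mel shape used in tests/test_vocoder_wavernn.py.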
""" - def __init__( - self, - ap, - items, - seq_len, - hop_len, - pad, - mode, - is_training=True, - verbose=False, - ): + def __init__(self, + ap, + items, + seq_len, + hop_len, + pad, + mode, + is_training=True, + verbose=False, + ): self.ap = ap self.item_list = items @@ -56,17 +55,19 @@ class WaveRNNDataset(Dataset): def collate(self, batch): mel_win = self.seq_len // self.hop_len + 2 * self.pad - max_offsets = [x[0].shape[-1] - (mel_win + 2 * self.pad) for x in batch] + max_offsets = [x[0].shape[-1] - + (mel_win + 2 * self.pad) for x in batch] mel_offsets = [np.random.randint(0, offset) for offset in max_offsets] - sig_offsets = [(offset + self.pad) * self.hop_len for offset in mel_offsets] + sig_offsets = [(offset + self.pad) * + self.hop_len for offset in mel_offsets] mels = [ - x[0][:, mel_offsets[i] : mel_offsets[i] + mel_win] + x[0][:, mel_offsets[i]: mel_offsets[i] + mel_win] for i, x in enumerate(batch) ] coarse = [ - x[1][sig_offsets[i] : sig_offsets[i] + self.seq_len + 1] + x[1][sig_offsets[i]: sig_offsets[i] + self.seq_len + 1] for i, x in enumerate(batch) ] @@ -79,7 +80,8 @@ class WaveRNNDataset(Dataset): coarse = np.stack(coarse).astype(np.int64) coarse = torch.LongTensor(coarse) x_input = ( - 2 * coarse[:, : self.seq_len].float() / (2 ** self.mode - 1.0) - 1.0 + 2 * coarse[:, : self.seq_len].float() / + (2 ** self.mode - 1.0) - 1.0 ) y_coarse = coarse[:, 1:] mels = torch.FloatTensor(mels) diff --git a/TTS/vocoder/models/wavernn.py b/TTS/vocoder/models/wavernn.py index 4d1a633c..9b151cac 100644 --- a/TTS/vocoder/models/wavernn.py +++ b/TTS/vocoder/models/wavernn.py @@ -39,7 +39,8 @@ class MelResNet(nn.Module): def __init__(self, res_blocks, in_dims, compute_dims, res_out_dims, pad): super().__init__() k_size = pad * 2 + 1 - self.conv_in = nn.Conv1d(in_dims, compute_dims, kernel_size=k_size, bias=False) + self.conv_in = nn.Conv1d( + in_dims, compute_dims, kernel_size=k_size, bias=False) self.batch_norm = nn.BatchNorm1d(compute_dims) self.layers = nn.ModuleList() for _ in range(res_blocks): @@ -94,7 +95,8 @@ class UpsampleNetwork(nn.Module): k_size = (1, scale * 2 + 1) padding = (0, scale) stretch = Stretch2d(scale, 1) - conv = nn.Conv2d(1, 1, kernel_size=k_size, padding=padding, bias=False) + conv = nn.Conv2d(1, 1, kernel_size=k_size, + padding=padding, bias=False) conv.weight.data.fill_(1.0 / k_size[1]) self.up_layers.append(stretch) self.up_layers.append(conv) @@ -110,7 +112,7 @@ class UpsampleNetwork(nn.Module): m = m.unsqueeze(1) for f in self.up_layers: m = f(m) - m = m.squeeze(1)[:, :, self.indent : -self.indent] + m = m.squeeze(1)[:, :, self.indent: -self.indent] return m.transpose(1, 2), aux @@ -123,7 +125,8 @@ class Upsample(nn.Module): self.pad = pad self.indent = pad * scale self.use_aux_net = use_aux_net - self.resnet = MelResNet(res_blocks, feat_dims, compute_dims, res_out_dims, pad) + self.resnet = MelResNet(res_blocks, feat_dims, + compute_dims, res_out_dims, pad) def forward(self, m): if self.use_aux_net: @@ -137,7 +140,7 @@ class Upsample(nn.Module): m = torch.nn.functional.interpolate( m, scale_factor=self.scale, mode="linear", align_corners=True ) - m = m[:, :, self.indent : -self.indent] + m = m[:, :, self.indent: -self.indent] m = m * 0.045 # empirically found return m.transpose(1, 2), aux @@ -207,7 +210,8 @@ class WaveRNN(nn.Module): if self.use_aux_net: self.I = nn.Linear(feat_dims + self.aux_dims + 1, rnn_dims) self.rnn1 = nn.GRU(rnn_dims, rnn_dims, batch_first=True) - self.rnn2 = nn.GRU(rnn_dims + self.aux_dims, rnn_dims, batch_first=True) + 
self.rnn2 = nn.GRU(rnn_dims + self.aux_dims, + rnn_dims, batch_first=True) self.fc1 = nn.Linear(rnn_dims + self.aux_dims, fc_dims) self.fc2 = nn.Linear(fc_dims + self.aux_dims, fc_dims) self.fc3 = nn.Linear(fc_dims, self.n_classes) @@ -221,16 +225,16 @@ class WaveRNN(nn.Module): def forward(self, x, mels): bsize = x.size(0) - h1 = torch.zeros(1, bsize, self.rnn_dims).cuda() - h2 = torch.zeros(1, bsize, self.rnn_dims).cuda() + h1 = torch.zeros(1, bsize, self.rnn_dims).to(x.device) + h2 = torch.zeros(1, bsize, self.rnn_dims).to(x.device) mels, aux = self.upsample(mels) if self.use_aux_net: aux_idx = [self.aux_dims * i for i in range(5)] - a1 = aux[:, :, aux_idx[0] : aux_idx[1]] - a2 = aux[:, :, aux_idx[1] : aux_idx[2]] - a3 = aux[:, :, aux_idx[2] : aux_idx[3]] - a4 = aux[:, :, aux_idx[3] : aux_idx[4]] + a1 = aux[:, :, aux_idx[0]: aux_idx[1]] + a2 = aux[:, :, aux_idx[1]: aux_idx[2]] + a3 = aux[:, :, aux_idx[2]: aux_idx[3]] + a4 = aux[:, :, aux_idx[3]: aux_idx[4]] x = ( torch.cat([x.unsqueeze(-1), mels, a1], dim=2) @@ -256,19 +260,21 @@ class WaveRNN(nn.Module): x = F.relu(self.fc2(x)) return self.fc3(x) - def generate(self, mels, batched, target, overlap): + def generate(self, mels, batched, target, overlap, use_cuda): self.eval() + device = 'cuda' if use_cuda else 'cpu' output = [] start = time.time() rnn1 = self.get_gru_cell(self.rnn1) rnn2 = self.get_gru_cell(self.rnn2) with torch.no_grad(): - - mels = torch.FloatTensor(mels).cuda().unsqueeze(0) + mels = torch.FloatTensor(mels).unsqueeze(0).to(device) + #mels = torch.FloatTensor(mels).cuda().unsqueeze(0) wave_len = (mels.size(-1) - 1) * self.hop_length - mels = self.pad_tensor(mels.transpose(1, 2), pad=self.pad, side="both") + mels = self.pad_tensor(mels.transpose( + 1, 2), pad=self.pad, side="both") mels, aux = self.upsample(mels.transpose(1, 2)) if batched: @@ -278,13 +284,13 @@ class WaveRNN(nn.Module): b_size, seq_len, _ = mels.size() - h1 = torch.zeros(b_size, self.rnn_dims).cuda() - h2 = torch.zeros(b_size, self.rnn_dims).cuda() - x = torch.zeros(b_size, 1).cuda() + h1 = torch.zeros(b_size, self.rnn_dims).to(device) + h2 = torch.zeros(b_size, self.rnn_dims).to(device) + x = torch.zeros(b_size, 1).to(device) if self.use_aux_net: d = self.aux_dims - aux_split = [aux[:, :, d * i : d * (i + 1)] for i in range(4)] + aux_split = [aux[:, :, d * i: d * (i + 1)] for i in range(4)] for i in range(seq_len): @@ -319,11 +325,12 @@ class WaveRNN(nn.Module): logits.unsqueeze(0).transpose(1, 2) ) output.append(sample.view(-1)) - x = sample.transpose(0, 1).cuda() + x = sample.transpose(0, 1).to(device) elif self.mode == "gauss": - sample = sample_from_gaussian(logits.unsqueeze(0).transpose(1, 2)) + sample = sample_from_gaussian( + logits.unsqueeze(0).transpose(1, 2)) output.append(sample.view(-1)) - x = sample.transpose(0, 1).cuda() + x = sample.transpose(0, 1).to(device) elif isinstance(self.mode, int): posterior = F.softmax(logits, dim=1) distrib = torch.distributions.Categorical(posterior) @@ -332,7 +339,8 @@ class WaveRNN(nn.Module): output.append(sample) x = sample.unsqueeze(-1) else: - raise RuntimeError("Unknown model mode value - ", self.mode) + raise RuntimeError( + "Unknown model mode value - ", self.mode) if i % 100 == 0: self.gen_display(i, seq_len, b_size, start) @@ -352,7 +360,7 @@ class WaveRNN(nn.Module): # Fade-out at the end to avoid signal cutting out suddenly fade_out = np.linspace(1, 0, 20 * self.hop_length) output = output[:wave_len] - output[-20 * self.hop_length :] *= fade_out + output[-20 * self.hop_length:] *= fade_out 
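         # restore training mode: generate() flips the model to eval() at the top
         # and is called from the train/eval loops to log audio samples, so the
         # training flag has to be switched back before returning.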
self.train() return output @@ -366,7 +374,6 @@ class WaveRNN(nn.Module): ) def fold_with_overlap(self, x, target, overlap): - """Fold the tensor with overlap for quick batched inference. Overlap will be used for crossfading in xfade_and_unfold() Args: @@ -398,7 +405,7 @@ class WaveRNN(nn.Module): padding = target + 2 * overlap - remaining x = self.pad_tensor(x, padding, side="after") - folded = torch.zeros(num_folds, target + 2 * overlap, features).cuda() + folded = torch.zeros(num_folds, target + 2 * overlap, features).to(x.device) # Get the values for the folded tensor for i in range(num_folds): @@ -423,16 +430,15 @@ class WaveRNN(nn.Module): # i.e., it won't generalise to other shapes/dims b, t, c = x.size() total = t + 2 * pad if side == "both" else t + pad - padded = torch.zeros(b, total, c).cuda() + padded = torch.zeros(b, total, c).to(x.device) if side in ("before", "both"): - padded[:, pad : pad + t, :] = x + padded[:, pad: pad + t, :] = x elif side == "after": padded[:, :t, :] = x return padded @staticmethod def xfade_and_unfold(y, target, overlap): - """Applies a crossfade and unfolds into a 1d array. Args: y (ndarry) : Batched sequences of audio samples From 4d5da4b663d7a2210a9fe4965ab942ad7557efb0 Mon Sep 17 00:00:00 2001 From: sanjaesc Date: Thu, 22 Oct 2020 13:22:50 +0200 Subject: [PATCH 13/16] fix travis + pylint tests --- .travis/script | 3 ++- TTS/vocoder/models/wavernn.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/.travis/script b/.travis/script index 0c24a221..0860f9cf 100755 --- a/.travis/script +++ b/.travis/script @@ -17,5 +17,6 @@ fi if [[ "$TEST_SUITE" == "testscripts" ]]; then # test model training scripts ./tests/test_tts_train.sh - ./tests/test_vocoder_train.sh + ./tests/test_vocoder_gan_train.sh + ./tests/test_vocoder_wavernn_train.sh fi diff --git a/TTS/vocoder/models/wavernn.py b/TTS/vocoder/models/wavernn.py index 9b151cac..8a45d9e3 100644 --- a/TTS/vocoder/models/wavernn.py +++ b/TTS/vocoder/models/wavernn.py @@ -225,7 +225,7 @@ class WaveRNN(nn.Module): def forward(self, x, mels): bsize = x.size(0) - h1 = torch.zeros(1, bsize, self.rnn_dims).to(x.device) + h1 = torch.zeros(1, bsize, self.rnn_dims).to(x.device) h2 = torch.zeros(1, bsize, self.rnn_dims).to(x.device) mels, aux = self.upsample(mels) From 4a989e3cebf68ef9ae2ab4f675fcfbbeb983288a Mon Sep 17 00:00:00 2001 From: sanjaesc Date: Sun, 25 Oct 2020 09:45:37 +0100 Subject: [PATCH 14/16] compute audio feat on dataload --- TTS/bin/train_wavernn_vocoder.py | 175 ++++++++++++------------ TTS/vocoder/configs/wavernn_config.json | 143 +++++++++---------- TTS/vocoder/datasets/wavernn_dataset.py | 68 ++++++--- TTS/vocoder/models/wavernn.py | 60 ++++---- 4 files changed, 243 insertions(+), 203 deletions(-) diff --git a/TTS/bin/train_wavernn_vocoder.py b/TTS/bin/train_wavernn_vocoder.py index 66a7c913..91a62cbe 100644 --- a/TTS/bin/train_wavernn_vocoder.py +++ b/TTS/bin/train_wavernn_vocoder.py @@ -29,8 +29,8 @@ from TTS.utils.generic_utils import ( from TTS.vocoder.datasets.wavernn_dataset import WaveRNNDataset from TTS.vocoder.datasets.preprocess import ( find_feat_files, - load_wav_feat_data, - preprocess_wav_files, + load_wav_data, + load_wav_feat_data ) from TTS.vocoder.utils.distribution import discretized_mix_logistic_loss, gaussian_loss from TTS.vocoder.utils.generic_utils import setup_wavernn @@ -41,15 +41,16 @@ use_cuda, num_gpus = setup_torch_training_env(True, True) def setup_loader(ap, is_val=False, verbose=False): - if is_val and not CONFIG.run_eval: + if is_val and not 
c.run_eval: loader = None else: dataset = WaveRNNDataset(ap=ap, items=eval_data if is_val else train_data, - seq_len=CONFIG.seq_len, + seq_len=c.seq_len, hop_len=ap.hop_length, - pad=CONFIG.padding, - mode=CONFIG.mode, + pad=c.padding, + mode=c.mode, + mulaw=c.mulaw, is_training=not is_val, verbose=verbose, ) @@ -57,10 +58,10 @@ def setup_loader(ap, is_val=False, verbose=False): loader = DataLoader(dataset, shuffle=True, collate_fn=dataset.collate, - batch_size=CONFIG.batch_size, - num_workers=CONFIG.num_val_loader_workers + batch_size=c.batch_size, + num_workers=c.num_val_loader_workers if is_val - else CONFIG.num_loader_workers, + else c.num_loader_workers, pin_memory=True, ) return loader @@ -89,9 +90,9 @@ def train(model, optimizer, criterion, scheduler, ap, global_step, epoch): keep_avg = KeepAverage() if use_cuda: batch_n_iter = int(len(data_loader.dataset) / - (CONFIG.batch_size * num_gpus)) + (c.batch_size * num_gpus)) else: - batch_n_iter = int(len(data_loader.dataset) / CONFIG.batch_size) + batch_n_iter = int(len(data_loader.dataset) / c.batch_size) end_time = time.time() c_logger.print_train_start() # train loop @@ -102,9 +103,6 @@ def train(model, optimizer, criterion, scheduler, ap, global_step, epoch): loader_time = time.time() - end_time global_step += 1 - ################## - # MODEL TRAINING # - ################## y_hat = model(x_input, mels) if isinstance(model.mode, int): @@ -112,7 +110,6 @@ def train(model, optimizer, criterion, scheduler, ap, global_step, epoch): else: y_coarse = y_coarse.float() y_coarse = y_coarse.unsqueeze(-1) - # m_scaled, _ = model.upsample(m) # compute losses loss = criterion(y_hat, y_coarse) @@ -120,11 +117,11 @@ def train(model, optimizer, criterion, scheduler, ap, global_step, epoch): raise RuntimeError(" [!] None loss. 
Exiting ...") optimizer.zero_grad() loss.backward() - if CONFIG.grad_clip > 0: + if c.grad_clip > 0: torch.nn.utils.clip_grad_norm_( - model.parameters(), CONFIG.grad_clip) - + model.parameters(), c.grad_clip) optimizer.step() + if scheduler is not None: scheduler.step() @@ -144,7 +141,7 @@ def train(model, optimizer, criterion, scheduler, ap, global_step, epoch): keep_avg.update_values(update_train_values) # print training stats - if global_step % CONFIG.print_step == 0: + if global_step % c.print_step == 0: log_dict = {"step_time": [step_time, 2], "loader_time": [loader_time, 4], "current_lr": cur_lr, @@ -164,8 +161,8 @@ def train(model, optimizer, criterion, scheduler, ap, global_step, epoch): tb_logger.tb_train_iter_stats(global_step, iter_stats) # save checkpoint - if global_step % CONFIG.save_step == 0: - if CONFIG.checkpoint: + if global_step % c.save_step == 0: + if c.checkpoint: # save model save_checkpoint(model, optimizer, @@ -180,28 +177,30 @@ def train(model, optimizer, criterion, scheduler, ap, global_step, epoch): ) # synthesize a full voice - wav_path = train_data[random.randrange(0, len(train_data))][0] + rand_idx = random.randrange(0, len(train_data)) + wav_path = train_data[rand_idx] if not isinstance( + train_data[rand_idx], (tuple, list)) else train_data[rand_idx][0] wav = ap.load_wav(wav_path) ground_mel = ap.melspectrogram(wav) sample_wav = model.generate(ground_mel, - CONFIG.batched, - CONFIG.target_samples, - CONFIG.overlap_samples, + c.batched, + c.target_samples, + c.overlap_samples, + use_cuda ) predict_mel = ap.melspectrogram(sample_wav) # compute spectrograms figures = {"train/ground_truth": plot_spectrogram(ground_mel.T), - "train/prediction": plot_spectrogram(predict_mel.T), + "train/prediction": plot_spectrogram(predict_mel.T) } + tb_logger.tb_train_figures(global_step, figures) # Sample audio tb_logger.tb_train_audios( global_step, { - "train/audio": sample_wav}, CONFIG.audio["sample_rate"] + "train/audio": sample_wav}, c.audio["sample_rate"] ) - - tb_logger.tb_train_figures(global_step, figures) end_time = time.time() # print epoch stats @@ -259,34 +258,35 @@ def evaluate(model, criterion, ap, global_step, epoch): keep_avg.update_values(update_eval_values) # print eval stats - if CONFIG.print_eval: + if c.print_eval: c_logger.print_eval_step( num_iter, loss_dict, keep_avg.avg_values) - if epoch % CONFIG.test_every_epochs == 0 and epoch != 0: - # synthesize a part of data - wav_path = eval_data[random.randrange(0, len(eval_data))][0] + if epoch % c.test_every_epochs == 0 and epoch != 0: + # synthesize a full voice + rand_idx = random.randrange(0, len(eval_data)) + wav_path = eval_data[rand_idx] if not isinstance( + eval_data[rand_idx], (tuple, list)) else eval_data[rand_idx][0] wav = ap.load_wav(wav_path) - ground_mel = ap.melspectrogram(wav[:22000]) + ground_mel = ap.melspectrogram(wav) sample_wav = model.generate(ground_mel, - CONFIG.batched, - CONFIG.target_samples, - CONFIG.overlap_samples, + c.batched, + c.target_samples, + c.overlap_samples, use_cuda ) predict_mel = ap.melspectrogram(sample_wav) - # compute spectrograms - figures = {"eval/ground_truth": plot_spectrogram(ground_mel.T), - "eval/prediction": plot_spectrogram(predict_mel.T), - } - # Sample audio tb_logger.tb_eval_audios( global_step, { - "eval/audio": sample_wav}, CONFIG.audio["sample_rate"] + "eval/audio": sample_wav}, c.audio["sample_rate"] ) + # compute spectrograms + figures = {"eval/ground_truth": plot_spectrogram(ground_mel.T), + "eval/prediction": plot_spectrogram(predict_mel.T) + 
} tb_logger.tb_eval_figures(global_step, figures) tb_logger.tb_eval_stats(global_step, keep_avg.avg_values) @@ -299,53 +299,62 @@ def main(args): # pylint: disable=redefined-outer-name global train_data, eval_data # setup audio processor - ap = AudioProcessor(**CONFIG.audio) + ap = AudioProcessor(**c.audio) - print(f" > Loading wavs from: {CONFIG.data_path}") - if CONFIG.feature_path is not None: - print(f" > Loading features from: {CONFIG.feature_path}") + # print(f" > Loading wavs from: {c.data_path}") + # if c.feature_path is not None: + # print(f" > Loading features from: {c.feature_path}") + # eval_data, train_data = load_wav_feat_data( + # c.data_path, c.feature_path, c.eval_split_size + # ) + # else: + # mel_feat_path = os.path.join(OUT_PATH, "mel") + # feat_data = find_feat_files(mel_feat_path) + # if feat_data: + # print(f" > Loading features from: {mel_feat_path}") + # eval_data, train_data = load_wav_feat_data( + # c.data_path, mel_feat_path, c.eval_split_size + # ) + # else: + # print(" > No feature data found. Preprocessing...") + # # preprocessing feature data from given wav files + # preprocess_wav_files(OUT_PATH, CONFIG, ap) + # eval_data, train_data = load_wav_feat_data( + # c.data_path, mel_feat_path, c.eval_split_size + # ) + + print(f" > Loading wavs from: {c.data_path}") + if c.feature_path is not None: + print(f" > Loading features from: {c.feature_path}") eval_data, train_data = load_wav_feat_data( - CONFIG.data_path, CONFIG.feature_path, CONFIG.eval_split_size - ) + c.data_path, c.feature_path, c.eval_split_size) else: - mel_feat_path = os.path.join(OUT_PATH, "mel") - feat_data = find_feat_files(mel_feat_path) - if feat_data: - print(f" > Loading features from: {mel_feat_path}") - eval_data, train_data = load_wav_feat_data( - CONFIG.data_path, mel_feat_path, CONFIG.eval_split_size - ) - else: - print(" > No feature data found. Preprocessing...") - # preprocessing feature data from given wav files - preprocess_wav_files(OUT_PATH, CONFIG, ap) - eval_data, train_data = load_wav_feat_data( - CONFIG.data_path, mel_feat_path, CONFIG.eval_split_size - ) + eval_data, train_data = load_wav_data( + c.data_path, c.eval_split_size) # setup model - model_wavernn = setup_wavernn(CONFIG) + model_wavernn = setup_wavernn(c) # define train functions - if CONFIG.mode == "mold": + if c.mode == "mold": criterion = discretized_mix_logistic_loss - elif CONFIG.mode == "gauss": + elif c.mode == "gauss": criterion = gaussian_loss - elif isinstance(CONFIG.mode, int): + elif isinstance(c.mode, int): criterion = torch.nn.CrossEntropyLoss() if use_cuda: model_wavernn.cuda() - if isinstance(CONFIG.mode, int): + if isinstance(c.mode, int): criterion.cuda() - optimizer = RAdam(model_wavernn.parameters(), lr=CONFIG.lr, weight_decay=0) + optimizer = RAdam(model_wavernn.parameters(), lr=c.lr, weight_decay=0) scheduler = None - if "lr_scheduler" in CONFIG: - scheduler = getattr(torch.optim.lr_scheduler, CONFIG.lr_scheduler) - scheduler = scheduler(optimizer, **CONFIG.lr_scheduler_params) + if "lr_scheduler" in c: + scheduler = getattr(torch.optim.lr_scheduler, c.lr_scheduler) + scheduler = scheduler(optimizer, **c.lr_scheduler_params) # slow start for the first 5 epochs - # lr_lambda = lambda epoch: min(epoch / CONFIG.warmup_steps, 1) + # lr_lambda = lambda epoch: min(epoch / c.warmup_steps, 1) # scheduler = optim.lr_scheduler.LambdaLR(optimizer, lr_lambda) # restore any checkpoint @@ -366,7 +375,7 @@ def main(args): # pylint: disable=redefined-outer-name # retore only matching layers. 
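             # only checkpoint tensors whose names and sizes still match the
             # current model are kept; e.g. changing `mode` (and with it the size
             # of the output layer) leaves that layer freshly initialized.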
print(" > Partial model initialization...") model_dict = model_wavernn.state_dict() - model_dict = set_init_dict(model_dict, checkpoint["model"], CONFIG) + model_dict = set_init_dict(model_dict, checkpoint["model"], c) model_wavernn.load_state_dict(model_dict) print(" > Model restored from step %d" % @@ -386,11 +395,10 @@ def main(args): # pylint: disable=redefined-outer-name best_loss = float("inf") global_step = args.restore_step - for epoch in range(0, CONFIG.epochs): - c_logger.print_epoch_start(epoch, CONFIG.epochs) - _, global_step = train( - model_wavernn, optimizer, criterion, scheduler, ap, global_step, epoch - ) + for epoch in range(0, c.epochs): + c_logger.print_epoch_start(epoch, c.epochs) + _, global_step = train(model_wavernn, optimizer, + criterion, scheduler, ap, global_step, epoch) eval_avg_loss_dict = evaluate( model_wavernn, criterion, ap, global_step, epoch) c_logger.print_epoch_end(epoch, eval_avg_loss_dict) @@ -462,14 +470,14 @@ if __name__ == "__main__": print(f" > Training continues for {args.restore_path}") # setup output paths and read configs - CONFIG = load_config(args.config_path) + c = load_config(args.config_path) # check_config(c) _ = os.path.dirname(os.path.realpath(__file__)) OUT_PATH = args.continue_path if args.continue_path == "": OUT_PATH = create_experiment_folder( - CONFIG.output_path, CONFIG.run_name, args.debug + c.output_path, c.run_name, args.debug ) AUDIO_PATH = os.path.join(OUT_PATH, "test_audios") @@ -483,7 +491,7 @@ if __name__ == "__main__": new_fields["restore_path"] = args.restore_path new_fields["github_branch"] = get_git_branch() copy_config_file( - args.config_path, os.path.join(OUT_PATH, "config.json"), new_fields + args.config_path, os.path.join(OUT_PATH, "c.json"), new_fields ) os.chmod(AUDIO_PATH, 0o775) os.chmod(OUT_PATH, 0o775) @@ -492,8 +500,7 @@ if __name__ == "__main__": tb_logger = TensorboardLogger(LOG_DIR, model_name="VOCODER") # write model desc to tensorboard - tb_logger.tb_add_text("model-description", - CONFIG["run_description"], 0) + tb_logger.tb_add_text("model-description", c["run_description"], 0) try: main(args) diff --git a/TTS/vocoder/configs/wavernn_config.json b/TTS/vocoder/configs/wavernn_config.json index 8e6a8c32..9a9fbdae 100644 --- a/TTS/vocoder/configs/wavernn_config.json +++ b/TTS/vocoder/configs/wavernn_config.json @@ -1,94 +1,97 @@ { "run_name": "wavernn_test", "run_description": "wavernn_test training", - - // AUDIO PARAMETERS - "audio":{ - "fft_size": 1024, // number of stft frequency levels. Size of the linear spectogram frame. - "win_length": 1024, // stft window length in ms. - "hop_length": 256, // stft window hop-lengh in ms. + +// AUDIO PARAMETERS + "audio": { + "fft_size": 1024, // number of stft frequency levels. Size of the linear spectogram frame. + "win_length": 1024, // stft window length in ms. + "hop_length": 256, // stft window hop-lengh in ms. "frame_length_ms": null, // stft window length in ms.If null, 'win_length' is used. - "frame_shift_ms": null, // stft window hop-lengh in ms. If null, 'hop_length' is used. - + "frame_shift_ms": null, // stft window hop-lengh in ms. If null, 'hop_length' is used. // Audio processing parameters - "sample_rate": 22050, // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled. - "preemphasis": 0.98, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis. - "ref_level_db": 20, // reference level db, theoretically 20db is the sound of air. 
- + "sample_rate": 22050, // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled. + "preemphasis": 0.98, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis. + "ref_level_db": 20, // reference level db, theoretically 20db is the sound of air. // Silence trimming - "do_trim_silence": false,// enable trimming of slience of audio as you load it. LJspeech (false), TWEB (false), Nancy (true) - "trim_db": 60, // threshold for timming silence. Set this according to your dataset. - + "do_trim_silence": false, // enable trimming of slience of audio as you load it. LJspeech (false), TWEB (false), Nancy (true) + "trim_db": 60, // threshold for timming silence. Set this according to your dataset. // MelSpectrogram parameters - "num_mels": 80, // size of the mel spec frame. - "mel_fmin": 40.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!! - "mel_fmax": 8000.0, // maximum freq level for mel-spec. Tune for dataset!! - "spec_gain": 20.0, // scaler value appplied after log transform of spectrogram. - + "num_mels": 80, // size of the mel spec frame. + "mel_fmin": 40.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!! + "mel_fmax": 8000.0, // maximum freq level for mel-spec. Tune for dataset!! + "spec_gain": 20.0, // scaler value appplied after log transform of spectrogram. // Normalization parameters - "signal_norm": true, // normalize spec values. Mean-Var normalization if 'stats_path' is defined otherwise range normalization defined by the other params. - "min_level_db": -100, // lower bound for normalization + "signal_norm": true, // normalize spec values. Mean-Var normalization if 'stats_path' is defined otherwise range normalization defined by the other params. + "min_level_db": -100, // lower bound for normalization "symmetric_norm": true, // move normalization to range [-1, 1] - "max_norm": 4.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm] - "clip_norm": true, // clip normalized values into the range. - "stats_path": null // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based notmalization is used and other normalization params are ignored + "max_norm": 4.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm] + "clip_norm": true, // clip normalized values into the range. + "stats_path": null // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. 
If it is defined, mean-std based notmalization is used and other normalization params are ignored }, - - // Generating / Synthesizing - "batched": true, - "target_samples": 11000, // target number of samples to be generated in each batch entry - "overlap_samples": 550, // number of samples for crossfading between batches - + +// Generating / Synthesizing + "batched": true, + "target_samples": 11000, // target number of samples to be generated in each batch entry + "overlap_samples": 550, // number of samples for crossfading between batches // DISTRIBUTED TRAINING // "distributed":{ // "backend": "nccl", // "url": "tcp:\/\/localhost:54321" // }, - - // MODEL PARAMETERS - "use_aux_net": true, - "use_upsample_net": true, - "upsample_factors": [4, 8, 8], // this needs to correctly factorise hop_length - "seq_len": 1280, // has to be devideable by hop_length - "mode": "mold", // mold [string], gauss [string], bits [int] - "mulaw": false, // apply mulaw if mode is bits - "padding": 2, // pad the input for resnet to see wider input length - // DATASET - //"use_gta": true, // use computed gta features from the tts model - "data_path": "path/to/wav/files", // path containing training wav files - "feature_path": null, // path containing computed features from wav files if null compute them +// MODEL MODE + "mode": 10, // mold [string], gauss [string], bits [int] + "mulaw": true, // apply mulaw if mode is bits + +// MODEL PARAMETERS + "wavernn_model_params": { + "rnn_dims": 512, + "fc_dims": 512, + "compute_dims": 128, + "res_out_dims": 128, + "num_res_blocks": 10, + "use_aux_net": true, + "use_upsample_net": true, + "upsample_factors": [4, 8, 8] // this needs to correctly factorise hop_length + }, + +// DATASET + //"use_gta": true, // use computed gta features from the tts model + "data_path": "/media/alexander/LinuxFS/SpeechData/GothicSpeech/NPC_Speech", // path containing training wav files + "feature_path": null, // path containing computed features from wav files if null compute them + "seq_len": 1280, // has to be devideable by hop_length + "padding": 2, // pad the input for resnet to see wider input length + +// TRAINING + "batch_size": 64, // Batch size for training. + "epochs": 10000, // total number of epochs to train. - // TRAINING - "batch_size": 64, // Batch size for training. Lower values than 32 might cause hard to learn attention. - "epochs": 10000, // total number of epochs to train. - - // VALIDATION +// VALIDATION "run_eval": true, - "test_every_epochs": 10, // Test after set number of epochs (Test every 20 epochs for example) - - // OPTIMIZER - "grad_clip": 4, // apply gradient clipping if > 0 - "lr_scheduler": "MultiStepLR", // one of the schedulers from https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate + "test_every_epochs": 10, // Test after set number of epochs (Test every 10 epochs for example) + +// OPTIMIZER + "grad_clip": 4, // apply gradient clipping if > 0 + "lr_scheduler": "MultiStepLR", // one of the schedulers from https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate "lr_scheduler_params": { "gamma": 0.5, "milestones": [200000, 400000, 600000] }, - "lr": 1e-4, // initial learning rate - - // TENSORBOARD and LOGGING - "print_step": 25, // Number of steps to log traning on console. - "print_eval": false, // If True, it prints loss values for each step in eval run. - "save_step": 25000, // Number of training steps expected to plot training stats on TB and save model checkpoints. 
- "checkpoint": true, // If true, it saves checkpoints per "save_step" - "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging. - - // DATA LOADING - "num_loader_workers": 4, // number of training data loader processes. Don't set it too big. 4-8 are good values. - "num_val_loader_workers": 4, // number of evaluation data loader processes. - "eval_split_size": 50, // number of samples for testing - - // PATHS + "lr": 1e-4, // initial learning rate + +// TENSORBOARD and LOGGING + "print_step": 25, // Number of steps to log traning on console. + "print_eval": false, // If True, it prints loss values for each step in eval run. + "save_step": 25000, // Number of training steps expected to plot training stats on TB and save model checkpoints. + "checkpoint": true, // If true, it saves checkpoints per "save_step" + "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging. + +// DATA LOADING + "num_loader_workers": 4, // number of training data loader processes. Don't set it too big. 4-8 are good values. + "num_val_loader_workers": 4, // number of evaluation data loader processes. + "eval_split_size": 50, // number of samples for testing + +// PATHS "output_path": "output/training/path" } - diff --git a/TTS/vocoder/datasets/wavernn_dataset.py b/TTS/vocoder/datasets/wavernn_dataset.py index 194344a9..3dbb2194 100644 --- a/TTS/vocoder/datasets/wavernn_dataset.py +++ b/TTS/vocoder/datasets/wavernn_dataset.py @@ -1,11 +1,13 @@ import torch import numpy as np from torch.utils.data import Dataset +from multiprocessing import Manager class WaveRNNDataset(Dataset): """ - WaveRNN Dataset searchs for all the wav files under root path. + WaveRNN Dataset searchs for all the wav files under root path + and converts them to acoustic features on the fly. """ def __init__(self, @@ -15,16 +17,19 @@ class WaveRNNDataset(Dataset): hop_len, pad, mode, + mulaw, is_training=True, verbose=False, ): self.ap = ap + self.compute_feat = not isinstance(items[0], (tuple, list)) self.item_list = items self.seq_len = seq_len self.hop_len = hop_len self.pad = pad self.mode = mode + self.mulaw = mulaw self.is_training = is_training self.verbose = verbose @@ -36,22 +41,47 @@ class WaveRNNDataset(Dataset): return item def load_item(self, index): - wavpath, feat_path = self.item_list[index] - m = np.load(feat_path.replace("/quant/", "/mel/")) - # x = self.wav_cache[index] - if m.shape[-1] < 5: - print(" [!] Instance is too short! : {}".format(wavpath)) - self.item_list[index] = self.item_list[index + 1] - feat_path = self.item_list[index] - m = np.load(feat_path.replace("/quant/", "/mel/")) - if self.mode in ["gauss", "mold"]: - # x = np.load(feat_path.replace("/mel/", "/quant/")) - x = self.ap.load_wav(wavpath) - elif isinstance(self.mode, int): - x = np.load(feat_path.replace("/mel/", "/quant/")) + """ + load (audio, feat) couple if feature_path is set + else compute it on the fly + """ + if self.compute_feat: + + wavpath = self.item_list[index] + audio = self.ap.load_wav(wavpath) + mel = self.ap.melspectrogram(audio) + + if mel.shape[-1] < 5: + print(" [!] Instance is too short! 
: {}".format(wavpath)) + self.item_list[index] = self.item_list[index + 1] + audio = self.ap.load_wav(wavpath) + mel = self.ap.melspectrogram(audio) + if self.mode in ["gauss", "mold"]: + x_input = audio + elif isinstance(self.mode, int): + x_input = (self.ap.mulaw_encode(audio, qc=self.mode) + if self.mulaw else self.ap.quantize(audio, bits=self.mode)) + else: + raise RuntimeError("Unknown dataset mode - ", self.mode) + else: - raise RuntimeError("Unknown dataset mode - ", self.mode) - return m, x + + wavpath, feat_path = self.item_list[index] + mel = np.load(feat_path.replace("/quant/", "/mel/")) + + if mel.shape[-1] < 5: + print(" [!] Instance is too short! : {}".format(wavpath)) + self.item_list[index] = self.item_list[index + 1] + feat_path = self.item_list[index] + mel = np.load(feat_path.replace("/quant/", "/mel/")) + if self.mode in ["gauss", "mold"]: + x_input = self.ap.load_wav(wavpath) + elif isinstance(self.mode, int): + x_input = np.load(feat_path.replace("/mel/", "/quant/")) + else: + raise RuntimeError("Unknown dataset mode - ", self.mode) + + return mel, x_input def collate(self, batch): mel_win = self.seq_len // self.hop_len + 2 * self.pad @@ -79,10 +109,8 @@ class WaveRNNDataset(Dataset): elif isinstance(self.mode, int): coarse = np.stack(coarse).astype(np.int64) coarse = torch.LongTensor(coarse) - x_input = ( - 2 * coarse[:, : self.seq_len].float() / - (2 ** self.mode - 1.0) - 1.0 - ) + x_input = (2 * coarse[:, : self.seq_len].float() / + (2 ** self.mode - 1.0) - 1.0) y_coarse = coarse[:, 1:] mels = torch.FloatTensor(mels) return x_input, mels, y_coarse diff --git a/TTS/vocoder/models/wavernn.py b/TTS/vocoder/models/wavernn.py index 8a45d9e3..f771175c 100644 --- a/TTS/vocoder/models/wavernn.py +++ b/TTS/vocoder/models/wavernn.py @@ -36,14 +36,14 @@ class ResBlock(nn.Module): class MelResNet(nn.Module): - def __init__(self, res_blocks, in_dims, compute_dims, res_out_dims, pad): + def __init__(self, num_res_blocks, in_dims, compute_dims, res_out_dims, pad): super().__init__() k_size = pad * 2 + 1 self.conv_in = nn.Conv1d( in_dims, compute_dims, kernel_size=k_size, bias=False) self.batch_norm = nn.BatchNorm1d(compute_dims) self.layers = nn.ModuleList() - for _ in range(res_blocks): + for _ in range(num_res_blocks): self.layers.append(ResBlock(compute_dims)) self.conv_out = nn.Conv1d(compute_dims, res_out_dims, kernel_size=1) @@ -76,7 +76,7 @@ class UpsampleNetwork(nn.Module): feat_dims, upsample_scales, compute_dims, - res_blocks, + num_res_blocks, res_out_dims, pad, use_aux_net, @@ -87,7 +87,7 @@ class UpsampleNetwork(nn.Module): self.use_aux_net = use_aux_net if use_aux_net: self.resnet = MelResNet( - res_blocks, feat_dims, compute_dims, res_out_dims, pad + num_res_blocks, feat_dims, compute_dims, res_out_dims, pad ) self.resnet_stretch = Stretch2d(self.total_scale, 1) self.up_layers = nn.ModuleList() @@ -118,14 +118,14 @@ class UpsampleNetwork(nn.Module): class Upsample(nn.Module): def __init__( - self, scale, pad, res_blocks, feat_dims, compute_dims, res_out_dims, use_aux_net + self, scale, pad, num_res_blocks, feat_dims, compute_dims, res_out_dims, use_aux_net ): super().__init__() self.scale = scale self.pad = pad self.indent = pad * scale self.use_aux_net = use_aux_net - self.resnet = MelResNet(res_blocks, feat_dims, + self.resnet = MelResNet(num_res_blocks, feat_dims, compute_dims, res_out_dims, pad) def forward(self, m): @@ -147,23 +147,22 @@ class Upsample(nn.Module): class WaveRNN(nn.Module): - def __init__( - self, - rnn_dims, - fc_dims, - mode, - mulaw, - pad, 
- use_aux_net, - use_upsample_net, - upsample_factors, - feat_dims, - compute_dims, - res_out_dims, - res_blocks, - hop_length, - sample_rate, - ): + def __init__(self, + rnn_dims, + fc_dims, + mode, + mulaw, + pad, + use_aux_net, + use_upsample_net, + upsample_factors, + feat_dims, + compute_dims, + res_out_dims, + num_res_blocks, + hop_length, + sample_rate, + ): super().__init__() self.mode = mode self.mulaw = mulaw @@ -177,7 +176,7 @@ class WaveRNN(nn.Module): elif self.mode == "gauss": self.n_classes = 2 else: - raise RuntimeError(" > Unknown training mode") + raise RuntimeError("Unknown model mode value - ", self.mode) self.rnn_dims = rnn_dims self.aux_dims = res_out_dims // 4 @@ -192,7 +191,7 @@ class WaveRNN(nn.Module): feat_dims, upsample_factors, compute_dims, - res_blocks, + num_res_blocks, res_out_dims, pad, use_aux_net, @@ -201,7 +200,7 @@ class WaveRNN(nn.Module): self.upsample = Upsample( hop_length, pad, - res_blocks, + num_res_blocks, feat_dims, compute_dims, res_out_dims, @@ -260,7 +259,7 @@ class WaveRNN(nn.Module): x = F.relu(self.fc2(x)) return self.fc3(x) - def generate(self, mels, batched, target, overlap, use_cuda): + def generate(self, mels, batched, target, overlap, use_cuda=False): self.eval() device = 'cuda' if use_cuda else 'cpu' @@ -360,7 +359,9 @@ class WaveRNN(nn.Module): # Fade-out at the end to avoid signal cutting out suddenly fade_out = np.linspace(1, 0, 20 * self.hop_length) output = output[:wave_len] - output[-20 * self.hop_length:] *= fade_out + + if wave_len > len(fade_out): + output[-20 * self.hop_length:] *= fade_out self.train() return output @@ -405,7 +406,8 @@ class WaveRNN(nn.Module): padding = target + 2 * overlap - remaining x = self.pad_tensor(x, padding, side="after") - folded = torch.zeros(num_folds, target + 2 * overlap, features).to(x.device) + folded = torch.zeros(num_folds, target + 2 * + overlap, features).to(x.device) # Get the values for the folded tensor for i in range(num_folds): From 80f5e39e56fe862eba0248398d4c755232a70d60 Mon Sep 17 00:00:00 2001 From: sanjaesc Date: Sun, 25 Oct 2020 09:47:04 +0100 Subject: [PATCH 15/16] add model params to config --- TTS/vocoder/utils/generic_utils.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/TTS/vocoder/utils/generic_utils.py b/TTS/vocoder/utils/generic_utils.py index c16fa1ae..f9fbba52 100644 --- a/TTS/vocoder/utils/generic_utils.py +++ b/TTS/vocoder/utils/generic_utils.py @@ -47,18 +47,18 @@ def setup_wavernn(c): MyModel = importlib.import_module("TTS.vocoder.models.wavernn") MyModel = getattr(MyModel, "WaveRNN") model = MyModel( - rnn_dims=512, - fc_dims=512, + rnn_dims=c.wavernn_model_params['rnn_dims'], + fc_dims=c.wavernn_model_params['fc_dims'], mode=c.mode, mulaw=c.mulaw, pad=c.padding, - use_aux_net=c.use_aux_net, - use_upsample_net=c.use_upsample_net, - upsample_factors=c.upsample_factors, - feat_dims=80, - compute_dims=128, - res_out_dims=128, - res_blocks=10, + use_aux_net=c.wavernn_model_params['use_aux_net'], + use_upsample_net=c.wavernn_model_params['use_upsample_net'], + upsample_factors=c.wavernn_model_params['upsample_factors'], + feat_dims=c.audio['num_mels'], + compute_dims=c.wavernn_model_params['compute_dims'], + res_out_dims=c.wavernn_model_params['res_out_dims'], + num_res_blocks=c.wavernn_model_params['num_res_blocks'], hop_length=c.audio["hop_length"], sample_rate=c.audio["sample_rate"], ) From d158ec0806d545d7a053542670e0c02969c89503 Mon Sep 17 00:00:00 2001 From: sanjaesc Date: Sun, 25 Oct 2020 10:04:24 +0100 Subject: 
[PATCH 16/16] fix pylint once again --- TTS/bin/train_wavernn_vocoder.py | 1 - TTS/vocoder/datasets/wavernn_dataset.py | 1 - tests/test_vocoder_wavernn.py | 2 +- tests/test_vocoder_wavernn_datasets.py | 17 +++++++++-------- 4 files changed, 10 insertions(+), 11 deletions(-) diff --git a/TTS/bin/train_wavernn_vocoder.py b/TTS/bin/train_wavernn_vocoder.py index 91a62cbe..61664a65 100644 --- a/TTS/bin/train_wavernn_vocoder.py +++ b/TTS/bin/train_wavernn_vocoder.py @@ -28,7 +28,6 @@ from TTS.utils.generic_utils import ( ) from TTS.vocoder.datasets.wavernn_dataset import WaveRNNDataset from TTS.vocoder.datasets.preprocess import ( - find_feat_files, load_wav_data, load_wav_feat_data ) diff --git a/TTS/vocoder/datasets/wavernn_dataset.py b/TTS/vocoder/datasets/wavernn_dataset.py index 3dbb2194..9c1ded96 100644 --- a/TTS/vocoder/datasets/wavernn_dataset.py +++ b/TTS/vocoder/datasets/wavernn_dataset.py @@ -1,7 +1,6 @@ import torch import numpy as np from torch.utils.data import Dataset -from multiprocessing import Manager class WaveRNNDataset(Dataset): diff --git a/tests/test_vocoder_wavernn.py b/tests/test_vocoder_wavernn.py index fdb338f9..ccd71c56 100644 --- a/tests/test_vocoder_wavernn.py +++ b/tests/test_vocoder_wavernn.py @@ -17,7 +17,7 @@ def test_wavernn(): feat_dims=80, compute_dims=128, res_out_dims=128, - res_blocks=10, + num_res_blocks=10, hop_length=256, sample_rate=22050, ) diff --git a/tests/test_vocoder_wavernn_datasets.py b/tests/test_vocoder_wavernn_datasets.py index 0f4e939a..a95e247a 100644 --- a/tests/test_vocoder_wavernn_datasets.py +++ b/tests/test_vocoder_wavernn_datasets.py @@ -23,7 +23,7 @@ test_quant_feat_path = os.path.join(test_data_path, "quant") ok_ljspeech = os.path.exists(test_data_path) -def wavernn_dataset_case(batch_size, seq_len, hop_len, pad, mode, num_workers): +def wavernn_dataset_case(batch_size, seq_len, hop_len, pad, mode, mulaw, num_workers): """ run dataloader with given parameters and check conditions """ ap = AudioProcessor(**C.audio) @@ -42,6 +42,7 @@ def wavernn_dataset_case(batch_size, seq_len, hop_len, pad, mode, num_workers): hop_len=hop_len, pad=pad, mode=mode, + mulaw=mulaw ) # sampler = DistributedSampler(dataset) if num_gpus > 1 else None loader = DataLoader(dataset, @@ -78,13 +79,13 @@ def wavernn_dataset_case(batch_size, seq_len, hop_len, pad, mode, num_workers): def test_parametrized_wavernn_dataset(): ''' test dataloader with different parameters ''' params = [ - [16, C.audio['hop_length'] * 10, C.audio['hop_length'], 2, 10, 0], - [16, C.audio['hop_length'] * 10, C.audio['hop_length'], 2, "mold", 4], - [1, C.audio['hop_length'] * 10, C.audio['hop_length'], 2, 9, 0], - [1, C.audio['hop_length'], C.audio['hop_length'], 2, 10, 0], - [1, C.audio['hop_length'], C.audio['hop_length'], 2, "mold", 0], - [1, C.audio['hop_length'] * 5, C.audio['hop_length'], 4, 10, 2], - [1, C.audio['hop_length'] * 5, C.audio['hop_length'], 2, "mold", 0], + [16, C.audio['hop_length'] * 10, C.audio['hop_length'], 2, 10, True, 0], + [16, C.audio['hop_length'] * 10, C.audio['hop_length'], 2, "mold", False, 4], + [1, C.audio['hop_length'] * 10, C.audio['hop_length'], 2, 9, False, 0], + [1, C.audio['hop_length'], C.audio['hop_length'], 2, 10, True, 0], + [1, C.audio['hop_length'], C.audio['hop_length'], 2, "mold", False, 0], + [1, C.audio['hop_length'] * 5, C.audio['hop_length'], 4, 10, False, 2], + [1, C.audio['hop_length'] * 5, C.audio['hop_length'], 2, "mold", False, 0], ] for param in params: print(param)
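Note on the "mode" / "mulaw" settings introduced in wavernn_config.json above: when "mode" is an integer bit depth, WaveRNNDataset turns each target waveform into 2**mode integer classes (mu-law companded first when "mulaw" is true, via ap.mulaw_encode, otherwise quantized via ap.quantize), and the trainer pairs that with torch.nn.CrossEntropyLoss over those classes. The standalone NumPy sketch below illustrates only the companding step; the function names and rounding details here are illustrative assumptions, not the repository's AudioProcessor implementation:

import numpy as np

def mulaw_encode(wav, bits=10):
    # Compand a [-1, 1] waveform and quantize it to 2**bits classes (G.711-style mu-law).
    mu = 2 ** bits - 1
    companded = np.sign(wav) * np.log1p(mu * np.abs(wav)) / np.log1p(mu)
    return np.clip((companded + 1.0) / 2.0 * mu + 0.5, 0, mu).astype(np.int64)

def mulaw_decode(classes, bits=10):
    # Invert mulaw_encode back to a float waveform in [-1, 1].
    mu = 2 ** bits - 1
    x = 2.0 * classes.astype(np.float64) / mu - 1.0
    return np.sign(x) / mu * ((1.0 + mu) ** np.abs(x) - 1.0)

if __name__ == "__main__":
    wav = 0.8 * np.sin(np.linspace(0, 8 * np.pi, 22050)).astype(np.float32)
    q = mulaw_encode(wav, bits=10)   # integer targets in [0, 1023], as CrossEntropyLoss expects
    print(q.min(), q.max(), np.abs(wav - mulaw_decode(q, bits=10)).max())

Mu-law companding allocates more quantization levels to low amplitudes, which is the usual motivation for enabling it when "mode" is a bit depth rather than "mold" or "gauss".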
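The "batched", "target_samples" and "overlap_samples" entries configure the batched path of WaveRNN.generate() used in the train/eval synthesis snippets above: the conditioning mels are folded into overlapping segments, decoded in parallel, and the segments are cross-faded back into one waveform. The sketch below shows only the recombination idea with a simple linear cross-fade over NumPy arrays; it assumes equal-length folds and is not the model's own fold/unfold code, which differs in detail (padding, fade shape):

import numpy as np

def xfade_and_join(folds, target, overlap):
    # folds: (num_folds, target + 2 * overlap) overlapping segments in generation order.
    # Returns one waveform of length num_folds * (target + overlap) + overlap.
    num_folds, seg_len = folds.shape
    assert seg_len == target + 2 * overlap
    fade_in = np.linspace(0.0, 1.0, overlap)
    fade_out = np.linspace(1.0, 0.0, overlap)
    out = np.zeros(num_folds * (target + overlap) + overlap, dtype=folds.dtype)
    for i in range(num_folds):
        seg = folds[i].copy()
        seg[:overlap] *= fade_in            # ramp up under the previous segment's tail
        seg[-overlap:] *= fade_out          # ramp down under the next segment's head
        start = i * (target + overlap)
        out[start:start + seg_len] += seg   # overlapping regions sum to a smooth cross-fade
    return out

if __name__ == "__main__":
    target, overlap = 11000, 550            # the "target_samples" / "overlap_samples" values above
    folds = np.random.randn(4, target + 2 * overlap).astype(np.float32)
    print(xfade_and_join(folds, target, overlap).shape)  # (46750,) == 4 * (11000 + 550) + 550

With the config values above, each utterance is therefore decoded in chunks of 11000 samples with 550 samples of overlap on each side for the cross-fades.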