From 5044f295165a801424415a8ffd7baf890aafa417 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 25 Sep 2020 19:19:13 +0000 Subject: [PATCH 01/98] Bump tensorflow from 2.3.0 to 2.3.1 Bumps [tensorflow](https://github.com/tensorflow/tensorflow) from 2.3.0 to 2.3.1. - [Release notes](https://github.com/tensorflow/tensorflow/releases) - [Changelog](https://github.com/tensorflow/tensorflow/blob/master/RELEASE.md) - [Commits](https://github.com/tensorflow/tensorflow/compare/v2.3.0...v2.3.1) Signed-off-by: dependabot[bot] --- requirements.txt | 2 +- requirements_tests.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index fdec4c57..d70c4b01 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ torch>=1.5 -tensorflow==2.3.0 +tensorflow==2.3.1 numpy>=1.16.0 scipy>=0.19.0 numba==0.48 diff --git a/requirements_tests.txt b/requirements_tests.txt index f37cda19..73f684a3 100644 --- a/requirements_tests.txt +++ b/requirements_tests.txt @@ -1,5 +1,5 @@ torch>=1.5 -tensorflow==2.3.0 +tensorflow==2.3.1 numpy>=1.16.0 scipy>=0.19.0 numba==0.48 From d6bd3cd8b8981add60fa727dd6385b2f1d99822a Mon Sep 17 00:00:00 2001 From: Alex K Date: Thu, 15 Oct 2020 19:14:50 +0200 Subject: [PATCH 02/98] add initial wavernn support --- TTS/bin/compute_statistics.py | 53 +- ...{train_vocoder.py => train_gan_vocoder.py} | 344 ++++++------ TTS/bin/train_wavernn_vocoder.py | 493 ++++++++++++++++++ TTS/vocoder/configs/wavernn_config.json | 95 ++++ TTS/vocoder/datasets/preprocess.py | 8 +- TTS/vocoder/utils/generic_utils.py | 20 + 6 files changed, 838 insertions(+), 175 deletions(-) rename TTS/bin/{train_vocoder.py => train_gan_vocoder.py} (68%) create mode 100644 TTS/bin/train_wavernn_vocoder.py create mode 100644 TTS/vocoder/configs/wavernn_config.json diff --git a/TTS/bin/compute_statistics.py b/TTS/bin/compute_statistics.py index 1c6ef94d..9177c75b 100755 --- a/TTS/bin/compute_statistics.py +++ b/TTS/bin/compute_statistics.py @@ -11,20 +11,27 @@ from TTS.tts.datasets.preprocess import load_meta_data from TTS.utils.io import load_config from TTS.utils.audio import AudioProcessor + def main(): """Run preprocessing process.""" parser = argparse.ArgumentParser( - description="Compute mean and variance of spectrogtram features.") - parser.add_argument("--config_path", type=str, required=True, - help="TTS config file path to define audio processin parameters.") - parser.add_argument("--out_path", default=None, type=str, - help="directory to save the output file.") + description="Compute mean and variance of spectrogtram features." + ) + parser.add_argument( + "--config_path", + type=str, + required=True, + help="TTS config file path to define audio processin parameters.", + ) + parser.add_argument( + "--out_path", default=None, type=str, help="directory to save the output file." 
+ ) args = parser.parse_args() # load config CONFIG = load_config(args.config_path) - CONFIG.audio['signal_norm'] = False # do not apply earlier normalization - CONFIG.audio['stats_path'] = None # discard pre-defined stats + CONFIG.audio["signal_norm"] = False # do not apply earlier normalization + CONFIG.audio["stats_path"] = None # discard pre-defined stats # load audio processor ap = AudioProcessor(**CONFIG.audio) @@ -58,27 +65,27 @@ def main(): output_file_path = os.path.join(args.out_path, "scale_stats.npy") stats = {} - stats['mel_mean'] = mel_mean - stats['mel_std'] = mel_scale - stats['linear_mean'] = linear_mean - stats['linear_std'] = linear_scale + stats["mel_mean"] = mel_mean + stats["mel_std"] = mel_scale + stats["linear_mean"] = linear_mean + stats["linear_std"] = linear_scale - print(f' > Avg mel spec mean: {mel_mean.mean()}') - print(f' > Avg mel spec scale: {mel_scale.mean()}') - print(f' > Avg linear spec mean: {linear_mean.mean()}') - print(f' > Avg lienar spec scale: {linear_scale.mean()}') + print(f" > Avg mel spec mean: {mel_mean.mean()}") + print(f" > Avg mel spec scale: {mel_scale.mean()}") + print(f" > Avg linear spec mean: {linear_mean.mean()}") + print(f" > Avg lienar spec scale: {linear_scale.mean()}") # set default config values for mean-var scaling - CONFIG.audio['stats_path'] = output_file_path - CONFIG.audio['signal_norm'] = True + CONFIG.audio["stats_path"] = output_file_path + CONFIG.audio["signal_norm"] = True # remove redundant values - del CONFIG.audio['max_norm'] - del CONFIG.audio['min_level_db'] - del CONFIG.audio['symmetric_norm'] - del CONFIG.audio['clip_norm'] - stats['audio_config'] = CONFIG.audio + del CONFIG.audio["max_norm"] + del CONFIG.audio["min_level_db"] + del CONFIG.audio["symmetric_norm"] + del CONFIG.audio["clip_norm"] + stats["audio_config"] = CONFIG.audio np.save(output_file_path, stats, allow_pickle=True) - print(f' > scale_stats.npy is saved to {output_file_path}') + print(f" > scale_stats.npy is saved to {output_file_path}") if __name__ == "__main__": diff --git a/TTS/bin/train_vocoder.py b/TTS/bin/train_gan_vocoder.py similarity index 68% rename from TTS/bin/train_vocoder.py rename to TTS/bin/train_gan_vocoder.py index b51a55a3..7689c930 100644 --- a/TTS/bin/train_vocoder.py +++ b/TTS/bin/train_gan_vocoder.py @@ -10,20 +10,29 @@ import torch from torch.utils.data import DataLoader from TTS.utils.audio import AudioProcessor from TTS.utils.console_logger import ConsoleLogger -from TTS.utils.generic_utils import (KeepAverage, count_parameters, - create_experiment_folder, get_git_branch, - remove_experiment_folder, set_init_dict) +from TTS.utils.generic_utils import ( + KeepAverage, + count_parameters, + create_experiment_folder, + get_git_branch, + remove_experiment_folder, + set_init_dict, +) from TTS.utils.io import copy_config_file, load_config from TTS.utils.radam import RAdam from TTS.utils.tensorboard_logger import TensorboardLogger from TTS.utils.training import setup_torch_training_env from TTS.vocoder.datasets.gan_dataset import GANDataset from TTS.vocoder.datasets.preprocess import load_wav_data, load_wav_feat_data + # from distribute import (DistributedSampler, apply_gradient_allreduce, # init_distributed, reduce_tensor) from TTS.vocoder.layers.losses import DiscriminatorLoss, GeneratorLoss -from TTS.vocoder.utils.generic_utils import (plot_results, setup_discriminator, - setup_generator) +from TTS.vocoder.utils.generic_utils import ( + plot_results, + setup_discriminator, + setup_generator, +) from TTS.vocoder.utils.io 
import save_best_model, save_checkpoint use_cuda, num_gpus = setup_torch_training_env(True, True) @@ -33,27 +42,30 @@ def setup_loader(ap, is_val=False, verbose=False): if is_val and not c.run_eval: loader = None else: - dataset = GANDataset(ap=ap, - items=eval_data if is_val else train_data, - seq_len=c.seq_len, - hop_len=ap.hop_length, - pad_short=c.pad_short, - conv_pad=c.conv_pad, - is_training=not is_val, - return_segments=not is_val, - use_noise_augment=c.use_noise_augment, - use_cache=c.use_cache, - verbose=verbose) + dataset = GANDataset( + ap=ap, + items=eval_data if is_val else train_data, + seq_len=c.seq_len, + hop_len=ap.hop_length, + pad_short=c.pad_short, + conv_pad=c.conv_pad, + is_training=not is_val, + return_segments=not is_val, + use_noise_augment=c.use_noise_augment, + use_cache=c.use_cache, + verbose=verbose, + ) dataset.shuffle_mapping() # sampler = DistributedSampler(dataset) if num_gpus > 1 else None - loader = DataLoader(dataset, - batch_size=1 if is_val else c.batch_size, - shuffle=True, - drop_last=False, - sampler=None, - num_workers=c.num_val_loader_workers - if is_val else c.num_loader_workers, - pin_memory=False) + loader = DataLoader( + dataset, + batch_size=1 if is_val else c.batch_size, + shuffle=True, + drop_last=False, + sampler=None, + num_workers=c.num_val_loader_workers if is_val else c.num_loader_workers, + pin_memory=False, + ) return loader @@ -80,16 +92,26 @@ def format_data(data): return co, x, None, None -def train(model_G, criterion_G, optimizer_G, model_D, criterion_D, optimizer_D, - scheduler_G, scheduler_D, ap, global_step, epoch): +def train( + model_G, + criterion_G, + optimizer_G, + model_D, + criterion_D, + optimizer_D, + scheduler_G, + scheduler_D, + ap, + global_step, + epoch, +): data_loader = setup_loader(ap, is_val=False, verbose=(epoch == 0)) model_G.train() model_D.train() epoch_time = 0 keep_avg = KeepAverage() if use_cuda: - batch_n_iter = int( - len(data_loader.dataset) / (c.batch_size * num_gpus)) + batch_n_iter = int(len(data_loader.dataset) / (c.batch_size * num_gpus)) else: batch_n_iter = int(len(data_loader.dataset) / c.batch_size) end_time = time.time() @@ -145,16 +167,16 @@ def train(model_G, criterion_G, optimizer_G, model_D, criterion_D, optimizer_D, scores_fake = D_out_fake # compute losses - loss_G_dict = criterion_G(y_hat, y_G, scores_fake, feats_fake, - feats_real, y_hat_sub, y_G_sub) - loss_G = loss_G_dict['G_loss'] + loss_G_dict = criterion_G( + y_hat, y_G, scores_fake, feats_fake, feats_real, y_hat_sub, y_G_sub + ) + loss_G = loss_G_dict["G_loss"] # optimizer generator optimizer_G.zero_grad() loss_G.backward() if c.gen_clip_grad > 0: - torch.nn.utils.clip_grad_norm_(model_G.parameters(), - c.gen_clip_grad) + torch.nn.utils.clip_grad_norm_(model_G.parameters(), c.gen_clip_grad) optimizer_G.step() if scheduler_G is not None: scheduler_G.step() @@ -199,14 +221,13 @@ def train(model_G, criterion_G, optimizer_G, model_D, criterion_D, optimizer_D, # compute losses loss_D_dict = criterion_D(scores_fake, scores_real) - loss_D = loss_D_dict['D_loss'] + loss_D = loss_D_dict["D_loss"] # optimizer discriminator optimizer_D.zero_grad() loss_D.backward() if c.disc_clip_grad > 0: - torch.nn.utils.clip_grad_norm_(model_D.parameters(), - c.disc_clip_grad) + torch.nn.utils.clip_grad_norm_(model_D.parameters(), c.disc_clip_grad) optimizer_D.step() if scheduler_D is not None: scheduler_D.step() @@ -221,34 +242,40 @@ def train(model_G, criterion_G, optimizer_G, model_D, criterion_D, optimizer_D, epoch_time += step_time # get current 
learning rates - current_lr_G = list(optimizer_G.param_groups)[0]['lr'] - current_lr_D = list(optimizer_D.param_groups)[0]['lr'] + current_lr_G = list(optimizer_G.param_groups)[0]["lr"] + current_lr_D = list(optimizer_D.param_groups)[0]["lr"] # update avg stats update_train_values = dict() for key, value in loss_dict.items(): - update_train_values['avg_' + key] = value - update_train_values['avg_loader_time'] = loader_time - update_train_values['avg_step_time'] = step_time + update_train_values["avg_" + key] = value + update_train_values["avg_loader_time"] = loader_time + update_train_values["avg_step_time"] = step_time keep_avg.update_values(update_train_values) # print training stats if global_step % c.print_step == 0: log_dict = { - 'step_time': [step_time, 2], - 'loader_time': [loader_time, 4], + "step_time": [step_time, 2], + "loader_time": [loader_time, 4], "current_lr_G": current_lr_G, - "current_lr_D": current_lr_D + "current_lr_D": current_lr_D, } - c_logger.print_train_step(batch_n_iter, num_iter, global_step, - log_dict, loss_dict, keep_avg.avg_values) + c_logger.print_train_step( + batch_n_iter, + num_iter, + global_step, + log_dict, + loss_dict, + keep_avg.avg_values, + ) # plot step stats if global_step % 10 == 0: iter_stats = { "lr_G": current_lr_G, "lr_D": current_lr_D, - "step_time": step_time + "step_time": step_time, } iter_stats.update(loss_dict) tb_logger.tb_train_iter_stats(global_step, iter_stats) @@ -257,27 +284,28 @@ def train(model_G, criterion_G, optimizer_G, model_D, criterion_D, optimizer_D, if global_step % c.save_step == 0: if c.checkpoint: # save model - save_checkpoint(model_G, - optimizer_G, - scheduler_G, - model_D, - optimizer_D, - scheduler_D, - global_step, - epoch, - OUT_PATH, - model_losses=loss_dict) + save_checkpoint( + model_G, + optimizer_G, + scheduler_G, + model_D, + optimizer_D, + scheduler_D, + global_step, + epoch, + OUT_PATH, + model_losses=loss_dict, + ) # compute spectrograms - figures = plot_results(y_hat_vis, y_G, ap, global_step, - 'train') + figures = plot_results(y_hat_vis, y_G, ap, global_step, "train") tb_logger.tb_train_figures(global_step, figures) # Sample audio sample_voice = y_hat_vis[0].squeeze(0).detach().cpu().numpy() - tb_logger.tb_train_audios(global_step, - {'train/audio': sample_voice}, - c.audio["sample_rate"]) + tb_logger.tb_train_audios( + global_step, {"train/audio": sample_voice}, c.audio["sample_rate"] + ) end_time = time.time() # print epoch stats @@ -326,7 +354,6 @@ def evaluate(model_G, criterion_G, model_D, criterion_D, ap, global_step, epoch) y_hat = model_G.pqmf_synthesis(y_hat) y_G_sub = model_G.pqmf_analysis(y_G) - scores_fake, feats_fake, feats_real = None, None, None if global_step > c.steps_to_start_discriminator: @@ -352,8 +379,9 @@ def evaluate(model_G, criterion_G, model_D, criterion_D, ap, global_step, epoch) feats_fake, feats_real = None, None # compute losses - loss_G_dict = criterion_G(y_hat, y_G, scores_fake, feats_fake, - feats_real, y_hat_sub, y_G_sub) + loss_G_dict = criterion_G( + y_hat, y_G, scores_fake, feats_fake, feats_real, y_hat_sub, y_G_sub + ) loss_dict = dict() for key, value in loss_G_dict.items(): @@ -403,16 +431,15 @@ def evaluate(model_G, criterion_G, model_D, criterion_D, ap, global_step, epoch) else: loss_dict[key] = value.item() - step_time = time.time() - start_time epoch_time += step_time # update avg stats update_eval_values = dict() for key, value in loss_dict.items(): - update_eval_values['avg_' + key] = value - update_eval_values['avg_loader_time'] = loader_time - 
update_eval_values['avg_step_time'] = step_time + update_eval_values["avg_" + key] = value + update_eval_values["avg_loader_time"] = loader_time + update_eval_values["avg_step_time"] = step_time keep_avg.update_values(update_eval_values) # print eval stats @@ -420,13 +447,14 @@ def evaluate(model_G, criterion_G, model_D, criterion_D, ap, global_step, epoch) c_logger.print_eval_step(num_iter, loss_dict, keep_avg.avg_values) # compute spectrograms - figures = plot_results(y_hat, y_G, ap, global_step, 'eval') + figures = plot_results(y_hat, y_G, ap, global_step, "eval") tb_logger.tb_eval_figures(global_step, figures) # Sample audio sample_voice = y_hat[0].squeeze(0).detach().cpu().numpy() - tb_logger.tb_eval_audios(global_step, {'eval/audio': sample_voice}, - c.audio["sample_rate"]) + tb_logger.tb_eval_audios( + global_step, {"eval/audio": sample_voice}, c.audio["sample_rate"] + ) # synthesize a full voice data_loader.return_segments = False @@ -443,7 +471,9 @@ def main(args): # pylint: disable=redefined-outer-name print(f" > Loading wavs from: {c.data_path}") if c.feature_path is not None: print(f" > Loading features from: {c.feature_path}") - eval_data, train_data = load_wav_feat_data(c.data_path, c.feature_path, c.eval_split_size) + eval_data, train_data = load_wav_feat_data( + c.data_path, c.feature_path, c.eval_split_size + ) else: eval_data, train_data = load_wav_data(c.data_path, c.eval_split_size) @@ -461,17 +491,15 @@ def main(args): # pylint: disable=redefined-outer-name # setup optimizers optimizer_gen = RAdam(model_gen.parameters(), lr=c.lr_gen, weight_decay=0) - optimizer_disc = RAdam(model_disc.parameters(), - lr=c.lr_disc, - weight_decay=0) + optimizer_disc = RAdam(model_disc.parameters(), lr=c.lr_disc, weight_decay=0) # schedulers scheduler_gen = None scheduler_disc = None - if 'lr_scheduler_gen' in c: + if "lr_scheduler_gen" in c: scheduler_gen = getattr(torch.optim.lr_scheduler, c.lr_scheduler_gen) scheduler_gen = scheduler_gen(optimizer_gen, **c.lr_scheduler_gen_params) - if 'lr_scheduler_disc' in c: + if "lr_scheduler_disc" in c: scheduler_disc = getattr(torch.optim.lr_scheduler, c.lr_scheduler_disc) scheduler_disc = scheduler_disc(optimizer_disc, **c.lr_scheduler_disc_params) @@ -480,47 +508,46 @@ def main(args): # pylint: disable=redefined-outer-name criterion_disc = DiscriminatorLoss(c) if args.restore_path: - checkpoint = torch.load(args.restore_path, map_location='cpu') + checkpoint = torch.load(args.restore_path, map_location="cpu") try: print(" > Restoring Generator Model...") - model_gen.load_state_dict(checkpoint['model']) + model_gen.load_state_dict(checkpoint["model"]) print(" > Restoring Generator Optimizer...") - optimizer_gen.load_state_dict(checkpoint['optimizer']) + optimizer_gen.load_state_dict(checkpoint["optimizer"]) print(" > Restoring Discriminator Model...") - model_disc.load_state_dict(checkpoint['model_disc']) + model_disc.load_state_dict(checkpoint["model_disc"]) print(" > Restoring Discriminator Optimizer...") - optimizer_disc.load_state_dict(checkpoint['optimizer_disc']) - if 'scheduler' in checkpoint: + optimizer_disc.load_state_dict(checkpoint["optimizer_disc"]) + if "scheduler" in checkpoint: print(" > Restoring Generator LR Scheduler...") - scheduler_gen.load_state_dict(checkpoint['scheduler']) + scheduler_gen.load_state_dict(checkpoint["scheduler"]) # NOTE: Not sure if necessary scheduler_gen.optimizer = optimizer_gen - if 'scheduler_disc' in checkpoint: + if "scheduler_disc" in checkpoint: print(" > Restoring Discriminator LR Scheduler...") 
- scheduler_disc.load_state_dict(checkpoint['scheduler_disc']) + scheduler_disc.load_state_dict(checkpoint["scheduler_disc"]) scheduler_disc.optimizer = optimizer_disc except RuntimeError: # retore only matching layers. print(" > Partial model initialization...") model_dict = model_gen.state_dict() - model_dict = set_init_dict(model_dict, checkpoint['model'], c) + model_dict = set_init_dict(model_dict, checkpoint["model"], c) model_gen.load_state_dict(model_dict) model_dict = model_disc.state_dict() - model_dict = set_init_dict(model_dict, checkpoint['model_disc'], c) + model_dict = set_init_dict(model_dict, checkpoint["model_disc"], c) model_disc.load_state_dict(model_dict) del model_dict # reset lr if not countinuining training. for group in optimizer_gen.param_groups: - group['lr'] = c.lr_gen + group["lr"] = c.lr_gen for group in optimizer_disc.param_groups: - group['lr'] = c.lr_disc + group["lr"] = c.lr_disc - print(" > Model restored from step %d" % checkpoint['step'], - flush=True) - args.restore_step = checkpoint['step'] + print(" > Model restored from step %d" % checkpoint["step"], flush=True) + args.restore_step = checkpoint["step"] else: args.restore_step = 0 @@ -539,75 +566,92 @@ def main(args): # pylint: disable=redefined-outer-name num_params = count_parameters(model_disc) print(" > Discriminator has {} parameters".format(num_params), flush=True) - if 'best_loss' not in locals(): - best_loss = float('inf') + if "best_loss" not in locals(): + best_loss = float("inf") global_step = args.restore_step for epoch in range(0, c.epochs): c_logger.print_epoch_start(epoch, c.epochs) - _, global_step = train(model_gen, criterion_gen, optimizer_gen, - model_disc, criterion_disc, optimizer_disc, - scheduler_gen, scheduler_disc, ap, global_step, - epoch) - eval_avg_loss_dict = evaluate(model_gen, criterion_gen, model_disc, criterion_disc, ap, - global_step, epoch) + _, global_step = train( + model_gen, + criterion_gen, + optimizer_gen, + model_disc, + criterion_disc, + optimizer_disc, + scheduler_gen, + scheduler_disc, + ap, + global_step, + epoch, + ) + eval_avg_loss_dict = evaluate( + model_gen, criterion_gen, model_disc, criterion_disc, ap, global_step, epoch + ) c_logger.print_epoch_end(epoch, eval_avg_loss_dict) target_loss = eval_avg_loss_dict[c.target_loss] - best_loss = save_best_model(target_loss, - best_loss, - model_gen, - optimizer_gen, - scheduler_gen, - model_disc, - optimizer_disc, - scheduler_disc, - global_step, - epoch, - OUT_PATH, - model_losses=eval_avg_loss_dict) + best_loss = save_best_model( + target_loss, + best_loss, + model_gen, + optimizer_gen, + scheduler_gen, + model_disc, + optimizer_disc, + scheduler_disc, + global_step, + epoch, + OUT_PATH, + model_losses=eval_avg_loss_dict, + ) -if __name__ == '__main__': +if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument( - '--continue_path', + "--continue_path", type=str, - help= - 'Training output folder to continue training. Use to continue a training. If it is used, "config_path" is ignored.', - default='', - required='--config_path' not in sys.argv) + help='Training output folder to continue training. Use to continue a training. If it is used, "config_path" is ignored.', + default="", + required="--config_path" not in sys.argv, + ) parser.add_argument( - '--restore_path', + "--restore_path", type=str, - help='Model file to be restored. 
Use to finetune a model.', - default='') - parser.add_argument('--config_path', - type=str, - help='Path to config file for training.', - required='--continue_path' not in sys.argv) - parser.add_argument('--debug', - type=bool, - default=False, - help='Do not verify commit integrity to run training.') + help="Model file to be restored. Use to finetune a model.", + default="", + ) + parser.add_argument( + "--config_path", + type=str, + help="Path to config file for training.", + required="--continue_path" not in sys.argv, + ) + parser.add_argument( + "--debug", + type=bool, + default=False, + help="Do not verify commit integrity to run training.", + ) # DISTRUBUTED parser.add_argument( - '--rank', + "--rank", type=int, default=0, - help='DISTRIBUTED: process rank for distributed training.') - parser.add_argument('--group_id', - type=str, - default="", - help='DISTRIBUTED: process group id.') + help="DISTRIBUTED: process rank for distributed training.", + ) + parser.add_argument( + "--group_id", type=str, default="", help="DISTRIBUTED: process group id." + ) args = parser.parse_args() - if args.continue_path != '': + if args.continue_path != "": args.output_path = args.continue_path - args.config_path = os.path.join(args.continue_path, 'config.json') + args.config_path = os.path.join(args.continue_path, "config.json") list_of_files = glob.glob( - args.continue_path + - "/*.pth.tar") # * means all if need specific format then *.csv + args.continue_path + "/*.pth.tar" + ) # * means all if need specific format then *.csv latest_model_file = max(list_of_files, key=os.path.getctime) args.restore_path = latest_model_file print(f" > Training continues for {args.restore_path}") @@ -618,11 +662,10 @@ if __name__ == '__main__': _ = os.path.dirname(os.path.realpath(__file__)) OUT_PATH = args.continue_path - if args.continue_path == '': - OUT_PATH = create_experiment_folder(c.output_path, c.run_name, - args.debug) + if args.continue_path == "": + OUT_PATH = create_experiment_folder(c.output_path, c.run_name, args.debug) - AUDIO_PATH = os.path.join(OUT_PATH, 'test_audios') + AUDIO_PATH = os.path.join(OUT_PATH, "test_audios") c_logger = ConsoleLogger() @@ -632,16 +675,17 @@ if __name__ == '__main__': if args.restore_path: new_fields["restore_path"] = args.restore_path new_fields["github_branch"] = get_git_branch() - copy_config_file(args.config_path, - os.path.join(OUT_PATH, 'config.json'), new_fields) + copy_config_file( + args.config_path, os.path.join(OUT_PATH, "config.json"), new_fields + ) os.chmod(AUDIO_PATH, 0o775) os.chmod(OUT_PATH, 0o775) LOG_DIR = OUT_PATH - tb_logger = TensorboardLogger(LOG_DIR, model_name='VOCODER') + tb_logger = TensorboardLogger(LOG_DIR, model_name="VOCODER") # write model desc to tensorboard - tb_logger.tb_add_text('model-description', c['run_description'], 0) + tb_logger.tb_add_text("model-description", c["run_description"], 0) try: main(args) @@ -654,4 +698,4 @@ if __name__ == '__main__': except Exception: # pylint: disable=broad-except remove_experiment_folder(OUT_PATH) traceback.print_exc() - sys.exit(1) + sys.exit(1) \ No newline at end of file diff --git a/TTS/bin/train_wavernn_vocoder.py b/TTS/bin/train_wavernn_vocoder.py new file mode 100644 index 00000000..2f77ab57 --- /dev/null +++ b/TTS/bin/train_wavernn_vocoder.py @@ -0,0 +1,493 @@ +import argparse +import math +import os +import pickle +import shutil +import sys +import traceback +import time +import glob +import random + +import torch +from torch.utils.data import DataLoader +from torch.utils.data.distributed 
import DistributedSampler + + +from TTS.utils.audio import AudioProcessor +from TTS.tts.utils.visual import plot_spectrogram +from TTS.utils.io import copy_config_file, load_config +from TTS.vocoder.datasets.wavernn_dataset import WaveRNNDataset +from TTS.utils.tensorboard_logger import TensorboardLogger +from TTS.vocoder.datasets.preprocess import load_wav_data, load_wav_feat_data +from TTS.vocoder.utils.distribution import discretized_mix_logistic_loss, gaussian_loss +from TTS.vocoder.utils.generic_utils import setup_wavernn +from TTS.utils.training import setup_torch_training_env +from TTS.utils.console_logger import ConsoleLogger +from TTS.utils.generic_utils import ( + KeepAverage, + count_parameters, + create_experiment_folder, + get_git_branch, + remove_experiment_folder, + set_init_dict, +) +from TTS.vocoder.utils.io import save_best_model, save_checkpoint + + +use_cuda, num_gpus = setup_torch_training_env(True, True) + + +def setup_loader(ap, is_val=False, verbose=False): + if is_val and not CONFIG.run_eval: + loader = None + else: + dataset = WaveRNNDataset( + ap=ap, + items=eval_data if is_val else train_data, + seq_len=CONFIG.seq_len, + hop_len=ap.hop_length, + pad=CONFIG.padding, + mode=CONFIG.mode, + is_training=not is_val, + verbose=verbose, + ) + # sampler = DistributedSampler(dataset) if num_gpus > 1 else None + loader = DataLoader( + dataset, + shuffle=True, + collate_fn=dataset.collate, + batch_size=CONFIG.batch_size, + num_workers=CONFIG.num_val_loader_workers + if is_val + else CONFIG.num_loader_workers, + pin_memory=True, + ) + return loader + + +def format_data(data): + # setup input data + x = data[0] + m = data[1] + y = data[2] + + # dispatch data to GPU + if use_cuda: + x = x.cuda(non_blocking=True) + m = m.cuda(non_blocking=True) + y = y.cuda(non_blocking=True) + + return x, m, y + + +def train(model, optimizer, criterion, scheduler, ap, global_step, epoch): + # create train loader + data_loader = setup_loader(ap, is_val=False, verbose=(epoch == 0)) + model.train() + epoch_time = 0 + keep_avg = KeepAverage() + if use_cuda: + batch_n_iter = int(len(data_loader.dataset) / (CONFIG.batch_size * num_gpus)) + else: + batch_n_iter = int(len(data_loader.dataset) / CONFIG.batch_size) + end_time = time.time() + c_logger.print_train_start() + # train loop + print(" > Training", flush=True) + for num_iter, data in enumerate(data_loader): + start_time = time.time() + x, m, y = format_data(data) + loader_time = time.time() - end_time + global_step += 1 + + ################## + # MODEL TRAINING # + ################## + y_hat = model(x, m) + y_hat_vis = y_hat # for visualization + + # y_hat = y_hat.transpose(1, 2) + if isinstance(model.mode, int): + y_hat = y_hat.transpose(1, 2).unsqueeze(-1) + else: + y = y.float() + y = y.unsqueeze(-1) + # m_scaled, _ = model.upsample(m) + + # compute losses + loss = criterion(y_hat, y) + if loss.item() is None: + raise RuntimeError(" [!] None loss. 
Exiting ...") + optimizer.zero_grad() + loss.backward() + if CONFIG.grad_clip > 0: + torch.nn.utils.clip_grad_norm_(model.parameters(), CONFIG.grad_clip) + + optimizer.step() + if scheduler is not None: + scheduler.step() + + # get the current learning rate + cur_lr = list(optimizer.param_groups)[0]["lr"] + + step_time = time.time() - start_time + epoch_time += step_time + + update_train_values = dict() + loss_dict = dict() + loss_dict["model_loss"] = loss.item() + for key, value in loss_dict.items(): + update_train_values["avg_" + key] = value + update_train_values["avg_loader_time"] = loader_time + update_train_values["avg_step_time"] = step_time + keep_avg.update_values(update_train_values) + + # print training stats + if global_step % CONFIG.print_step == 0: + log_dict = { + "step_time": [step_time, 2], + "loader_time": [loader_time, 4], + "current_lr": cur_lr, + } + c_logger.print_train_step( + batch_n_iter, + num_iter, + global_step, + log_dict, + loss_dict, + keep_avg.avg_values, + ) + + # plot step stats + if global_step % 10 == 0: + iter_stats = {"lr": cur_lr, "step_time": step_time} + iter_stats.update(loss_dict) + tb_logger.tb_train_iter_stats(global_step, iter_stats) + + # save checkpoint + if global_step % CONFIG.save_step == 0: + if CONFIG.checkpoint: + # save model + save_checkpoint( + model, + optimizer, + scheduler, + None, + None, + None, + global_step, + epoch, + OUT_PATH, + model_losses=loss_dict, + ) + + # synthesize a full voice + wav_path = train_data[random.randrange(0, len(train_data))][0] + wav = ap.load_wav(wav_path) + ground_mel = ap.melspectrogram(wav) + sample_wav = model.generate( + ground_mel, + CONFIG.batched, + CONFIG.target_samples, + CONFIG.overlap_samples, + ) + predict_mel = ap.melspectrogram(sample_wav) + + # Sample audio + tb_logger.tb_train_audios( + global_step, {"eval/audio": sample_wav}, CONFIG.audio["sample_rate"] + ) + # compute spectrograms + figures = { + "prediction": plot_spectrogram(predict_mel.T, ap, output_fig=False), + "ground_truth": plot_spectrogram(ground_mel.T, ap, output_fig=False), + } + tb_logger.tb_train_figures(global_step, figures) + end_time = time.time() + + # print epoch stats + c_logger.print_train_epoch_end(global_step, epoch, epoch_time, keep_avg) + + # Plot Training Epoch Stats + epoch_stats = {"epoch_time": epoch_time} + epoch_stats.update(keep_avg.avg_values) + tb_logger.tb_train_epoch_stats(global_step, epoch_stats) + # TODO: plot model stats + # if c.tb_model_param_stats: + # tb_logger.tb_model_weights(model, global_step) + return keep_avg.avg_values, global_step + + +@torch.no_grad() +def evaluate(model, criterion, ap, global_step, epoch): + # create train loader + data_loader = setup_loader(ap, is_val=True, verbose=(epoch == 0)) + model.eval() + epoch_time = 0 + keep_avg = KeepAverage() + end_time = time.time() + c_logger.print_eval_start() + with torch.no_grad(): + for num_iter, data in enumerate(data_loader): + start_time = time.time() + # format data + x, m, y = format_data(data) + loader_time = time.time() - end_time + global_step += 1 + + y_hat = model(x, m) + if isinstance(model.mode, int): + y_hat = y_hat.transpose(1, 2).unsqueeze(-1) + else: + y = y.float() + y = y.unsqueeze(-1) + loss = criterion(y_hat, y) + # Compute avg loss + # if num_gpus > 1: + # loss = reduce_tensor(loss.data, num_gpus) + loss_dict = dict() + loss_dict["model_loss"] = loss.item() + + step_time = time.time() - start_time + epoch_time += step_time + + # update avg stats + update_eval_values = dict() + for key, value in 
loss_dict.items(): + update_eval_values["avg_" + key] = value + update_eval_values["avg_loader_time"] = loader_time + update_eval_values["avg_step_time"] = step_time + keep_avg.update_values(update_eval_values) + + # print eval stats + if CONFIG.print_eval: + c_logger.print_eval_step(num_iter, loss_dict, keep_avg.avg_values) + + if epoch > CONFIG.test_delay_epochs: + # synthesize a full voice + wav_path = eval_data[random.randrange(0, len(eval_data))][0] + wav = ap.load_wav(wav_path) + ground_mel = ap.melspectrogram(wav) + sample_wav = model.generate( + ground_mel, + CONFIG.batched, + CONFIG.target_samples, + CONFIG.overlap_samples, + ) + predict_mel = ap.melspectrogram(sample_wav) + + # Sample audio + tb_logger.tb_eval_audios( + global_step, {"eval/audio": sample_wav}, CONFIG.audio["sample_rate"] + ) + # compute spectrograms + figures = { + "prediction": plot_spectrogram(predict_mel.T, ap, output_fig=False), + "ground_truth": plot_spectrogram(ground_mel.T, ap, output_fig=False), + } + tb_logger.tb_eval_figures(global_step, figures) + + tb_logger.tb_eval_stats(global_step, keep_avg.avg_values) + return keep_avg.avg_values + + +# FIXME: move args definition/parsing inside of main? +def main(args): # pylint: disable=redefined-outer-name + # pylint: disable=global-variable-undefined + global train_data, eval_data + + print(f" > Loading wavs from: {CONFIG.data_path}") + if CONFIG.feature_path is not None: + print(f" > Loading features from: {CONFIG.feature_path}") + eval_data, train_data = load_wav_feat_data( + CONFIG.data_path, CONFIG.feature_path, CONFIG.eval_split_size + ) + eval_data, train_data = eval_data, train_data + else: + eval_data, train_data = load_wav_data(CONFIG.data_path, CONFIG.eval_split_size) + + # setup audio processor + ap = AudioProcessor(**CONFIG.audio) + + # setup model + model_wavernn = setup_wavernn(CONFIG) + + # define train functions + if CONFIG.mode == "mold": + criterion = discretized_mix_logistic_loss + elif CONFIG.mode == "gauss": + criterion = gaussian_loss + elif isinstance(CONFIG.mode, int): + criterion = torch.nn.CrossEntropyLoss() + + if use_cuda: + model_wavernn.cuda() + if isinstance(CONFIG.mode, int): + criterion.cuda() + + optimizer = optim.Adam(model_wavernn.parameters(), lr=CONFIG.lr, weight_decay=0) + scheduler = None + if "lr_scheduler" in CONFIG: + scheduler = getattr(torch.optim.lr_scheduler, CONFIG.lr_scheduler) + scheduler = scheduler(optimizer, **CONFIG.lr_scheduler_params) + # slow start for the first 5 epochs + # lr_lambda = lambda epoch: min(epoch / CONFIG.warmup_steps, 1) + # scheduler = optim.lr_scheduler.LambdaLR(optimizer, lr_lambda) + + # restore any checkpoint + if args.restore_path: + checkpoint = torch.load(args.restore_path, map_location="cpu") + try: + print(" > Restoring Model...") + model_wavernn.load_state_dict(checkpoint["model"]) + print(" > Restoring Optimizer...") + optimizer.load_state_dict(checkpoint["optimizer"]) + if "scheduler" in checkpoint: + print(" > Restoring Generator LR Scheduler...") + scheduler.load_state_dict(checkpoint["scheduler"]) + scheduler.optimizer = optimizer + # TODO: fix resetting restored optimizer lr + # optimizer.load_state_dict(checkpoint["optimizer"]) + except RuntimeError: + # retore only matching layers. 
+ print(" > Partial model initialization...") + model_dict = model_wavernn.state_dict() + model_dict = set_init_dict(model_dict, checkpoint["model"], CONFIG) + model_wavernn.load_state_dict(model_dict) + + print(" > Model restored from step %d" % checkpoint["step"], flush=True) + args.restore_step = checkpoint["step"] + else: + args.restore_step = 0 + + # DISTRIBUTED + # if num_gpus > 1: + # model = apply_gradient_allreduce(model) + + num_parameters = count_parameters(model_wavernn) + print(" > Model has {} parameters".format(num_parameters), flush=True) + + if "best_loss" not in locals(): + best_loss = float("inf") + + global_step = args.restore_step + for epoch in range(0, CONFIG.epochs): + c_logger.print_epoch_start(epoch, CONFIG.epochs) + _, global_step = train( + model_wavernn, optimizer, criterion, scheduler, ap, global_step, epoch + ) + eval_avg_loss_dict = evaluate(model_wavernn, criterion, ap, global_step, epoch) + c_logger.print_epoch_end(epoch, eval_avg_loss_dict) + target_loss = eval_avg_loss_dict["avg_model_loss"] + best_loss = save_best_model( + target_loss, + best_loss, + model_wavernn, + optimizer, + scheduler, + None, + None, + None, + global_step, + epoch, + OUT_PATH, + model_losses=eval_avg_loss_dict, + ) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--continue_path", + type=str, + help='Training output folder to continue training. Use to continue a training. If it is used, "config_path" is ignored.', + default="", + required="--config_path" not in sys.argv, + ) + parser.add_argument( + "--restore_path", + type=str, + help="Model file to be restored. Use to finetune a model.", + default="", + ) + parser.add_argument( + "--config_path", + type=str, + help="Path to config file for training.", + required="--continue_path" not in sys.argv, + ) + parser.add_argument( + "--debug", + type=bool, + default=False, + help="Do not verify commit integrity to run training.", + ) + + # DISTRUBUTED + parser.add_argument( + "--rank", + type=int, + default=0, + help="DISTRIBUTED: process rank for distributed training.", + ) + parser.add_argument( + "--group_id", type=str, default="", help="DISTRIBUTED: process group id." 
+ ) + args = parser.parse_args() + + if args.continue_path != "": + args.output_path = args.continue_path + args.config_path = os.path.join(args.continue_path, "config.json") + list_of_files = glob.glob( + args.continue_path + "/*.pth.tar" + ) # * means all if need specific format then *.csv + latest_model_file = max(list_of_files, key=os.path.getctime) + args.restore_path = latest_model_file + print(f" > Training continues for {args.restore_path}") + + # setup output paths and read configs + CONFIG = load_config(args.config_path) + # check_config(c) + _ = os.path.dirname(os.path.realpath(__file__)) + + OUT_PATH = args.continue_path + if args.continue_path == "": + OUT_PATH = create_experiment_folder( + CONFIG.output_path, CONFIG.run_name, args.debug + ) + + AUDIO_PATH = os.path.join(OUT_PATH, "test_audios") + + c_logger = ConsoleLogger() + + if args.rank == 0: + os.makedirs(AUDIO_PATH, exist_ok=True) + new_fields = {} + if args.restore_path: + new_fields["restore_path"] = args.restore_path + new_fields["github_branch"] = get_git_branch() + copy_config_file( + args.config_path, os.path.join(OUT_PATH, "config.json"), new_fields + ) + os.chmod(AUDIO_PATH, 0o775) + os.chmod(OUT_PATH, 0o775) + + LOG_DIR = OUT_PATH + tb_logger = TensorboardLogger(LOG_DIR, model_name="VOCODER") + + # write model desc to tensorboard + tb_logger.tb_add_text("model-description", CONFIG["run_description"], 0) + + try: + main(args) + except KeyboardInterrupt: + remove_experiment_folder(OUT_PATH) + try: + sys.exit(0) + except SystemExit: + os._exit(0) # pylint: disable=protected-access + except Exception: # pylint: disable=broad-except + remove_experiment_folder(OUT_PATH) + traceback.print_exc() + sys.exit(1) diff --git a/TTS/vocoder/configs/wavernn_config.json b/TTS/vocoder/configs/wavernn_config.json new file mode 100644 index 00000000..f7e5d99f --- /dev/null +++ b/TTS/vocoder/configs/wavernn_config.json @@ -0,0 +1,95 @@ +{ + "model": "wavernn", + "run_name": "wavernn_test", + "run_description": "wavernn_test training", + + // AUDIO PARAMETERS + "audio":{ + "fft_size": 1024, // number of stft frequency levels. Size of the linear spectogram frame. + "win_length": 1024, // stft window length in ms. + "hop_length": 256, // stft window hop-lengh in ms. + "frame_length_ms": null, // stft window length in ms.If null, 'win_length' is used. + "frame_shift_ms": null, // stft window hop-lengh in ms. If null, 'hop_length' is used. + + // Audio processing parameters + "sample_rate": 22050, // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled. + "preemphasis": 0.98, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis. + "ref_level_db": 20, // reference level db, theoretically 20db is the sound of air. + + // Silence trimming + "do_trim_silence": false,// enable trimming of slience of audio as you load it. LJspeech (false), TWEB (false), Nancy (true) + "trim_db": 60, // threshold for timming silence. Set this according to your dataset. + + // MelSpectrogram parameters + "num_mels": 80, // size of the mel spec frame. + "mel_fmin": 40.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!! + "mel_fmax": 8000.0, // maximum freq level for mel-spec. Tune for dataset!! + "spec_gain": 20.0, // scaler value appplied after log transform of spectrogram. + + // Normalization parameters + "signal_norm": true, // normalize spec values. 
Mean-Var normalization if 'stats_path' is defined otherwise range normalization defined by the other params. + "min_level_db": -100, // lower bound for normalization + "symmetric_norm": true, // move normalization to range [-1, 1] + "max_norm": 4.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm] + "clip_norm": true, // clip normalized values into the range. + "stats_path": null // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based notmalization is used and other normalization params are ignored + }, + + // Generating / Synthesizing + "batched": true, + "target_samples": 11000, // target number of samples to be generated in each batch entry + "overlap_samples": 550, // number of samples for crossfading between batches + + // DISTRIBUTED TRAINING + // "distributed":{ + // "backend": "nccl", + // "url": "tcp:\/\/localhost:54321" + // }, + + // MODEL PARAMETERS + "use_aux_net": true, + "use_upsample_net": true, + "upsample_factors": [4, 8, 8], // this needs to correctly factorise hop_length + "seq_len": 1280, // has to be devideable by hop_length + "mode": "mold", // mold [string], gauss [string], bits [int] + "mulaw": false, // apply mulaw if mode is bits + "padding": 2, // pad the input for resnet to see wider input length + + // DATASET + "data_path": "/media/alexander/LinuxFS/SpeechData/GothicSpeech/NPC_Speech/", // path containing training wav files + "feature_path": "/media/alexander/LinuxFS/SpeechData/GothicSpeech/NPC_Speech_Computed/mel/", // path containing extracted features .npy (mels / quant) + + // TRAINING + "batch_size": 32, // Batch size for training. Lower values than 32 might cause hard to learn attention. + "epochs": 10000, // total number of epochs to train. + "warmup_steps": 10, + + // VALIDATION + "run_eval": true, + "test_delay_epochs": 10, // early testing only wastes computation time. + + // OPTIMIZER + "grad_clip": 4, // apply gradient clipping if > 0 + "lr_scheduler": "MultiStepLR", // one of the schedulers from https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate + "lr_scheduler_params": { + "gamma": 0.5, + "milestones": [200000, 400000, 600000] + }, + "lr": 1e-4, // initial learning rate + + // TENSORBOARD and LOGGING + "print_step": 25, // Number of steps to log traning on console. + "print_eval": false, // If True, it prints loss values for each step in eval run. + "save_step": 25000, // Number of training steps expected to plot training stats on TB and save model checkpoints. + "checkpoint": true, // If true, it saves checkpoints per "save_step" + "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging. + + // DATA LOADING + "num_loader_workers": 4, // number of training data loader processes. Don't set it too big. 4-8 are good values. + "num_val_loader_workers": 4, // number of evaluation data loader processes. 
+ "eval_split_size": 50, // number of samples for testing + + // PATHS + "output_path": "/media/alexander/LinuxFS/Projects/wavernn/Trainings/" +} + diff --git a/TTS/vocoder/datasets/preprocess.py b/TTS/vocoder/datasets/preprocess.py index be60c13a..a5365686 100644 --- a/TTS/vocoder/datasets/preprocess.py +++ b/TTS/vocoder/datasets/preprocess.py @@ -23,8 +23,12 @@ def load_wav_data(data_path, eval_split_size): def load_wav_feat_data(data_path, feat_path, eval_split_size): - wav_paths = sorted(find_wav_files(data_path)) - feat_paths = sorted(find_feat_files(feat_path)) + wav_paths = find_wav_files(data_path) + feat_paths = find_feat_files(feat_path) + + wav_paths.sort(key=lambda x: Path(x).stem) + feat_paths.sort(key=lambda x: Path(x).stem) + assert len(wav_paths) == len(feat_paths) for wav, feat in zip(wav_paths, feat_paths): wav_name = Path(wav).stem diff --git a/TTS/vocoder/utils/generic_utils.py b/TTS/vocoder/utils/generic_utils.py index 89dc68fb..365d0e11 100644 --- a/TTS/vocoder/utils/generic_utils.py +++ b/TTS/vocoder/utils/generic_utils.py @@ -41,6 +41,26 @@ def to_camel(text): text = text.capitalize() return re.sub(r'(?!^)_([a-zA-Z])', lambda m: m.group(1).upper(), text) +def setup_wavernn(c): + print(" > Model: {}".format(c.model)) + MyModel = importlib.import_module('TTS.vocoder.models.wavernn') + MyModel = getattr(MyModel, "WaveRNN") + model = MyModel( + rnn_dims=512, + fc_dims=512, + mode=c.mode, + mulaw=c.mulaw, + pad=c.padding, + use_aux_net=c.use_aux_net, + use_upsample_net=c.use_upsample_net, + upsample_factors=c.upsample_factors, + feat_dims=80, + compute_dims=128, + res_out_dims=128, + res_blocks=10, + hop_length=c.audio['hop_length'], + sample_rate=c.audio['sample_rate']) + return model def setup_generator(c): print(" > Generator Model: {}".format(c.generator_model)) From 72bd90b497f4e406b3e93ab4a0e77afa2e890e31 Mon Sep 17 00:00:00 2001 From: Alex K Date: Thu, 15 Oct 2020 19:15:53 +0200 Subject: [PATCH 03/98] wavernn stuff... --- TTS/vocoder/datasets/wavernn_dataset.py | 96 +++++ TTS/vocoder/models/wavernn.py | 485 ++++++++++++++++++++++++ TTS/vocoder/utils/distribution.py | 155 ++++++++ 3 files changed, 736 insertions(+) create mode 100644 TTS/vocoder/datasets/wavernn_dataset.py create mode 100644 TTS/vocoder/models/wavernn.py create mode 100644 TTS/vocoder/utils/distribution.py diff --git a/TTS/vocoder/datasets/wavernn_dataset.py b/TTS/vocoder/datasets/wavernn_dataset.py new file mode 100644 index 00000000..b5a7fdad --- /dev/null +++ b/TTS/vocoder/datasets/wavernn_dataset.py @@ -0,0 +1,96 @@ +import os +import glob +import torch +import numpy as np +from torch.utils.data import Dataset + + +class WaveRNNDataset(Dataset): + """ + WaveRNN Dataset searchs for all the wav files under root path + and converts them to acoustic features on the fly. 
+ """ + + def __init__( + self, + ap, + items, + seq_len, + hop_len, + pad, + mode, + is_training=True, + return_segments=True, + use_cache=False, + verbose=False, + ): + + self.ap = ap + self.item_list = items + self.seq_len = seq_len + self.hop_len = hop_len + self.pad = pad + self.mode = mode + self.is_training = is_training + self.return_segments = return_segments + self.use_cache = use_cache + self.verbose = verbose + + # wav_files = [f"{self.path}wavs/{file}.wav" for file in self.metadata] + # with Pool(4) as pool: + # self.wav_cache = pool.map(self.ap.load_wav, wav_files) + + def __len__(self): + return len(self.item_list) + + def __getitem__(self, index): + item = self.load_item(index) + return item + + def load_item(self, index): + wavpath, feat_path = self.item_list[index] + m = np.load(feat_path.replace("/quant/", "/mel/")) + # x = self.wav_cache[index] + if 5 > m.shape[-1]: + print(" [!] Instance is too short! : {}".format(wavpath)) + self.item_list[index] = self.item_list[index + 1] + feat_path = self.item_list[index] + m = np.load(feat_path.replace("/quant/", "/mel/")) + if self.mode in ["gauss", "mold"]: + x = self.ap.load_wav(wavpath) + elif isinstance(self.mode, int): + x = np.load(feat_path.replace("/mel/", "/quant/")) + else: + raise RuntimeError("Unknown dataset mode - ", self.mode) + return m, x + + def collate(self, batch): + mel_win = self.seq_len // self.hop_len + 2 * self.pad + max_offsets = [x[0].shape[-1] - (mel_win + 2 * self.pad) for x in batch] + mel_offsets = [np.random.randint(0, offset) for offset in max_offsets] + sig_offsets = [(offset + self.pad) * self.hop_len for offset in mel_offsets] + + mels = [ + x[0][:, mel_offsets[i] : mel_offsets[i] + mel_win] + for i, x in enumerate(batch) + ] + + coarse = [ + x[1][sig_offsets[i] : sig_offsets[i] + self.seq_len + 1] + for i, x in enumerate(batch) + ] + + mels = np.stack(mels).astype(np.float32) + if self.mode in ["gauss", "mold"]: + coarse = np.stack(coarse).astype(np.float32) + coarse = torch.FloatTensor(coarse) + x_input = coarse[:, : self.seq_len] + elif isinstance(self.mode, int): + coarse = np.stack(coarse).astype(np.int64) + coarse = torch.LongTensor(coarse) + x_input = ( + 2 * coarse[:, : self.seq_len].float() / (2 ** self.mode - 1.0) - 1.0 + ) + y_coarse = coarse[:, 1:] + mels = torch.FloatTensor(mels) + return x_input, mels, y_coarse diff --git a/TTS/vocoder/models/wavernn.py b/TTS/vocoder/models/wavernn.py new file mode 100644 index 00000000..e1c4365f --- /dev/null +++ b/TTS/vocoder/models/wavernn.py @@ -0,0 +1,485 @@ +import sys +import torch +import torch.nn as nn +import numpy as np +import torch.nn.functional as F +import time + +# fix this +from TTS.utils.audio import AudioProcessor as ap +from TTS.vocoder.utils.distribution import ( + sample_from_gaussian, + sample_from_discretized_mix_logistic, +) + + +def stream(string, variables): + sys.stdout.write(f"\r{string}" % variables) + + +class ResBlock(nn.Module): + def __init__(self, dims): + super().__init__() + self.conv1 = nn.Conv1d(dims, dims, kernel_size=1, bias=False) + self.conv2 = nn.Conv1d(dims, dims, kernel_size=1, bias=False) + self.batch_norm1 = nn.BatchNorm1d(dims) + self.batch_norm2 = nn.BatchNorm1d(dims) + + def forward(self, x): + residual = x + x = self.conv1(x) + x = self.batch_norm1(x) + x = F.relu(x) + x = self.conv2(x) + x = self.batch_norm2(x) + return x + residual + + +class MelResNet(nn.Module): + def __init__(self, res_blocks, in_dims, compute_dims, res_out_dims, pad): + super().__init__() + k_size = pad * 2 + 1 + 
self.conv_in = nn.Conv1d(in_dims, compute_dims, kernel_size=k_size, bias=False) + self.batch_norm = nn.BatchNorm1d(compute_dims) + self.layers = nn.ModuleList() + for i in range(res_blocks): + self.layers.append(ResBlock(compute_dims)) + self.conv_out = nn.Conv1d(compute_dims, res_out_dims, kernel_size=1) + + def forward(self, x): + x = self.conv_in(x) + x = self.batch_norm(x) + x = F.relu(x) + for f in self.layers: + x = f(x) + x = self.conv_out(x) + return x + + +class Stretch2d(nn.Module): + def __init__(self, x_scale, y_scale): + super().__init__() + self.x_scale = x_scale + self.y_scale = y_scale + + def forward(self, x): + b, c, h, w = x.size() + x = x.unsqueeze(-1).unsqueeze(3) + x = x.repeat(1, 1, 1, self.y_scale, 1, self.x_scale) + return x.view(b, c, h * self.y_scale, w * self.x_scale) + + +class UpsampleNetwork(nn.Module): + def __init__( + self, + feat_dims, + upsample_scales, + compute_dims, + res_blocks, + res_out_dims, + pad, + use_aux_net, + ): + super().__init__() + self.total_scale = np.cumproduct(upsample_scales)[-1] + self.indent = pad * self.total_scale + self.use_aux_net = use_aux_net + if use_aux_net: + self.resnet = MelResNet( + res_blocks, feat_dims, compute_dims, res_out_dims, pad + ) + self.resnet_stretch = Stretch2d(self.total_scale, 1) + self.up_layers = nn.ModuleList() + for scale in upsample_scales: + k_size = (1, scale * 2 + 1) + padding = (0, scale) + stretch = Stretch2d(scale, 1) + conv = nn.Conv2d(1, 1, kernel_size=k_size, padding=padding, bias=False) + conv.weight.data.fill_(1.0 / k_size[1]) + self.up_layers.append(stretch) + self.up_layers.append(conv) + + def forward(self, m): + if self.use_aux_net: + aux = self.resnet(m).unsqueeze(1) + aux = self.resnet_stretch(aux) + aux = aux.squeeze(1) + aux = aux.transpose(1, 2) + else: + aux = None + m = m.unsqueeze(1) + for f in self.up_layers: + m = f(m) + m = m.squeeze(1)[:, :, self.indent : -self.indent] + return m.transpose(1, 2), aux + + +class Upsample(nn.Module): + def __init__( + self, scale, pad, res_blocks, feat_dims, compute_dims, res_out_dims, use_aux_net + ): + super().__init__() + self.scale = scale + self.pad = pad + self.indent = pad * scale + self.use_aux_net = use_aux_net + self.resnet = MelResNet(res_blocks, feat_dims, compute_dims, res_out_dims, pad) + + def forward(self, m): + if self.use_aux_net: + aux = self.resnet(m) + aux = torch.nn.functional.interpolate( + aux, scale_factor=self.scale, mode="linear", align_corners=True + ) + aux = aux.transpose(1, 2) + else: + aux = None + m = torch.nn.functional.interpolate( + m, scale_factor=self.scale, mode="linear", align_corners=True + ) + m = m[:, :, self.indent : -self.indent] + m = m * 0.045 # empirically found + + return m.transpose(1, 2), aux + + +class WaveRNN(nn.Module): + def __init__( + self, + rnn_dims, + fc_dims, + mode, + mulaw, + pad, + use_aux_net, + use_upsample_net, + upsample_factors, + feat_dims, + compute_dims, + res_out_dims, + res_blocks, + hop_length, + sample_rate, + ): + super().__init__() + self.mode = mode + self.mulaw = mulaw + self.pad = pad + self.use_upsample_net = use_upsample_net + self.use_aux_net = use_aux_net + if isinstance(self.mode, int): + self.n_classes = 2 ** self.mode + elif self.mode == "mold": + self.n_classes = 3 * 10 + elif self.mode == "gauss": + self.n_classes = 2 + else: + raise RuntimeError(" > Unknown training mode") + + self.rnn_dims = rnn_dims + self.aux_dims = res_out_dims // 4 + self.hop_length = hop_length + self.sample_rate = sample_rate + + if self.use_upsample_net: + assert ( + 
np.cumproduct(upsample_factors)[-1] == self.hop_length + ), " [!] upsample scales needs to be equal to hop_length" + self.upsample = UpsampleNetwork( + feat_dims, + upsample_factors, + compute_dims, + res_blocks, + res_out_dims, + pad, + use_aux_net, + ) + else: + self.upsample = Upsample( + hop_length, + pad, + res_blocks, + feat_dims, + compute_dims, + res_out_dims, + use_aux_net, + ) + if self.use_aux_net: + self.I = nn.Linear(feat_dims + self.aux_dims + 1, rnn_dims) + self.rnn1 = nn.GRU(rnn_dims, rnn_dims, batch_first=True) + self.rnn2 = nn.GRU(rnn_dims + self.aux_dims, rnn_dims, batch_first=True) + self.fc1 = nn.Linear(rnn_dims + self.aux_dims, fc_dims) + self.fc2 = nn.Linear(fc_dims + self.aux_dims, fc_dims) + self.fc3 = nn.Linear(fc_dims, self.n_classes) + else: + self.I = nn.Linear(feat_dims + 1, rnn_dims) + self.rnn1 = nn.GRU(rnn_dims, rnn_dims, batch_first=True) + self.rnn2 = nn.GRU(rnn_dims, rnn_dims, batch_first=True) + self.fc1 = nn.Linear(rnn_dims, fc_dims) + self.fc2 = nn.Linear(fc_dims, fc_dims) + self.fc3 = nn.Linear(fc_dims, self.n_classes) + + def forward(self, x, mels): + bsize = x.size(0) + h1 = torch.zeros(1, bsize, self.rnn_dims).cuda() + h2 = torch.zeros(1, bsize, self.rnn_dims).cuda() + mels, aux = self.upsample(mels) + + if self.use_aux_net: + aux_idx = [self.aux_dims * i for i in range(5)] + a1 = aux[:, :, aux_idx[0] : aux_idx[1]] + a2 = aux[:, :, aux_idx[1] : aux_idx[2]] + a3 = aux[:, :, aux_idx[2] : aux_idx[3]] + a4 = aux[:, :, aux_idx[3] : aux_idx[4]] + + x = ( + torch.cat([x.unsqueeze(-1), mels, a1], dim=2) + if self.use_aux_net + else torch.cat([x.unsqueeze(-1), mels], dim=2) + ) + x = self.I(x) + res = x + self.rnn1.flatten_parameters() + x, _ = self.rnn1(x, h1) + + x = x + res + res = x + x = torch.cat([x, a2], dim=2) if self.use_aux_net else x + self.rnn2.flatten_parameters() + x, _ = self.rnn2(x, h2) + + x = x + res + x = torch.cat([x, a3], dim=2) if self.use_aux_net else x + x = F.relu(self.fc1(x)) + + x = torch.cat([x, a4], dim=2) if self.use_aux_net else x + x = F.relu(self.fc2(x)) + return self.fc3(x) + + def generate(self, mels, batched, target, overlap): + + self.eval() + output = [] + start = time.time() + rnn1 = self.get_gru_cell(self.rnn1) + rnn2 = self.get_gru_cell(self.rnn2) + + with torch.no_grad(): + + mels = torch.FloatTensor(mels).cuda().unsqueeze(0) + wave_len = (mels.size(-1) - 1) * self.hop_length + mels = self.pad_tensor(mels.transpose(1, 2), pad=self.pad, side="both") + mels, aux = self.upsample(mels.transpose(1, 2)) + + if batched: + mels = self.fold_with_overlap(mels, target, overlap) + if aux is not None: + aux = self.fold_with_overlap(aux, target, overlap) + + b_size, seq_len, _ = mels.size() + + h1 = torch.zeros(b_size, self.rnn_dims).cuda() + h2 = torch.zeros(b_size, self.rnn_dims).cuda() + x = torch.zeros(b_size, 1).cuda() + + if self.use_aux_net: + d = self.aux_dims + aux_split = [aux[:, :, d * i : d * (i + 1)] for i in range(4)] + + for i in range(seq_len): + + m_t = mels[:, i, :] + + if self.use_aux_net: + a1_t, a2_t, a3_t, a4_t = (a[:, i, :] for a in aux_split) + + x = ( + torch.cat([x, m_t, a1_t], dim=1) + if self.use_aux_net + else torch.cat([x, m_t], dim=1) + ) + x = self.I(x) + h1 = rnn1(x, h1) + + x = x + h1 + inp = torch.cat([x, a2_t], dim=1) if self.use_aux_net else x + h2 = rnn2(inp, h2) + + x = x + h2 + x = torch.cat([x, a3_t], dim=1) if self.use_aux_net else x + x = F.relu(self.fc1(x)) + + x = torch.cat([x, a4_t], dim=1) if self.use_aux_net else x + x = F.relu(self.fc2(x)) + + logits = self.fc3(x) + + if self.mode 
== "mold": + sample = sample_from_discretized_mix_logistic( + logits.unsqueeze(0).transpose(1, 2) + ) + output.append(sample.view(-1)) + x = sample.transpose(0, 1).cuda() + elif self.mode == "gauss": + sample = sample_from_gaussian(logits.unsqueeze(0).transpose(1, 2)) + output.append(sample.view(-1)) + x = sample.transpose(0, 1).cuda() + elif isinstance(self.mode, int): + posterior = F.softmax(logits, dim=1) + distrib = torch.distributions.Categorical(posterior) + + sample = 2 * distrib.sample().float() / (self.n_classes - 1.0) - 1.0 + output.append(sample) + x = sample.unsqueeze(-1) + else: + raise RuntimeError("Unknown model mode value - ", self.mode) + + if i % 100 == 0: + self.gen_display(i, seq_len, b_size, start) + + output = torch.stack(output).transpose(0, 1) + output = output.cpu().numpy() + output = output.astype(np.float64) + + if batched: + output = self.xfade_and_unfold(output, target, overlap) + else: + output = output[0] + + if self.mulaw and isinstance(self.mode, int): + output = ap.mulaw_decode(output, self.mode) + + # Fade-out at the end to avoid signal cutting out suddenly + fade_out = np.linspace(1, 0, 20 * self.hop_length) + output = output[:wave_len] + output[-20 * self.hop_length :] *= fade_out + + self.train() + return output + + def gen_display(self, i, seq_len, b_size, start): + gen_rate = (i + 1) / (time.time() - start) * b_size / 1000 + realtime_ratio = gen_rate * 1000 / self.sample_rate + stream( + "%i/%i -- batch_size: %i -- gen_rate: %.1f kHz -- x_realtime: %.1f ", + (i * b_size, seq_len * b_size, b_size, gen_rate, realtime_ratio), + ) + + def get_gru_cell(self, gru): + gru_cell = nn.GRUCell(gru.input_size, gru.hidden_size) + gru_cell.weight_hh.data = gru.weight_hh_l0.data + gru_cell.weight_ih.data = gru.weight_ih_l0.data + gru_cell.bias_hh.data = gru.bias_hh_l0.data + gru_cell.bias_ih.data = gru.bias_ih_l0.data + return gru_cell + + def pad_tensor(self, x, pad, side="both"): + # NB - this is just a quick method i need right now + # i.e., it won't generalise to other shapes/dims + b, t, c = x.size() + total = t + 2 * pad if side == "both" else t + pad + padded = torch.zeros(b, total, c).cuda() + if side == "before" or side == "both": + padded[:, pad : pad + t, :] = x + elif side == "after": + padded[:, :t, :] = x + return padded + + def fold_with_overlap(self, x, target, overlap): + + """Fold the tensor with overlap for quick batched inference. + Overlap will be used for crossfading in xfade_and_unfold() + Args: + x (tensor) : Upsampled conditioning features. + shape=(1, timesteps, features) + target (int) : Target timesteps for each index of batch + overlap (int) : Timesteps for both xfade and rnn warmup + Return: + (tensor) : shape=(num_folds, target + 2 * overlap, features) + Details: + x = [[h1, h2, ... 
hn]] + Where each h is a vector of conditioning features + Eg: target=2, overlap=1 with x.size(1)=10 + folded = [[h1, h2, h3, h4], + [h4, h5, h6, h7], + [h7, h8, h9, h10]] + """ + + _, total_len, features = x.size() + + # Calculate variables needed + num_folds = (total_len - overlap) // (target + overlap) + extended_len = num_folds * (overlap + target) + overlap + remaining = total_len - extended_len + + # Pad if some time steps poking out + if remaining != 0: + num_folds += 1 + padding = target + 2 * overlap - remaining + x = self.pad_tensor(x, padding, side="after") + + folded = torch.zeros(num_folds, target + 2 * overlap, features).cuda() + + # Get the values for the folded tensor + for i in range(num_folds): + start = i * (target + overlap) + end = start + target + 2 * overlap + folded[i] = x[:, start:end, :] + + return folded + + def xfade_and_unfold(self, y, target, overlap): + + """Applies a crossfade and unfolds into a 1d array. + Args: + y (ndarry) : Batched sequences of audio samples + shape=(num_folds, target + 2 * overlap) + dtype=np.float64 + overlap (int) : Timesteps for both xfade and rnn warmup + Return: + (ndarry) : audio samples in a 1d array + shape=(total_len) + dtype=np.float64 + Details: + y = [[seq1], + [seq2], + [seq3]] + Apply a gain envelope at both ends of the sequences + y = [[seq1_in, seq1_target, seq1_out], + [seq2_in, seq2_target, seq2_out], + [seq3_in, seq3_target, seq3_out]] + Stagger and add up the groups of samples: + [seq1_in, seq1_target, (seq1_out + seq2_in), seq2_target, ...] + """ + + num_folds, length = y.shape + target = length - 2 * overlap + total_len = num_folds * (target + overlap) + overlap + + # Need some silence for the rnn warmup + silence_len = overlap // 2 + fade_len = overlap - silence_len + silence = np.zeros((silence_len), dtype=np.float64) + + # Equal power crossfade + t = np.linspace(-1, 1, fade_len, dtype=np.float64) + fade_in = np.sqrt(0.5 * (1 + t)) + fade_out = np.sqrt(0.5 * (1 - t)) + + # Concat the silence to the fades + fade_in = np.concatenate([silence, fade_in]) + fade_out = np.concatenate([fade_out, silence]) + + # Apply the gain to the overlap samples + y[:, :overlap] *= fade_in + y[:, -overlap:] *= fade_out + + unfolded = np.zeros((total_len), dtype=np.float64) + + # Loop to add up all the samples + for i in range(num_folds): + start = i * (target + overlap) + end = start + target + 2 * overlap + unfolded[start:end] += y[i] + + return unfolded diff --git a/TTS/vocoder/utils/distribution.py b/TTS/vocoder/utils/distribution.py new file mode 100644 index 00000000..bfcbdd3f --- /dev/null +++ b/TTS/vocoder/utils/distribution.py @@ -0,0 +1,155 @@ +import numpy as np +import math +import torch +from torch.distributions.normal import Normal +import torch.nn.functional as F + + +def gaussian_loss(y_hat, y, log_std_min=-7.0): + assert y_hat.dim() == 3 + assert y_hat.size(2) == 2 + mean = y_hat[:, :, :1] + log_std = torch.clamp(y_hat[:, :, 1:], min=log_std_min) + # TODO: replace with pytorch dist + log_probs = -0.5 * (- math.log(2.0 * math.pi) - 2. 
* log_std - torch.pow(y - mean, 2) * torch.exp((-2.0 * log_std))) + return log_probs.squeeze().mean() + + +def sample_from_gaussian(y_hat, log_std_min=-7.0, scale_factor=1.0): + assert y_hat.size(2) == 2 + mean = y_hat[:, :, :1] + log_std = torch.clamp(y_hat[:, :, 1:], min=log_std_min) + dist = Normal(mean, torch.exp(log_std), ) + sample = dist.sample() + sample = torch.clamp(torch.clamp(sample, min=-scale_factor), max=scale_factor) + del dist + return sample + + +def log_sum_exp(x): + """ numerically stable log_sum_exp implementation that prevents overflow """ + # TF ordering + axis = len(x.size()) - 1 + m, _ = torch.max(x, dim=axis) + m2, _ = torch.max(x, dim=axis, keepdim=True) + return m + torch.log(torch.sum(torch.exp(x - m2), dim=axis)) + + +# It is adapted from https://github.com/r9y9/wavenet_vocoder/blob/master/wavenet_vocoder/mixture.py +def discretized_mix_logistic_loss(y_hat, y, num_classes=65536, + log_scale_min=None, reduce=True): + if log_scale_min is None: + log_scale_min = float(np.log(1e-14)) + y_hat = y_hat.permute(0,2,1) + assert y_hat.dim() == 3 + assert y_hat.size(1) % 3 == 0 + nr_mix = y_hat.size(1) // 3 + + # (B x T x C) + y_hat = y_hat.transpose(1, 2) + + # unpack parameters. (B, T, num_mixtures) x 3 + logit_probs = y_hat[:, :, :nr_mix] + means = y_hat[:, :, nr_mix:2 * nr_mix] + log_scales = torch.clamp(y_hat[:, :, 2 * nr_mix:3 * nr_mix], min=log_scale_min) + + # B x T x 1 -> B x T x num_mixtures + y = y.expand_as(means) + + centered_y = y - means + inv_stdv = torch.exp(-log_scales) + plus_in = inv_stdv * (centered_y + 1. / (num_classes - 1)) + cdf_plus = torch.sigmoid(plus_in) + min_in = inv_stdv * (centered_y - 1. / (num_classes - 1)) + cdf_min = torch.sigmoid(min_in) + + # log probability for edge case of 0 (before scaling) + # equivalent: torch.log(F.sigmoid(plus_in)) + log_cdf_plus = plus_in - F.softplus(plus_in) + + # log probability for edge case of 255 (before scaling) + # equivalent: (1 - F.sigmoid(min_in)).log() + log_one_minus_cdf_min = -F.softplus(min_in) + + # probability for all other cases + cdf_delta = cdf_plus - cdf_min + + mid_in = inv_stdv * centered_y + # log probability in the center of the bin, to be used in extreme cases + # (not actually used in our code) + log_pdf_mid = mid_in - log_scales - 2. * F.softplus(mid_in) + + # tf equivalent + """ + log_probs = tf.where(x < -0.999, log_cdf_plus, + tf.where(x > 0.999, log_one_minus_cdf_min, + tf.where(cdf_delta > 1e-5, + tf.log(tf.maximum(cdf_delta, 1e-12)), + log_pdf_mid - np.log(127.5)))) + """ + # TODO: cdf_delta <= 1e-5 actually can happen. How can we choose the value + # for num_classes=65536 case? 1e-7? not sure.. + inner_inner_cond = (cdf_delta > 1e-5).float() + + inner_inner_out = inner_inner_cond * \ + torch.log(torch.clamp(cdf_delta, min=1e-12)) + \ + (1. - inner_inner_cond) * (log_pdf_mid - np.log((num_classes - 1) / 2)) + inner_cond = (y > 0.999).float() + inner_out = inner_cond * log_one_minus_cdf_min + (1. - inner_cond) * inner_inner_out + cond = (y < -0.999).float() + log_probs = cond * log_cdf_plus + (1. - cond) * inner_out + + log_probs = log_probs + F.log_softmax(logit_probs, -1) + + if reduce: + return -torch.mean(log_sum_exp(log_probs)) + else: + return -log_sum_exp(log_probs).unsqueeze(-1) + + +def sample_from_discretized_mix_logistic(y, log_scale_min=None): + """ + Sample from discretized mixture of logistic distributions + Args: + y (Tensor): B x C x T + log_scale_min (float): Log scale minimum value + Returns: + Tensor: sample in range of [-1, 1]. 
+ """ + if log_scale_min is None: + log_scale_min = float(np.log(1e-14)) + assert y.size(1) % 3 == 0 + nr_mix = y.size(1) // 3 + + # B x T x C + y = y.transpose(1, 2) + logit_probs = y[:, :, :nr_mix] + + # sample mixture indicator from softmax + temp = logit_probs.data.new(logit_probs.size()).uniform_(1e-5, 1.0 - 1e-5) + temp = logit_probs.data - torch.log(- torch.log(temp)) + _, argmax = temp.max(dim=-1) + + # (B, T) -> (B, T, nr_mix) + one_hot = to_one_hot(argmax, nr_mix) + # select logistic parameters + means = torch.sum(y[:, :, nr_mix:2 * nr_mix] * one_hot, dim=-1) + log_scales = torch.clamp(torch.sum( + y[:, :, 2 * nr_mix:3 * nr_mix] * one_hot, dim=-1), min=log_scale_min) + # sample from logistic & clip to interval + # we don't actually round to the nearest 8bit value when sampling + u = means.data.new(means.size()).uniform_(1e-5, 1.0 - 1e-5) + x = means + torch.exp(log_scales) * (torch.log(u) - torch.log(1. - u)) + + x = torch.clamp(torch.clamp(x, min=-1.), max=1.) + + return x + + +def to_one_hot(tensor, n, fill_with=1.): + # we perform one hot encore with respect to the last axis + one_hot = torch.FloatTensor(tensor.size() + (n,)).zero_() + if tensor.is_cuda: + one_hot = one_hot.cuda() + one_hot.scatter_(len(tensor.size()), tensor.unsqueeze(-1), fill_with) + return one_hot From 9a120f28edbb47f771db0b9e48be03a504e895d3 Mon Sep 17 00:00:00 2001 From: sanjaesc Date: Fri, 16 Oct 2020 21:19:51 +0200 Subject: [PATCH 04/98] some minor changes to wavernn --- TTS/bin/train_wavernn_vocoder.py | 31 ++++--- TTS/vocoder/configs/wavernn_config.json | 8 +- TTS/vocoder/datasets/wavernn_dataset.py | 11 +-- TTS/vocoder/utils/generic_utils.py | 112 +++++++++++++----------- 4 files changed, 82 insertions(+), 80 deletions(-) diff --git a/TTS/bin/train_wavernn_vocoder.py b/TTS/bin/train_wavernn_vocoder.py index 2f77ab57..e2b8057e 100644 --- a/TTS/bin/train_wavernn_vocoder.py +++ b/TTS/bin/train_wavernn_vocoder.py @@ -13,17 +13,13 @@ import torch from torch.utils.data import DataLoader from torch.utils.data.distributed import DistributedSampler - -from TTS.utils.audio import AudioProcessor from TTS.tts.utils.visual import plot_spectrogram +from TTS.utils.audio import AudioProcessor +from TTS.utils.radam import RAdam from TTS.utils.io import copy_config_file, load_config -from TTS.vocoder.datasets.wavernn_dataset import WaveRNNDataset -from TTS.utils.tensorboard_logger import TensorboardLogger -from TTS.vocoder.datasets.preprocess import load_wav_data, load_wav_feat_data -from TTS.vocoder.utils.distribution import discretized_mix_logistic_loss, gaussian_loss -from TTS.vocoder.utils.generic_utils import setup_wavernn from TTS.utils.training import setup_torch_training_env from TTS.utils.console_logger import ConsoleLogger +from TTS.utils.tensorboard_logger import TensorboardLogger from TTS.utils.generic_utils import ( KeepAverage, count_parameters, @@ -32,6 +28,10 @@ from TTS.utils.generic_utils import ( remove_experiment_folder, set_init_dict, ) +from TTS.vocoder.datasets.wavernn_dataset import WaveRNNDataset +from TTS.vocoder.datasets.preprocess import load_wav_data, load_wav_feat_data +from TTS.vocoder.utils.distribution import discretized_mix_logistic_loss, gaussian_loss +from TTS.vocoder.utils.generic_utils import setup_wavernn from TTS.vocoder.utils.io import save_best_model, save_checkpoint @@ -105,9 +105,7 @@ def train(model, optimizer, criterion, scheduler, ap, global_step, epoch): # MODEL TRAINING # ################## y_hat = model(x, m) - y_hat_vis = y_hat # for visualization - # y_hat = 
y_hat.transpose(1, 2) if isinstance(model.mode, int): y_hat = y_hat.transpose(1, 2).unsqueeze(-1) else: @@ -200,8 +198,8 @@ def train(model, optimizer, criterion, scheduler, ap, global_step, epoch): ) # compute spectrograms figures = { - "prediction": plot_spectrogram(predict_mel.T, ap, output_fig=False), - "ground_truth": plot_spectrogram(ground_mel.T, ap, output_fig=False), + "prediction": plot_spectrogram(predict_mel.T), + "ground_truth": plot_spectrogram(ground_mel.T), } tb_logger.tb_train_figures(global_step, figures) end_time = time.time() @@ -237,6 +235,7 @@ def evaluate(model, criterion, ap, global_step, epoch): global_step += 1 y_hat = model(x, m) + y_hat_viz = y_hat # for vizualization if isinstance(model.mode, int): y_hat = y_hat.transpose(1, 2).unsqueeze(-1) else: @@ -266,7 +265,7 @@ def evaluate(model, criterion, ap, global_step, epoch): if epoch > CONFIG.test_delay_epochs: # synthesize a full voice - wav_path = eval_data[random.randrange(0, len(eval_data))][0] + wav_path = train_data[random.randrange(0, len(train_data))][0] wav = ap.load_wav(wav_path) ground_mel = ap.melspectrogram(wav) sample_wav = model.generate( @@ -283,8 +282,8 @@ def evaluate(model, criterion, ap, global_step, epoch): ) # compute spectrograms figures = { - "prediction": plot_spectrogram(predict_mel.T, ap, output_fig=False), - "ground_truth": plot_spectrogram(ground_mel.T, ap, output_fig=False), + "eval/prediction": plot_spectrogram(predict_mel.T), + "eval/ground_truth": plot_spectrogram(ground_mel.T), } tb_logger.tb_eval_figures(global_step, figures) @@ -303,7 +302,6 @@ def main(args): # pylint: disable=redefined-outer-name eval_data, train_data = load_wav_feat_data( CONFIG.data_path, CONFIG.feature_path, CONFIG.eval_split_size ) - eval_data, train_data = eval_data, train_data else: eval_data, train_data = load_wav_data(CONFIG.data_path, CONFIG.eval_split_size) @@ -326,7 +324,8 @@ def main(args): # pylint: disable=redefined-outer-name if isinstance(CONFIG.mode, int): criterion.cuda() - optimizer = optim.Adam(model_wavernn.parameters(), lr=CONFIG.lr, weight_decay=0) + optimizer = RAdam(model_wavernn.parameters(), lr=CONFIG.lr, weight_decay=0) + scheduler = None if "lr_scheduler" in CONFIG: scheduler = getattr(torch.optim.lr_scheduler, CONFIG.lr_scheduler) diff --git a/TTS/vocoder/configs/wavernn_config.json b/TTS/vocoder/configs/wavernn_config.json index f7e5d99f..67503aef 100644 --- a/TTS/vocoder/configs/wavernn_config.json +++ b/TTS/vocoder/configs/wavernn_config.json @@ -1,5 +1,4 @@ { - "model": "wavernn", "run_name": "wavernn_test", "run_description": "wavernn_test training", @@ -54,13 +53,14 @@ "mode": "mold", // mold [string], gauss [string], bits [int] "mulaw": false, // apply mulaw if mode is bits "padding": 2, // pad the input for resnet to see wider input length - + // DATASET + "use_gta": true, // use computed gta features from the tts model "data_path": "/media/alexander/LinuxFS/SpeechData/GothicSpeech/NPC_Speech/", // path containing training wav files - "feature_path": "/media/alexander/LinuxFS/SpeechData/GothicSpeech/NPC_Speech_Computed/mel/", // path containing extracted features .npy (mels / quant) + "feature_path": "/media/alexander/LinuxFS/SpeechData/GothicSpeech/NPC_Speech_Computed/mel/", // path containing computed features .npy (mels / quant) // TRAINING - "batch_size": 32, // Batch size for training. Lower values than 32 might cause hard to learn attention. + "batch_size": 64, // Batch size for training. Lower values than 32 might cause hard to learn attention. 
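A note on the `mode` / `mulaw` settings in the config hunk above: when `mode` is an integer, WaveRNN is trained as a classifier over 2**mode discrete amplitude levels, and setting `"mulaw"` applies mu-law companding before quantization so low-amplitude samples get finer resolution. The repo's own AudioProcessor methods (mulaw_encode, mulaw_decode, quantize) are what the training and preprocessing scripts actually call; the snippet below is only a minimal standalone sketch of the idea, assuming NumPy and a waveform already scaled to [-1, 1], with function and parameter names chosen for illustration only.

    import numpy as np

    def mulaw_compress(x, bits=10):
        # mu-law companding: boost resolution of quiet samples before quantizing
        mu = 2 ** bits - 1
        return np.sign(x) * np.log1p(mu * np.abs(x)) / np.log1p(mu)

    def mulaw_expand(y, bits=10):
        # inverse companding: map a [-1, 1] companded signal back to linear amplitude
        mu = 2 ** bits - 1
        return np.sign(y) * ((1 + mu) ** np.abs(y) - 1) / mu

    def quantize(y, bits=10):
        # map a [-1, 1] signal to integer class labels in [0, 2**bits - 1]
        mu = 2 ** bits - 1
        return np.clip((y + 1.0) * mu / 2, 0, mu).astype(np.int64)

    # round trip on a toy signal (values are illustrative only)
    x = np.sin(np.linspace(0, 2 * np.pi, 8))
    labels = quantize(mulaw_compress(x))
    x_rec = mulaw_expand(2.0 * labels / (2 ** 10 - 1) - 1.0)

With `mode` set to "mold" or "gauss" the network instead predicts parameters of a continuous output distribution (discretized mixture of logistics or a Gaussian, as in TTS/vocoder/utils/distribution.py), so no quantization or mu-law step applies.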
"epochs": 10000, // total number of epochs to train. "warmup_steps": 10, diff --git a/TTS/vocoder/datasets/wavernn_dataset.py b/TTS/vocoder/datasets/wavernn_dataset.py index b5a7fdad..8faf5f3c 100644 --- a/TTS/vocoder/datasets/wavernn_dataset.py +++ b/TTS/vocoder/datasets/wavernn_dataset.py @@ -7,8 +7,7 @@ from torch.utils.data import Dataset class WaveRNNDataset(Dataset): """ - WaveRNN Dataset searchs for all the wav files under root path - and converts them to acoustic features on the fly. + WaveRNN Dataset searchs for all the wav files under root path. """ def __init__( @@ -20,8 +19,6 @@ class WaveRNNDataset(Dataset): pad, mode, is_training=True, - return_segments=True, - use_cache=False, verbose=False, ): @@ -32,14 +29,8 @@ class WaveRNNDataset(Dataset): self.pad = pad self.mode = mode self.is_training = is_training - self.return_segments = return_segments - self.use_cache = use_cache self.verbose = verbose - # wav_files = [f"{self.path}wavs/{file}.wav" for file in self.metadata] - # with Pool(4) as pool: - # self.wav_cache = pool.map(self.ap.load_wav, wav_files) - def __len__(self): return len(self.item_list) diff --git a/TTS/vocoder/utils/generic_utils.py b/TTS/vocoder/utils/generic_utils.py index 365d0e11..c73c5248 100644 --- a/TTS/vocoder/utils/generic_utils.py +++ b/TTS/vocoder/utils/generic_utils.py @@ -39,11 +39,12 @@ def plot_results(y_hat, y, ap, global_step, name_prefix): def to_camel(text): text = text.capitalize() - return re.sub(r'(?!^)_([a-zA-Z])', lambda m: m.group(1).upper(), text) + return re.sub(r"(?!^)_([a-zA-Z])", lambda m: m.group(1).upper(), text) + def setup_wavernn(c): - print(" > Model: {}".format(c.model)) - MyModel = importlib.import_module('TTS.vocoder.models.wavernn') + print(" > Model: WaveRNN") + MyModel = importlib.import_module("TTS.vocoder.models.wavernn") MyModel = getattr(MyModel, "WaveRNN") model = MyModel( rnn_dims=512, @@ -58,98 +59,109 @@ def setup_wavernn(c): compute_dims=128, res_out_dims=128, res_blocks=10, - hop_length=c.audio['hop_length'], - sample_rate=c.audio['sample_rate']) + hop_length=c.audio["hop_length"], + sample_rate=c.audio["sample_rate"], + ) return model + def setup_generator(c): print(" > Generator Model: {}".format(c.generator_model)) - MyModel = importlib.import_module('TTS.vocoder.models.' + - c.generator_model.lower()) + MyModel = importlib.import_module("TTS.vocoder.models." 
+ c.generator_model.lower()) MyModel = getattr(MyModel, to_camel(c.generator_model)) - if c.generator_model in 'melgan_generator': + if c.generator_model in "melgan_generator": model = MyModel( - in_channels=c.audio['num_mels'], + in_channels=c.audio["num_mels"], out_channels=1, proj_kernel=7, base_channels=512, - upsample_factors=c.generator_model_params['upsample_factors'], + upsample_factors=c.generator_model_params["upsample_factors"], res_kernel=3, - num_res_blocks=c.generator_model_params['num_res_blocks']) - if c.generator_model in 'melgan_fb_generator': + num_res_blocks=c.generator_model_params["num_res_blocks"], + ) + if c.generator_model in "melgan_fb_generator": pass - if c.generator_model in 'multiband_melgan_generator': + if c.generator_model in "multiband_melgan_generator": model = MyModel( - in_channels=c.audio['num_mels'], + in_channels=c.audio["num_mels"], out_channels=4, proj_kernel=7, base_channels=384, - upsample_factors=c.generator_model_params['upsample_factors'], + upsample_factors=c.generator_model_params["upsample_factors"], res_kernel=3, - num_res_blocks=c.generator_model_params['num_res_blocks']) - if c.generator_model in 'fullband_melgan_generator': + num_res_blocks=c.generator_model_params["num_res_blocks"], + ) + if c.generator_model in "fullband_melgan_generator": model = MyModel( - in_channels=c.audio['num_mels'], + in_channels=c.audio["num_mels"], out_channels=1, proj_kernel=7, base_channels=512, - upsample_factors=c.generator_model_params['upsample_factors'], + upsample_factors=c.generator_model_params["upsample_factors"], res_kernel=3, - num_res_blocks=c.generator_model_params['num_res_blocks']) - if c.generator_model in 'parallel_wavegan_generator': + num_res_blocks=c.generator_model_params["num_res_blocks"], + ) + if c.generator_model in "parallel_wavegan_generator": model = MyModel( in_channels=1, out_channels=1, kernel_size=3, - num_res_blocks=c.generator_model_params['num_res_blocks'], - stacks=c.generator_model_params['stacks'], + num_res_blocks=c.generator_model_params["num_res_blocks"], + stacks=c.generator_model_params["stacks"], res_channels=64, gate_channels=128, skip_channels=64, - aux_channels=c.audio['num_mels'], + aux_channels=c.audio["num_mels"], dropout=0.0, bias=True, use_weight_norm=True, - upsample_factors=c.generator_model_params['upsample_factors']) + upsample_factors=c.generator_model_params["upsample_factors"], + ) return model def setup_discriminator(c): print(" > Discriminator Model: {}".format(c.discriminator_model)) - if 'parallel_wavegan' in c.discriminator_model: + if "parallel_wavegan" in c.discriminator_model: MyModel = importlib.import_module( - 'TTS.vocoder.models.parallel_wavegan_discriminator') + "TTS.vocoder.models.parallel_wavegan_discriminator" + ) else: - MyModel = importlib.import_module('TTS.vocoder.models.' + - c.discriminator_model.lower()) + MyModel = importlib.import_module( + "TTS.vocoder.models." + c.discriminator_model.lower() + ) MyModel = getattr(MyModel, to_camel(c.discriminator_model.lower())) - if c.discriminator_model in 'random_window_discriminator': + if c.discriminator_model in "random_window_discriminator": model = MyModel( - cond_channels=c.audio['num_mels'], - hop_length=c.audio['hop_length'], - uncond_disc_donwsample_factors=c. - discriminator_model_params['uncond_disc_donwsample_factors'], - cond_disc_downsample_factors=c. - discriminator_model_params['cond_disc_downsample_factors'], - cond_disc_out_channels=c. 
- discriminator_model_params['cond_disc_out_channels'], - window_sizes=c.discriminator_model_params['window_sizes']) - if c.discriminator_model in 'melgan_multiscale_discriminator': + cond_channels=c.audio["num_mels"], + hop_length=c.audio["hop_length"], + uncond_disc_donwsample_factors=c.discriminator_model_params[ + "uncond_disc_donwsample_factors" + ], + cond_disc_downsample_factors=c.discriminator_model_params[ + "cond_disc_downsample_factors" + ], + cond_disc_out_channels=c.discriminator_model_params[ + "cond_disc_out_channels" + ], + window_sizes=c.discriminator_model_params["window_sizes"], + ) + if c.discriminator_model in "melgan_multiscale_discriminator": model = MyModel( in_channels=1, out_channels=1, kernel_sizes=(5, 3), - base_channels=c.discriminator_model_params['base_channels'], - max_channels=c.discriminator_model_params['max_channels'], - downsample_factors=c. - discriminator_model_params['downsample_factors']) - if c.discriminator_model == 'residual_parallel_wavegan_discriminator': + base_channels=c.discriminator_model_params["base_channels"], + max_channels=c.discriminator_model_params["max_channels"], + downsample_factors=c.discriminator_model_params["downsample_factors"], + ) + if c.discriminator_model == "residual_parallel_wavegan_discriminator": model = MyModel( in_channels=1, out_channels=1, kernel_size=3, - num_layers=c.discriminator_model_params['num_layers'], - stacks=c.discriminator_model_params['stacks'], + num_layers=c.discriminator_model_params["num_layers"], + stacks=c.discriminator_model_params["stacks"], res_channels=64, gate_channels=128, skip_channels=64, @@ -158,17 +170,17 @@ def setup_discriminator(c): nonlinear_activation="LeakyReLU", nonlinear_activation_params={"negative_slope": 0.2}, ) - if c.discriminator_model == 'parallel_wavegan_discriminator': + if c.discriminator_model == "parallel_wavegan_discriminator": model = MyModel( in_channels=1, out_channels=1, kernel_size=3, - num_layers=c.discriminator_model_params['num_layers'], + num_layers=c.discriminator_model_params["num_layers"], conv_channels=64, dilation_factor=1, nonlinear_activation="LeakyReLU", nonlinear_activation_params={"negative_slope": 0.2}, - bias=True + bias=True, ) return model From 995d84f6d74fa3592fcb7cd5b31f9246155191a8 Mon Sep 17 00:00:00 2001 From: sanjaesc Date: Mon, 19 Oct 2020 14:37:30 +0200 Subject: [PATCH 05/98] added feature preprocessing if not set in config --- TTS/bin/train_wavernn_vocoder.py | 64 ++++++++++++++++--------- TTS/vocoder/configs/wavernn_config.json | 11 ++--- TTS/vocoder/datasets/preprocess.py | 25 +++++++++- TTS/vocoder/datasets/wavernn_dataset.py | 1 + 4 files changed, 71 insertions(+), 30 deletions(-) diff --git a/TTS/bin/train_wavernn_vocoder.py b/TTS/bin/train_wavernn_vocoder.py index e2b8057e..533fe0ce 100644 --- a/TTS/bin/train_wavernn_vocoder.py +++ b/TTS/bin/train_wavernn_vocoder.py @@ -29,7 +29,12 @@ from TTS.utils.generic_utils import ( set_init_dict, ) from TTS.vocoder.datasets.wavernn_dataset import WaveRNNDataset -from TTS.vocoder.datasets.preprocess import load_wav_data, load_wav_feat_data +from TTS.vocoder.datasets.preprocess import ( + load_wav_data, + find_feat_files, + load_wav_feat_data, + preprocess_wav_files, +) from TTS.vocoder.utils.distribution import discretized_mix_logistic_loss, gaussian_loss from TTS.vocoder.utils.generic_utils import setup_wavernn from TTS.vocoder.utils.io import save_best_model, save_checkpoint @@ -192,15 +197,17 @@ def train(model, optimizer, criterion, scheduler, ap, global_step, epoch): ) predict_mel = 
ap.melspectrogram(sample_wav) - # Sample audio - tb_logger.tb_train_audios( - global_step, {"eval/audio": sample_wav}, CONFIG.audio["sample_rate"] - ) # compute spectrograms figures = { - "prediction": plot_spectrogram(predict_mel.T), - "ground_truth": plot_spectrogram(ground_mel.T), + "train/ground_truth": plot_spectrogram(ground_mel.T), + "train/prediction": plot_spectrogram(predict_mel.T), } + + # Sample audio + tb_logger.tb_train_audios( + global_step, {"train/audio": sample_wav}, CONFIG.audio["sample_rate"] + ) + tb_logger.tb_train_figures(global_step, figures) end_time = time.time() @@ -235,7 +242,6 @@ def evaluate(model, criterion, ap, global_step, epoch): global_step += 1 y_hat = model(x, m) - y_hat_viz = y_hat # for vizualization if isinstance(model.mode, int): y_hat = y_hat.transpose(1, 2).unsqueeze(-1) else: @@ -263,11 +269,11 @@ def evaluate(model, criterion, ap, global_step, epoch): if CONFIG.print_eval: c_logger.print_eval_step(num_iter, loss_dict, keep_avg.avg_values) - if epoch > CONFIG.test_delay_epochs: - # synthesize a full voice - wav_path = train_data[random.randrange(0, len(train_data))][0] + if epoch % CONFIG.test_every_epochs == 0: + # synthesize a part of data + wav_path = eval_data[random.randrange(0, len(eval_data))][0] wav = ap.load_wav(wav_path) - ground_mel = ap.melspectrogram(wav) + ground_mel = ap.melspectrogram(wav[:22000]) sample_wav = model.generate( ground_mel, CONFIG.batched, @@ -276,15 +282,17 @@ def evaluate(model, criterion, ap, global_step, epoch): ) predict_mel = ap.melspectrogram(sample_wav) + # compute spectrograms + figures = { + "eval/ground_truth": plot_spectrogram(ground_mel.T), + "eval/prediction": plot_spectrogram(predict_mel.T), + } + # Sample audio tb_logger.tb_eval_audios( global_step, {"eval/audio": sample_wav}, CONFIG.audio["sample_rate"] ) - # compute spectrograms - figures = { - "eval/prediction": plot_spectrogram(predict_mel.T), - "eval/ground_truth": plot_spectrogram(ground_mel.T), - } + tb_logger.tb_eval_figures(global_step, figures) tb_logger.tb_eval_stats(global_step, keep_avg.avg_values) @@ -296,6 +304,9 @@ def main(args): # pylint: disable=redefined-outer-name # pylint: disable=global-variable-undefined global train_data, eval_data + # setup audio processor + ap = AudioProcessor(**CONFIG.audio) + print(f" > Loading wavs from: {CONFIG.data_path}") if CONFIG.feature_path is not None: print(f" > Loading features from: {CONFIG.feature_path}") @@ -303,11 +314,20 @@ def main(args): # pylint: disable=redefined-outer-name CONFIG.data_path, CONFIG.feature_path, CONFIG.eval_split_size ) else: - eval_data, train_data = load_wav_data(CONFIG.data_path, CONFIG.eval_split_size) - - # setup audio processor - ap = AudioProcessor(**CONFIG.audio) - + mel_feat_path = os.path.join(OUT_PATH, "mel") + feat_data = find_feat_files(mel_feat_path) + if feat_data: + print(f" > Loading features from: {mel_feat_path}") + eval_data, train_data = load_wav_feat_data( + CONFIG.data_path, mel_feat_path, CONFIG.eval_split_size + ) + else: + print(f" > No feature data found. 
Preprocessing...") + # preprocessing feature data from given wav files + preprocess_wav_files(OUT_PATH, CONFIG, ap) + eval_data, train_data = load_wav_feat_data( + CONFIG.data_path, mel_feat_path, CONFIG.eval_split_size + ) # setup model model_wavernn = setup_wavernn(CONFIG) diff --git a/TTS/vocoder/configs/wavernn_config.json b/TTS/vocoder/configs/wavernn_config.json index 67503aef..8e6a8c32 100644 --- a/TTS/vocoder/configs/wavernn_config.json +++ b/TTS/vocoder/configs/wavernn_config.json @@ -55,18 +55,17 @@ "padding": 2, // pad the input for resnet to see wider input length // DATASET - "use_gta": true, // use computed gta features from the tts model - "data_path": "/media/alexander/LinuxFS/SpeechData/GothicSpeech/NPC_Speech/", // path containing training wav files - "feature_path": "/media/alexander/LinuxFS/SpeechData/GothicSpeech/NPC_Speech_Computed/mel/", // path containing computed features .npy (mels / quant) + //"use_gta": true, // use computed gta features from the tts model + "data_path": "path/to/wav/files", // path containing training wav files + "feature_path": null, // path containing computed features from wav files if null compute them // TRAINING "batch_size": 64, // Batch size for training. Lower values than 32 might cause hard to learn attention. "epochs": 10000, // total number of epochs to train. - "warmup_steps": 10, // VALIDATION "run_eval": true, - "test_delay_epochs": 10, // early testing only wastes computation time. + "test_every_epochs": 10, // Test after set number of epochs (Test every 20 epochs for example) // OPTIMIZER "grad_clip": 4, // apply gradient clipping if > 0 @@ -90,6 +89,6 @@ "eval_split_size": 50, // number of samples for testing // PATHS - "output_path": "/media/alexander/LinuxFS/Projects/wavernn/Trainings/" + "output_path": "output/training/path" } diff --git a/TTS/vocoder/datasets/preprocess.py b/TTS/vocoder/datasets/preprocess.py index a5365686..afea45fd 100644 --- a/TTS/vocoder/datasets/preprocess.py +++ b/TTS/vocoder/datasets/preprocess.py @@ -1,17 +1,38 @@ import glob import os from pathlib import Path +from tqdm import tqdm import numpy as np +def preprocess_wav_files(out_path, config, ap): + os.makedirs(os.path.join(out_path, "quant"), exist_ok=True) + os.makedirs(os.path.join(out_path, "mel"), exist_ok=True) + wav_files = find_wav_files(config.data_path) + for path in tqdm(wav_files): + wav_name = Path(path).stem + quant_path = os.path.join(out_path, "quant", wav_name + ".npy") + mel_path = os.path.join(out_path, "mel", wav_name + ".npy") + y = ap.load_wav(path) + mel = ap.melspectrogram(y) + np.save(mel_path, mel) + if isinstance(config.mode, int): + quant = ( + ap.mulaw_encode(y, qc=config.mode) + if config.mulaw + else ap.quantize(y, bits=config.mode) + ) + np.save(quant_path, quant) + + def find_wav_files(data_path): - wav_paths = glob.glob(os.path.join(data_path, '**', '*.wav'), recursive=True) + wav_paths = glob.glob(os.path.join(data_path, "**", "*.wav"), recursive=True) return wav_paths def find_feat_files(data_path): - feat_paths = glob.glob(os.path.join(data_path, '**', '*.npy'), recursive=True) + feat_paths = glob.glob(os.path.join(data_path, "**", "*.npy"), recursive=True) return feat_paths diff --git a/TTS/vocoder/datasets/wavernn_dataset.py b/TTS/vocoder/datasets/wavernn_dataset.py index 8faf5f3c..1b0a8077 100644 --- a/TTS/vocoder/datasets/wavernn_dataset.py +++ b/TTS/vocoder/datasets/wavernn_dataset.py @@ -48,6 +48,7 @@ class WaveRNNDataset(Dataset): feat_path = self.item_list[index] m = 
np.load(feat_path.replace("/quant/", "/mel/")) if self.mode in ["gauss", "mold"]: + # x = np.load(feat_path.replace("/mel/", "/quant/")) x = self.ap.load_wav(wavpath) elif isinstance(self.mode, int): x = np.load(feat_path.replace("/mel/", "/quant/")) From 64adfbf4a59b9bf0aa21fe3effceed332458bf7b Mon Sep 17 00:00:00 2001 From: sanjaesc Date: Mon, 19 Oct 2020 15:38:32 +0200 Subject: [PATCH 06/98] fixing pylint errors --- TTS/bin/train_wavernn_vocoder.py | 9 ++-- TTS/vocoder/datasets/wavernn_dataset.py | 4 +- TTS/vocoder/models/wavernn.py | 10 ++-- TTS/vocoder/utils/distribution.py | 72 ++++++++++++++----------- 4 files changed, 51 insertions(+), 44 deletions(-) diff --git a/TTS/bin/train_wavernn_vocoder.py b/TTS/bin/train_wavernn_vocoder.py index 533fe0ce..78984510 100644 --- a/TTS/bin/train_wavernn_vocoder.py +++ b/TTS/bin/train_wavernn_vocoder.py @@ -1,8 +1,5 @@ import argparse -import math import os -import pickle -import shutil import sys import traceback import time @@ -11,7 +8,8 @@ import random import torch from torch.utils.data import DataLoader -from torch.utils.data.distributed import DistributedSampler + +# from torch.utils.data.distributed import DistributedSampler from TTS.tts.utils.visual import plot_spectrogram from TTS.utils.audio import AudioProcessor @@ -30,7 +28,6 @@ from TTS.utils.generic_utils import ( ) from TTS.vocoder.datasets.wavernn_dataset import WaveRNNDataset from TTS.vocoder.datasets.preprocess import ( - load_wav_data, find_feat_files, load_wav_feat_data, preprocess_wav_files, @@ -322,7 +319,7 @@ def main(args): # pylint: disable=redefined-outer-name CONFIG.data_path, mel_feat_path, CONFIG.eval_split_size ) else: - print(f" > No feature data found. Preprocessing...") + print(" > No feature data found. Preprocessing...") # preprocessing feature data from given wav files preprocess_wav_files(OUT_PATH, CONFIG, ap) eval_data, train_data = load_wav_feat_data( diff --git a/TTS/vocoder/datasets/wavernn_dataset.py b/TTS/vocoder/datasets/wavernn_dataset.py index 1b0a8077..5d5b9f15 100644 --- a/TTS/vocoder/datasets/wavernn_dataset.py +++ b/TTS/vocoder/datasets/wavernn_dataset.py @@ -1,5 +1,3 @@ -import os -import glob import torch import numpy as np from torch.utils.data import Dataset @@ -42,7 +40,7 @@ class WaveRNNDataset(Dataset): wavpath, feat_path = self.item_list[index] m = np.load(feat_path.replace("/quant/", "/mel/")) # x = self.wav_cache[index] - if 5 > m.shape[-1]: + if m.shape[-1] < 5: print(" [!] Instance is too short! 
: {}".format(wavpath)) self.item_list[index] = self.item_list[index + 1] feat_path = self.item_list[index] diff --git a/TTS/vocoder/models/wavernn.py b/TTS/vocoder/models/wavernn.py index e1c4365f..9b637a6a 100644 --- a/TTS/vocoder/models/wavernn.py +++ b/TTS/vocoder/models/wavernn.py @@ -42,7 +42,7 @@ class MelResNet(nn.Module): self.conv_in = nn.Conv1d(in_dims, compute_dims, kernel_size=k_size, bias=False) self.batch_norm = nn.BatchNorm1d(compute_dims) self.layers = nn.ModuleList() - for i in range(res_blocks): + for _ in range(res_blocks): self.layers.append(ResBlock(compute_dims)) self.conv_out = nn.Conv1d(compute_dims, res_out_dims, kernel_size=1) @@ -365,7 +365,8 @@ class WaveRNN(nn.Module): (i * b_size, seq_len * b_size, b_size, gen_rate, realtime_ratio), ) - def get_gru_cell(self, gru): + @staticmethod + def get_gru_cell(gru): gru_cell = nn.GRUCell(gru.input_size, gru.hidden_size) gru_cell.weight_hh.data = gru.weight_hh_l0.data gru_cell.weight_ih.data = gru.weight_ih_l0.data @@ -373,13 +374,14 @@ class WaveRNN(nn.Module): gru_cell.bias_ih.data = gru.bias_ih_l0.data return gru_cell - def pad_tensor(self, x, pad, side="both"): + @staticmethod + def pad_tensor(x, pad, side="both"): # NB - this is just a quick method i need right now # i.e., it won't generalise to other shapes/dims b, t, c = x.size() total = t + 2 * pad if side == "both" else t + pad padded = torch.zeros(b, total, c).cuda() - if side == "before" or side == "both": + if side in ("before", "both"): padded[:, pad : pad + t, :] = x elif side == "after": padded[:, :t, :] = x diff --git a/TTS/vocoder/utils/distribution.py b/TTS/vocoder/utils/distribution.py index bfcbdd3f..705c14dc 100644 --- a/TTS/vocoder/utils/distribution.py +++ b/TTS/vocoder/utils/distribution.py @@ -11,7 +11,11 @@ def gaussian_loss(y_hat, y, log_std_min=-7.0): mean = y_hat[:, :, :1] log_std = torch.clamp(y_hat[:, :, 1:], min=log_std_min) # TODO: replace with pytorch dist - log_probs = -0.5 * (- math.log(2.0 * math.pi) - 2. * log_std - torch.pow(y - mean, 2) * torch.exp((-2.0 * log_std))) + log_probs = -0.5 * ( + -math.log(2.0 * math.pi) + - 2.0 * log_std + - torch.pow(y - mean, 2) * torch.exp((-2.0 * log_std)) + ) return log_probs.squeeze().mean() @@ -19,7 +23,10 @@ def sample_from_gaussian(y_hat, log_std_min=-7.0, scale_factor=1.0): assert y_hat.size(2) == 2 mean = y_hat[:, :, :1] log_std = torch.clamp(y_hat[:, :, 1:], min=log_std_min) - dist = Normal(mean, torch.exp(log_std), ) + dist = Normal( + mean, + torch.exp(log_std), + ) sample = dist.sample() sample = torch.clamp(torch.clamp(sample, min=-scale_factor), max=scale_factor) del dist @@ -36,11 +43,12 @@ def log_sum_exp(x): # It is adapted from https://github.com/r9y9/wavenet_vocoder/blob/master/wavenet_vocoder/mixture.py -def discretized_mix_logistic_loss(y_hat, y, num_classes=65536, - log_scale_min=None, reduce=True): +def discretized_mix_logistic_loss( + y_hat, y, num_classes=65536, log_scale_min=None, reduce=True +): if log_scale_min is None: log_scale_min = float(np.log(1e-14)) - y_hat = y_hat.permute(0,2,1) + y_hat = y_hat.permute(0, 2, 1) assert y_hat.dim() == 3 assert y_hat.size(1) % 3 == 0 nr_mix = y_hat.size(1) // 3 @@ -50,17 +58,17 @@ def discretized_mix_logistic_loss(y_hat, y, num_classes=65536, # unpack parameters. 
(B, T, num_mixtures) x 3 logit_probs = y_hat[:, :, :nr_mix] - means = y_hat[:, :, nr_mix:2 * nr_mix] - log_scales = torch.clamp(y_hat[:, :, 2 * nr_mix:3 * nr_mix], min=log_scale_min) + means = y_hat[:, :, nr_mix : 2 * nr_mix] + log_scales = torch.clamp(y_hat[:, :, 2 * nr_mix : 3 * nr_mix], min=log_scale_min) # B x T x 1 -> B x T x num_mixtures y = y.expand_as(means) centered_y = y - means inv_stdv = torch.exp(-log_scales) - plus_in = inv_stdv * (centered_y + 1. / (num_classes - 1)) + plus_in = inv_stdv * (centered_y + 1.0 / (num_classes - 1)) cdf_plus = torch.sigmoid(plus_in) - min_in = inv_stdv * (centered_y - 1. / (num_classes - 1)) + min_in = inv_stdv * (centered_y - 1.0 / (num_classes - 1)) cdf_min = torch.sigmoid(min_in) # log probability for edge case of 0 (before scaling) @@ -77,34 +85,35 @@ def discretized_mix_logistic_loss(y_hat, y, num_classes=65536, mid_in = inv_stdv * centered_y # log probability in the center of the bin, to be used in extreme cases # (not actually used in our code) - log_pdf_mid = mid_in - log_scales - 2. * F.softplus(mid_in) + log_pdf_mid = mid_in - log_scales - 2.0 * F.softplus(mid_in) # tf equivalent - """ - log_probs = tf.where(x < -0.999, log_cdf_plus, - tf.where(x > 0.999, log_one_minus_cdf_min, - tf.where(cdf_delta > 1e-5, - tf.log(tf.maximum(cdf_delta, 1e-12)), - log_pdf_mid - np.log(127.5)))) - """ + + # log_probs = tf.where(x < -0.999, log_cdf_plus, + # tf.where(x > 0.999, log_one_minus_cdf_min, + # tf.where(cdf_delta > 1e-5, + # tf.log(tf.maximum(cdf_delta, 1e-12)), + # log_pdf_mid - np.log(127.5)))) + # TODO: cdf_delta <= 1e-5 actually can happen. How can we choose the value # for num_classes=65536 case? 1e-7? not sure.. inner_inner_cond = (cdf_delta > 1e-5).float() - inner_inner_out = inner_inner_cond * \ - torch.log(torch.clamp(cdf_delta, min=1e-12)) + \ - (1. - inner_inner_cond) * (log_pdf_mid - np.log((num_classes - 1) / 2)) + inner_inner_out = inner_inner_cond * torch.log( + torch.clamp(cdf_delta, min=1e-12) + ) + (1.0 - inner_inner_cond) * (log_pdf_mid - np.log((num_classes - 1) / 2)) inner_cond = (y > 0.999).float() - inner_out = inner_cond * log_one_minus_cdf_min + (1. - inner_cond) * inner_inner_out + inner_out = ( + inner_cond * log_one_minus_cdf_min + (1.0 - inner_cond) * inner_inner_out + ) cond = (y < -0.999).float() - log_probs = cond * log_cdf_plus + (1. 
- cond) * inner_out + log_probs = cond * log_cdf_plus + (1.0 - cond) * inner_out log_probs = log_probs + F.log_softmax(logit_probs, -1) if reduce: return -torch.mean(log_sum_exp(log_probs)) - else: - return -log_sum_exp(log_probs).unsqueeze(-1) + return -log_sum_exp(log_probs).unsqueeze(-1) def sample_from_discretized_mix_logistic(y, log_scale_min=None): @@ -127,26 +136,27 @@ def sample_from_discretized_mix_logistic(y, log_scale_min=None): # sample mixture indicator from softmax temp = logit_probs.data.new(logit_probs.size()).uniform_(1e-5, 1.0 - 1e-5) - temp = logit_probs.data - torch.log(- torch.log(temp)) + temp = logit_probs.data - torch.log(-torch.log(temp)) _, argmax = temp.max(dim=-1) # (B, T) -> (B, T, nr_mix) one_hot = to_one_hot(argmax, nr_mix) # select logistic parameters - means = torch.sum(y[:, :, nr_mix:2 * nr_mix] * one_hot, dim=-1) - log_scales = torch.clamp(torch.sum( - y[:, :, 2 * nr_mix:3 * nr_mix] * one_hot, dim=-1), min=log_scale_min) + means = torch.sum(y[:, :, nr_mix : 2 * nr_mix] * one_hot, dim=-1) + log_scales = torch.clamp( + torch.sum(y[:, :, 2 * nr_mix : 3 * nr_mix] * one_hot, dim=-1), min=log_scale_min + ) # sample from logistic & clip to interval # we don't actually round to the nearest 8bit value when sampling u = means.data.new(means.size()).uniform_(1e-5, 1.0 - 1e-5) - x = means + torch.exp(log_scales) * (torch.log(u) - torch.log(1. - u)) + x = means + torch.exp(log_scales) * (torch.log(u) - torch.log(1.0 - u)) - x = torch.clamp(torch.clamp(x, min=-1.), max=1.) + x = torch.clamp(torch.clamp(x, min=-1.0), max=1.0) return x -def to_one_hot(tensor, n, fill_with=1.): +def to_one_hot(tensor, n, fill_with=1.0): # we perform one hot encore with respect to the last axis one_hot = torch.FloatTensor(tensor.size() + (n,)).zero_() if tensor.is_cuda: From 2ca63c013f88caac2444b32fdb1b2e77d001e430 Mon Sep 17 00:00:00 2001 From: erogol Date: Mon, 19 Oct 2020 15:47:12 +0200 Subject: [PATCH 07/98] fix no loss masking loss computation --- TTS/tts/layers/losses.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/TTS/tts/layers/losses.py b/TTS/tts/layers/losses.py index f07851ac..8256c0f7 100644 --- a/TTS/tts/layers/losses.py +++ b/TTS/tts/layers/losses.py @@ -2,6 +2,7 @@ import math import numpy as np import torch from torch import nn +from inspect import signature from torch.nn import functional from TTS.tts.utils.generic_utils import sequence_mask @@ -142,7 +143,11 @@ class DifferentailSpectralLoss(nn.Module): def forward(self, x, target, length): x_diff = x[:, 1:] - x[:, :-1] target_diff = target[:, 1:] - target[:, :-1] - return self.loss_func(x_diff, target_diff, length-1) + if len(signature(self.loss_func).parameters) > 2: + return self.loss_func(x_diff, target_diff, length-1) + else: + # if loss masking is not enabled + return self.loss_func(x_diff, target_diff) class GuidedAttentionLoss(torch.nn.Module): @@ -262,8 +267,11 @@ class TacotronLoss(torch.nn.Module): # double decoder consistency loss (if enabled) if self.config.double_decoder_consistency: - decoder_b_loss = self.criterion(decoder_b_output, mel_input, - output_lens) + if self.config.loss_masking: + decoder_b_loss = self.criterion(decoder_b_output, mel_input, + output_lens) + else: + decoder_b_loss = self.criterion(decoder_b_output, mel_input) # decoder_c_loss = torch.nn.functional.l1_loss(decoder_b_output, decoder_output) attention_c_loss = torch.nn.functional.l1_loss(alignments, alignments_backwards) loss += self.decoder_alpha * (decoder_b_loss + attention_c_loss) From 
24d18d20e34c331bc0d3a067de66389c98a9b03c Mon Sep 17 00:00:00 2001 From: sanjaesc Date: Mon, 19 Oct 2020 16:20:15 +0200 Subject: [PATCH 08/98] fix formatting + pylint --- TTS/bin/compute_statistics.py | 52 ++--- TTS/bin/train_gan_vocoder.py | 344 +++++++++++++---------------- TTS/vocoder/models/wavernn.py | 47 ++-- TTS/vocoder/utils/distribution.py | 15 +- TTS/vocoder/utils/generic_utils.py | 101 ++++----- 5 files changed, 252 insertions(+), 307 deletions(-) diff --git a/TTS/bin/compute_statistics.py b/TTS/bin/compute_statistics.py index 9177c75b..ca089d3e 100755 --- a/TTS/bin/compute_statistics.py +++ b/TTS/bin/compute_statistics.py @@ -15,23 +15,17 @@ from TTS.utils.audio import AudioProcessor def main(): """Run preprocessing process.""" parser = argparse.ArgumentParser( - description="Compute mean and variance of spectrogtram features." - ) - parser.add_argument( - "--config_path", - type=str, - required=True, - help="TTS config file path to define audio processin parameters.", - ) - parser.add_argument( - "--out_path", default=None, type=str, help="directory to save the output file." - ) + description="Compute mean and variance of spectrogtram features.") + parser.add_argument("--config_path", type=str, required=True, + help="TTS config file path to define audio processin parameters.") + parser.add_argument("--out_path", default=None, type=str, + help="directory to save the output file.") args = parser.parse_args() # load config CONFIG = load_config(args.config_path) - CONFIG.audio["signal_norm"] = False # do not apply earlier normalization - CONFIG.audio["stats_path"] = None # discard pre-defined stats + CONFIG.audio['signal_norm'] = False # do not apply earlier normalization + CONFIG.audio['stats_path'] = None # discard pre-defined stats # load audio processor ap = AudioProcessor(**CONFIG.audio) @@ -65,27 +59,27 @@ def main(): output_file_path = os.path.join(args.out_path, "scale_stats.npy") stats = {} - stats["mel_mean"] = mel_mean - stats["mel_std"] = mel_scale - stats["linear_mean"] = linear_mean - stats["linear_std"] = linear_scale + stats['mel_mean'] = mel_mean + stats['mel_std'] = mel_scale + stats['linear_mean'] = linear_mean + stats['linear_std'] = linear_scale - print(f" > Avg mel spec mean: {mel_mean.mean()}") - print(f" > Avg mel spec scale: {mel_scale.mean()}") - print(f" > Avg linear spec mean: {linear_mean.mean()}") - print(f" > Avg lienar spec scale: {linear_scale.mean()}") + print(f' > Avg mel spec mean: {mel_mean.mean()}') + print(f' > Avg mel spec scale: {mel_scale.mean()}') + print(f' > Avg linear spec mean: {linear_mean.mean()}') + print(f' > Avg lienar spec scale: {linear_scale.mean()}') # set default config values for mean-var scaling - CONFIG.audio["stats_path"] = output_file_path - CONFIG.audio["signal_norm"] = True + CONFIG.audio['stats_path'] = output_file_path + CONFIG.audio['signal_norm'] = True # remove redundant values - del CONFIG.audio["max_norm"] - del CONFIG.audio["min_level_db"] - del CONFIG.audio["symmetric_norm"] - del CONFIG.audio["clip_norm"] - stats["audio_config"] = CONFIG.audio + del CONFIG.audio['max_norm'] + del CONFIG.audio['min_level_db'] + del CONFIG.audio['symmetric_norm'] + del CONFIG.audio['clip_norm'] + stats['audio_config'] = CONFIG.audio np.save(output_file_path, stats, allow_pickle=True) - print(f" > scale_stats.npy is saved to {output_file_path}") + print(f' > scale_stats.npy is saved to {output_file_path}') if __name__ == "__main__": diff --git a/TTS/bin/train_gan_vocoder.py b/TTS/bin/train_gan_vocoder.py index 7689c930..12edf048 
100644 --- a/TTS/bin/train_gan_vocoder.py +++ b/TTS/bin/train_gan_vocoder.py @@ -10,29 +10,20 @@ import torch from torch.utils.data import DataLoader from TTS.utils.audio import AudioProcessor from TTS.utils.console_logger import ConsoleLogger -from TTS.utils.generic_utils import ( - KeepAverage, - count_parameters, - create_experiment_folder, - get_git_branch, - remove_experiment_folder, - set_init_dict, -) +from TTS.utils.generic_utils import (KeepAverage, count_parameters, + create_experiment_folder, get_git_branch, + remove_experiment_folder, set_init_dict) from TTS.utils.io import copy_config_file, load_config from TTS.utils.radam import RAdam from TTS.utils.tensorboard_logger import TensorboardLogger from TTS.utils.training import setup_torch_training_env from TTS.vocoder.datasets.gan_dataset import GANDataset from TTS.vocoder.datasets.preprocess import load_wav_data, load_wav_feat_data - # from distribute import (DistributedSampler, apply_gradient_allreduce, # init_distributed, reduce_tensor) from TTS.vocoder.layers.losses import DiscriminatorLoss, GeneratorLoss -from TTS.vocoder.utils.generic_utils import ( - plot_results, - setup_discriminator, - setup_generator, -) +from TTS.vocoder.utils.generic_utils import (plot_results, setup_discriminator, + setup_generator) from TTS.vocoder.utils.io import save_best_model, save_checkpoint use_cuda, num_gpus = setup_torch_training_env(True, True) @@ -42,30 +33,27 @@ def setup_loader(ap, is_val=False, verbose=False): if is_val and not c.run_eval: loader = None else: - dataset = GANDataset( - ap=ap, - items=eval_data if is_val else train_data, - seq_len=c.seq_len, - hop_len=ap.hop_length, - pad_short=c.pad_short, - conv_pad=c.conv_pad, - is_training=not is_val, - return_segments=not is_val, - use_noise_augment=c.use_noise_augment, - use_cache=c.use_cache, - verbose=verbose, - ) + dataset = GANDataset(ap=ap, + items=eval_data if is_val else train_data, + seq_len=c.seq_len, + hop_len=ap.hop_length, + pad_short=c.pad_short, + conv_pad=c.conv_pad, + is_training=not is_val, + return_segments=not is_val, + use_noise_augment=c.use_noise_augment, + use_cache=c.use_cache, + verbose=verbose) dataset.shuffle_mapping() # sampler = DistributedSampler(dataset) if num_gpus > 1 else None - loader = DataLoader( - dataset, - batch_size=1 if is_val else c.batch_size, - shuffle=True, - drop_last=False, - sampler=None, - num_workers=c.num_val_loader_workers if is_val else c.num_loader_workers, - pin_memory=False, - ) + loader = DataLoader(dataset, + batch_size=1 if is_val else c.batch_size, + shuffle=True, + drop_last=False, + sampler=None, + num_workers=c.num_val_loader_workers + if is_val else c.num_loader_workers, + pin_memory=False) return loader @@ -92,26 +80,16 @@ def format_data(data): return co, x, None, None -def train( - model_G, - criterion_G, - optimizer_G, - model_D, - criterion_D, - optimizer_D, - scheduler_G, - scheduler_D, - ap, - global_step, - epoch, -): +def train(model_G, criterion_G, optimizer_G, model_D, criterion_D, optimizer_D, + scheduler_G, scheduler_D, ap, global_step, epoch): data_loader = setup_loader(ap, is_val=False, verbose=(epoch == 0)) model_G.train() model_D.train() epoch_time = 0 keep_avg = KeepAverage() if use_cuda: - batch_n_iter = int(len(data_loader.dataset) / (c.batch_size * num_gpus)) + batch_n_iter = int( + len(data_loader.dataset) / (c.batch_size * num_gpus)) else: batch_n_iter = int(len(data_loader.dataset) / c.batch_size) end_time = time.time() @@ -167,16 +145,16 @@ def train( scores_fake = D_out_fake # compute losses 
- loss_G_dict = criterion_G( - y_hat, y_G, scores_fake, feats_fake, feats_real, y_hat_sub, y_G_sub - ) - loss_G = loss_G_dict["G_loss"] + loss_G_dict = criterion_G(y_hat, y_G, scores_fake, feats_fake, + feats_real, y_hat_sub, y_G_sub) + loss_G = loss_G_dict['G_loss'] # optimizer generator optimizer_G.zero_grad() loss_G.backward() if c.gen_clip_grad > 0: - torch.nn.utils.clip_grad_norm_(model_G.parameters(), c.gen_clip_grad) + torch.nn.utils.clip_grad_norm_(model_G.parameters(), + c.gen_clip_grad) optimizer_G.step() if scheduler_G is not None: scheduler_G.step() @@ -221,13 +199,14 @@ def train( # compute losses loss_D_dict = criterion_D(scores_fake, scores_real) - loss_D = loss_D_dict["D_loss"] + loss_D = loss_D_dict['D_loss'] # optimizer discriminator optimizer_D.zero_grad() loss_D.backward() if c.disc_clip_grad > 0: - torch.nn.utils.clip_grad_norm_(model_D.parameters(), c.disc_clip_grad) + torch.nn.utils.clip_grad_norm_(model_D.parameters(), + c.disc_clip_grad) optimizer_D.step() if scheduler_D is not None: scheduler_D.step() @@ -242,40 +221,34 @@ def train( epoch_time += step_time # get current learning rates - current_lr_G = list(optimizer_G.param_groups)[0]["lr"] - current_lr_D = list(optimizer_D.param_groups)[0]["lr"] + current_lr_G = list(optimizer_G.param_groups)[0]['lr'] + current_lr_D = list(optimizer_D.param_groups)[0]['lr'] # update avg stats update_train_values = dict() for key, value in loss_dict.items(): - update_train_values["avg_" + key] = value - update_train_values["avg_loader_time"] = loader_time - update_train_values["avg_step_time"] = step_time + update_train_values['avg_' + key] = value + update_train_values['avg_loader_time'] = loader_time + update_train_values['avg_step_time'] = step_time keep_avg.update_values(update_train_values) # print training stats if global_step % c.print_step == 0: log_dict = { - "step_time": [step_time, 2], - "loader_time": [loader_time, 4], + 'step_time': [step_time, 2], + 'loader_time': [loader_time, 4], "current_lr_G": current_lr_G, - "current_lr_D": current_lr_D, + "current_lr_D": current_lr_D } - c_logger.print_train_step( - batch_n_iter, - num_iter, - global_step, - log_dict, - loss_dict, - keep_avg.avg_values, - ) + c_logger.print_train_step(batch_n_iter, num_iter, global_step, + log_dict, loss_dict, keep_avg.avg_values) # plot step stats if global_step % 10 == 0: iter_stats = { "lr_G": current_lr_G, "lr_D": current_lr_D, - "step_time": step_time, + "step_time": step_time } iter_stats.update(loss_dict) tb_logger.tb_train_iter_stats(global_step, iter_stats) @@ -284,28 +257,27 @@ def train( if global_step % c.save_step == 0: if c.checkpoint: # save model - save_checkpoint( - model_G, - optimizer_G, - scheduler_G, - model_D, - optimizer_D, - scheduler_D, - global_step, - epoch, - OUT_PATH, - model_losses=loss_dict, - ) + save_checkpoint(model_G, + optimizer_G, + scheduler_G, + model_D, + optimizer_D, + scheduler_D, + global_step, + epoch, + OUT_PATH, + model_losses=loss_dict) # compute spectrograms - figures = plot_results(y_hat_vis, y_G, ap, global_step, "train") + figures = plot_results(y_hat_vis, y_G, ap, global_step, + 'train') tb_logger.tb_train_figures(global_step, figures) # Sample audio sample_voice = y_hat_vis[0].squeeze(0).detach().cpu().numpy() - tb_logger.tb_train_audios( - global_step, {"train/audio": sample_voice}, c.audio["sample_rate"] - ) + tb_logger.tb_train_audios(global_step, + {'train/audio': sample_voice}, + c.audio["sample_rate"]) end_time = time.time() # print epoch stats @@ -379,9 +351,8 @@ def evaluate(model_G, 
criterion_G, model_D, criterion_D, ap, global_step, epoch) feats_fake, feats_real = None, None # compute losses - loss_G_dict = criterion_G( - y_hat, y_G, scores_fake, feats_fake, feats_real, y_hat_sub, y_G_sub - ) + loss_G_dict = criterion_G(y_hat, y_G, scores_fake, feats_fake, + feats_real, y_hat_sub, y_G_sub) loss_dict = dict() for key, value in loss_G_dict.items(): @@ -437,9 +408,9 @@ def evaluate(model_G, criterion_G, model_D, criterion_D, ap, global_step, epoch) # update avg stats update_eval_values = dict() for key, value in loss_dict.items(): - update_eval_values["avg_" + key] = value - update_eval_values["avg_loader_time"] = loader_time - update_eval_values["avg_step_time"] = step_time + update_eval_values['avg_' + key] = value + update_eval_values['avg_loader_time'] = loader_time + update_eval_values['avg_step_time'] = step_time keep_avg.update_values(update_eval_values) # print eval stats @@ -447,14 +418,13 @@ def evaluate(model_G, criterion_G, model_D, criterion_D, ap, global_step, epoch) c_logger.print_eval_step(num_iter, loss_dict, keep_avg.avg_values) # compute spectrograms - figures = plot_results(y_hat, y_G, ap, global_step, "eval") + figures = plot_results(y_hat, y_G, ap, global_step, 'eval') tb_logger.tb_eval_figures(global_step, figures) # Sample audio sample_voice = y_hat[0].squeeze(0).detach().cpu().numpy() - tb_logger.tb_eval_audios( - global_step, {"eval/audio": sample_voice}, c.audio["sample_rate"] - ) + tb_logger.tb_eval_audios(global_step, {'eval/audio': sample_voice}, + c.audio["sample_rate"]) # synthesize a full voice data_loader.return_segments = False @@ -472,8 +442,7 @@ def main(args): # pylint: disable=redefined-outer-name if c.feature_path is not None: print(f" > Loading features from: {c.feature_path}") eval_data, train_data = load_wav_feat_data( - c.data_path, c.feature_path, c.eval_split_size - ) + c.data_path, c.feature_path, c.eval_split_size) else: eval_data, train_data = load_wav_data(c.data_path, c.eval_split_size) @@ -491,63 +460,68 @@ def main(args): # pylint: disable=redefined-outer-name # setup optimizers optimizer_gen = RAdam(model_gen.parameters(), lr=c.lr_gen, weight_decay=0) - optimizer_disc = RAdam(model_disc.parameters(), lr=c.lr_disc, weight_decay=0) + optimizer_disc = RAdam(model_disc.parameters(), + lr=c.lr_disc, + weight_decay=0) # schedulers scheduler_gen = None scheduler_disc = None - if "lr_scheduler_gen" in c: + if 'lr_scheduler_gen' in c: scheduler_gen = getattr(torch.optim.lr_scheduler, c.lr_scheduler_gen) - scheduler_gen = scheduler_gen(optimizer_gen, **c.lr_scheduler_gen_params) - if "lr_scheduler_disc" in c: + scheduler_gen = scheduler_gen( + optimizer_gen, **c.lr_scheduler_gen_params) + if 'lr_scheduler_disc' in c: scheduler_disc = getattr(torch.optim.lr_scheduler, c.lr_scheduler_disc) - scheduler_disc = scheduler_disc(optimizer_disc, **c.lr_scheduler_disc_params) + scheduler_disc = scheduler_disc( + optimizer_disc, **c.lr_scheduler_disc_params) # setup criterion criterion_gen = GeneratorLoss(c) criterion_disc = DiscriminatorLoss(c) if args.restore_path: - checkpoint = torch.load(args.restore_path, map_location="cpu") + checkpoint = torch.load(args.restore_path, map_location='cpu') try: print(" > Restoring Generator Model...") - model_gen.load_state_dict(checkpoint["model"]) + model_gen.load_state_dict(checkpoint['model']) print(" > Restoring Generator Optimizer...") - optimizer_gen.load_state_dict(checkpoint["optimizer"]) + optimizer_gen.load_state_dict(checkpoint['optimizer']) print(" > Restoring Discriminator Model...") - 
model_disc.load_state_dict(checkpoint["model_disc"]) + model_disc.load_state_dict(checkpoint['model_disc']) print(" > Restoring Discriminator Optimizer...") - optimizer_disc.load_state_dict(checkpoint["optimizer_disc"]) - if "scheduler" in checkpoint: + optimizer_disc.load_state_dict(checkpoint['optimizer_disc']) + if 'scheduler' in checkpoint: print(" > Restoring Generator LR Scheduler...") - scheduler_gen.load_state_dict(checkpoint["scheduler"]) + scheduler_gen.load_state_dict(checkpoint['scheduler']) # NOTE: Not sure if necessary scheduler_gen.optimizer = optimizer_gen - if "scheduler_disc" in checkpoint: + if 'scheduler_disc' in checkpoint: print(" > Restoring Discriminator LR Scheduler...") - scheduler_disc.load_state_dict(checkpoint["scheduler_disc"]) + scheduler_disc.load_state_dict(checkpoint['scheduler_disc']) scheduler_disc.optimizer = optimizer_disc except RuntimeError: # retore only matching layers. print(" > Partial model initialization...") model_dict = model_gen.state_dict() - model_dict = set_init_dict(model_dict, checkpoint["model"], c) + model_dict = set_init_dict(model_dict, checkpoint['model'], c) model_gen.load_state_dict(model_dict) model_dict = model_disc.state_dict() - model_dict = set_init_dict(model_dict, checkpoint["model_disc"], c) + model_dict = set_init_dict(model_dict, checkpoint['model_disc'], c) model_disc.load_state_dict(model_dict) del model_dict # reset lr if not countinuining training. for group in optimizer_gen.param_groups: - group["lr"] = c.lr_gen + group['lr'] = c.lr_gen for group in optimizer_disc.param_groups: - group["lr"] = c.lr_disc + group['lr'] = c.lr_disc - print(" > Model restored from step %d" % checkpoint["step"], flush=True) - args.restore_step = checkpoint["step"] + print(" > Model restored from step %d" % checkpoint['step'], + flush=True) + args.restore_step = checkpoint['step'] else: args.restore_step = 0 @@ -566,92 +540,74 @@ def main(args): # pylint: disable=redefined-outer-name num_params = count_parameters(model_disc) print(" > Discriminator has {} parameters".format(num_params), flush=True) - if "best_loss" not in locals(): - best_loss = float("inf") + if 'best_loss' not in locals(): + best_loss = float('inf') global_step = args.restore_step for epoch in range(0, c.epochs): c_logger.print_epoch_start(epoch, c.epochs) - _, global_step = train( - model_gen, - criterion_gen, - optimizer_gen, - model_disc, - criterion_disc, - optimizer_disc, - scheduler_gen, - scheduler_disc, - ap, - global_step, - epoch, - ) - eval_avg_loss_dict = evaluate( - model_gen, criterion_gen, model_disc, criterion_disc, ap, global_step, epoch - ) + _, global_step = train(model_gen, criterion_gen, optimizer_gen, + model_disc, criterion_disc, optimizer_disc, + scheduler_gen, scheduler_disc, ap, global_step, + epoch) + eval_avg_loss_dict = evaluate(model_gen, criterion_gen, model_disc, criterion_disc, ap, + global_step, epoch) c_logger.print_epoch_end(epoch, eval_avg_loss_dict) target_loss = eval_avg_loss_dict[c.target_loss] - best_loss = save_best_model( - target_loss, - best_loss, - model_gen, - optimizer_gen, - scheduler_gen, - model_disc, - optimizer_disc, - scheduler_disc, - global_step, - epoch, - OUT_PATH, - model_losses=eval_avg_loss_dict, - ) + best_loss = save_best_model(target_loss, + best_loss, + model_gen, + optimizer_gen, + scheduler_gen, + model_disc, + optimizer_disc, + scheduler_disc, + global_step, + epoch, + OUT_PATH, + model_losses=eval_avg_loss_dict) -if __name__ == "__main__": +if __name__ == '__main__': parser = argparse.ArgumentParser() 
parser.add_argument( - "--continue_path", + '--continue_path', type=str, help='Training output folder to continue training. Use to continue a training. If it is used, "config_path" is ignored.', - default="", - required="--config_path" not in sys.argv, - ) + default='', + required='--config_path' not in sys.argv) parser.add_argument( - "--restore_path", + '--restore_path', type=str, - help="Model file to be restored. Use to finetune a model.", - default="", - ) - parser.add_argument( - "--config_path", - type=str, - help="Path to config file for training.", - required="--continue_path" not in sys.argv, - ) - parser.add_argument( - "--debug", - type=bool, - default=False, - help="Do not verify commit integrity to run training.", - ) + help='Model file to be restored. Use to finetune a model.', + default='') + parser.add_argument('--config_path', + type=str, + help='Path to config file for training.', + required='--continue_path' not in sys.argv) + parser.add_argument('--debug', + type=bool, + default=False, + help='Do not verify commit integrity to run training.') # DISTRUBUTED parser.add_argument( - "--rank", + '--rank', type=int, default=0, - help="DISTRIBUTED: process rank for distributed training.", - ) - parser.add_argument( - "--group_id", type=str, default="", help="DISTRIBUTED: process group id." - ) + help='DISTRIBUTED: process rank for distributed training.') + parser.add_argument('--group_id', + type=str, + default="", + help='DISTRIBUTED: process group id.') args = parser.parse_args() - if args.continue_path != "": + if args.continue_path != '': args.output_path = args.continue_path - args.config_path = os.path.join(args.continue_path, "config.json") + args.config_path = os.path.join(args.continue_path, 'config.json') list_of_files = glob.glob( - args.continue_path + "/*.pth.tar" - ) # * means all if need specific format then *.csv + args.continue_path + + "/*.pth.tar") # * means all if need specific format then *.csv latest_model_file = max(list_of_files, key=os.path.getctime) args.restore_path = latest_model_file print(f" > Training continues for {args.restore_path}") @@ -662,10 +618,11 @@ if __name__ == "__main__": _ = os.path.dirname(os.path.realpath(__file__)) OUT_PATH = args.continue_path - if args.continue_path == "": - OUT_PATH = create_experiment_folder(c.output_path, c.run_name, args.debug) + if args.continue_path == '': + OUT_PATH = create_experiment_folder(c.output_path, c.run_name, + args.debug) - AUDIO_PATH = os.path.join(OUT_PATH, "test_audios") + AUDIO_PATH = os.path.join(OUT_PATH, 'test_audios') c_logger = ConsoleLogger() @@ -675,17 +632,16 @@ if __name__ == "__main__": if args.restore_path: new_fields["restore_path"] = args.restore_path new_fields["github_branch"] = get_git_branch() - copy_config_file( - args.config_path, os.path.join(OUT_PATH, "config.json"), new_fields - ) + copy_config_file(args.config_path, + os.path.join(OUT_PATH, 'config.json'), new_fields) os.chmod(AUDIO_PATH, 0o775) os.chmod(OUT_PATH, 0o775) LOG_DIR = OUT_PATH - tb_logger = TensorboardLogger(LOG_DIR, model_name="VOCODER") + tb_logger = TensorboardLogger(LOG_DIR, model_name='VOCODER') # write model desc to tensorboard - tb_logger.tb_add_text("model-description", c["run_description"], 0) + tb_logger.tb_add_text('model-description', c['run_description'], 0) try: main(args) @@ -698,4 +654,4 @@ if __name__ == "__main__": except Exception: # pylint: disable=broad-except remove_experiment_folder(OUT_PATH) traceback.print_exc() - sys.exit(1) \ No newline at end of file + sys.exit(1) diff --git 
a/TTS/vocoder/models/wavernn.py b/TTS/vocoder/models/wavernn.py index 9b637a6a..4d1a633c 100644 --- a/TTS/vocoder/models/wavernn.py +++ b/TTS/vocoder/models/wavernn.py @@ -365,28 +365,6 @@ class WaveRNN(nn.Module): (i * b_size, seq_len * b_size, b_size, gen_rate, realtime_ratio), ) - @staticmethod - def get_gru_cell(gru): - gru_cell = nn.GRUCell(gru.input_size, gru.hidden_size) - gru_cell.weight_hh.data = gru.weight_hh_l0.data - gru_cell.weight_ih.data = gru.weight_ih_l0.data - gru_cell.bias_hh.data = gru.bias_hh_l0.data - gru_cell.bias_ih.data = gru.bias_ih_l0.data - return gru_cell - - @staticmethod - def pad_tensor(x, pad, side="both"): - # NB - this is just a quick method i need right now - # i.e., it won't generalise to other shapes/dims - b, t, c = x.size() - total = t + 2 * pad if side == "both" else t + pad - padded = torch.zeros(b, total, c).cuda() - if side in ("before", "both"): - padded[:, pad : pad + t, :] = x - elif side == "after": - padded[:, :t, :] = x - return padded - def fold_with_overlap(self, x, target, overlap): """Fold the tensor with overlap for quick batched inference. @@ -430,7 +408,30 @@ class WaveRNN(nn.Module): return folded - def xfade_and_unfold(self, y, target, overlap): + @staticmethod + def get_gru_cell(gru): + gru_cell = nn.GRUCell(gru.input_size, gru.hidden_size) + gru_cell.weight_hh.data = gru.weight_hh_l0.data + gru_cell.weight_ih.data = gru.weight_ih_l0.data + gru_cell.bias_hh.data = gru.bias_hh_l0.data + gru_cell.bias_ih.data = gru.bias_ih_l0.data + return gru_cell + + @staticmethod + def pad_tensor(x, pad, side="both"): + # NB - this is just a quick method i need right now + # i.e., it won't generalise to other shapes/dims + b, t, c = x.size() + total = t + 2 * pad if side == "both" else t + pad + padded = torch.zeros(b, total, c).cuda() + if side in ("before", "both"): + padded[:, pad : pad + t, :] = x + elif side == "after": + padded[:, :t, :] = x + return padded + + @staticmethod + def xfade_and_unfold(y, target, overlap): """Applies a crossfade and unfolds into a 1d array. Args: diff --git a/TTS/vocoder/utils/distribution.py b/TTS/vocoder/utils/distribution.py index 705c14dc..6aba5e34 100644 --- a/TTS/vocoder/utils/distribution.py +++ b/TTS/vocoder/utils/distribution.py @@ -28,7 +28,8 @@ def sample_from_gaussian(y_hat, log_std_min=-7.0, scale_factor=1.0): torch.exp(log_std), ) sample = dist.sample() - sample = torch.clamp(torch.clamp(sample, min=-scale_factor), max=scale_factor) + sample = torch.clamp(torch.clamp( + sample, min=-scale_factor), max=scale_factor) del dist return sample @@ -58,8 +59,9 @@ def discretized_mix_logistic_loss( # unpack parameters. 
(B, T, num_mixtures) x 3 logit_probs = y_hat[:, :, :nr_mix] - means = y_hat[:, :, nr_mix : 2 * nr_mix] - log_scales = torch.clamp(y_hat[:, :, 2 * nr_mix : 3 * nr_mix], min=log_scale_min) + means = y_hat[:, :, nr_mix: 2 * nr_mix] + log_scales = torch.clamp( + y_hat[:, :, 2 * nr_mix: 3 * nr_mix], min=log_scale_min) # B x T x 1 -> B x T x num_mixtures y = y.expand_as(means) @@ -104,7 +106,8 @@ def discretized_mix_logistic_loss( ) + (1.0 - inner_inner_cond) * (log_pdf_mid - np.log((num_classes - 1) / 2)) inner_cond = (y > 0.999).float() inner_out = ( - inner_cond * log_one_minus_cdf_min + (1.0 - inner_cond) * inner_inner_out + inner_cond * log_one_minus_cdf_min + + (1.0 - inner_cond) * inner_inner_out ) cond = (y < -0.999).float() log_probs = cond * log_cdf_plus + (1.0 - cond) * inner_out @@ -142,9 +145,9 @@ def sample_from_discretized_mix_logistic(y, log_scale_min=None): # (B, T) -> (B, T, nr_mix) one_hot = to_one_hot(argmax, nr_mix) # select logistic parameters - means = torch.sum(y[:, :, nr_mix : 2 * nr_mix] * one_hot, dim=-1) + means = torch.sum(y[:, :, nr_mix: 2 * nr_mix] * one_hot, dim=-1) log_scales = torch.clamp( - torch.sum(y[:, :, 2 * nr_mix : 3 * nr_mix] * one_hot, dim=-1), min=log_scale_min + torch.sum(y[:, :, 2 * nr_mix: 3 * nr_mix] * one_hot, dim=-1), min=log_scale_min ) # sample from logistic & clip to interval # we don't actually round to the nearest 8bit value when sampling diff --git a/TTS/vocoder/utils/generic_utils.py b/TTS/vocoder/utils/generic_utils.py index c73c5248..c16fa1ae 100644 --- a/TTS/vocoder/utils/generic_utils.py +++ b/TTS/vocoder/utils/generic_utils.py @@ -39,7 +39,7 @@ def plot_results(y_hat, y, ap, global_step, name_prefix): def to_camel(text): text = text.capitalize() - return re.sub(r"(?!^)_([a-zA-Z])", lambda m: m.group(1).upper(), text) + return re.sub(r'(?!^)_([a-zA-Z])', lambda m: m.group(1).upper(), text) def setup_wavernn(c): @@ -67,101 +67,92 @@ def setup_wavernn(c): def setup_generator(c): print(" > Generator Model: {}".format(c.generator_model)) - MyModel = importlib.import_module("TTS.vocoder.models." + c.generator_model.lower()) + MyModel = importlib.import_module('TTS.vocoder.models.' 
+ + c.generator_model.lower()) MyModel = getattr(MyModel, to_camel(c.generator_model)) - if c.generator_model in "melgan_generator": + if c.generator_model in 'melgan_generator': model = MyModel( - in_channels=c.audio["num_mels"], + in_channels=c.audio['num_mels'], out_channels=1, proj_kernel=7, base_channels=512, - upsample_factors=c.generator_model_params["upsample_factors"], + upsample_factors=c.generator_model_params['upsample_factors'], res_kernel=3, - num_res_blocks=c.generator_model_params["num_res_blocks"], - ) - if c.generator_model in "melgan_fb_generator": + num_res_blocks=c.generator_model_params['num_res_blocks']) + if c.generator_model in 'melgan_fb_generator': pass - if c.generator_model in "multiband_melgan_generator": + if c.generator_model in 'multiband_melgan_generator': model = MyModel( - in_channels=c.audio["num_mels"], + in_channels=c.audio['num_mels'], out_channels=4, proj_kernel=7, base_channels=384, - upsample_factors=c.generator_model_params["upsample_factors"], + upsample_factors=c.generator_model_params['upsample_factors'], res_kernel=3, - num_res_blocks=c.generator_model_params["num_res_blocks"], - ) - if c.generator_model in "fullband_melgan_generator": + num_res_blocks=c.generator_model_params['num_res_blocks']) + if c.generator_model in 'fullband_melgan_generator': model = MyModel( - in_channels=c.audio["num_mels"], + in_channels=c.audio['num_mels'], out_channels=1, proj_kernel=7, base_channels=512, - upsample_factors=c.generator_model_params["upsample_factors"], + upsample_factors=c.generator_model_params['upsample_factors'], res_kernel=3, - num_res_blocks=c.generator_model_params["num_res_blocks"], - ) - if c.generator_model in "parallel_wavegan_generator": + num_res_blocks=c.generator_model_params['num_res_blocks']) + if c.generator_model in 'parallel_wavegan_generator': model = MyModel( in_channels=1, out_channels=1, kernel_size=3, - num_res_blocks=c.generator_model_params["num_res_blocks"], - stacks=c.generator_model_params["stacks"], + num_res_blocks=c.generator_model_params['num_res_blocks'], + stacks=c.generator_model_params['stacks'], res_channels=64, gate_channels=128, skip_channels=64, - aux_channels=c.audio["num_mels"], + aux_channels=c.audio['num_mels'], dropout=0.0, bias=True, use_weight_norm=True, - upsample_factors=c.generator_model_params["upsample_factors"], - ) + upsample_factors=c.generator_model_params['upsample_factors']) return model def setup_discriminator(c): print(" > Discriminator Model: {}".format(c.discriminator_model)) - if "parallel_wavegan" in c.discriminator_model: + if 'parallel_wavegan' in c.discriminator_model: MyModel = importlib.import_module( - "TTS.vocoder.models.parallel_wavegan_discriminator" - ) + 'TTS.vocoder.models.parallel_wavegan_discriminator') else: - MyModel = importlib.import_module( - "TTS.vocoder.models." + c.discriminator_model.lower() - ) + MyModel = importlib.import_module('TTS.vocoder.models.' 
+ + c.discriminator_model.lower()) MyModel = getattr(MyModel, to_camel(c.discriminator_model.lower())) - if c.discriminator_model in "random_window_discriminator": + if c.discriminator_model in 'random_window_discriminator': model = MyModel( - cond_channels=c.audio["num_mels"], - hop_length=c.audio["hop_length"], - uncond_disc_donwsample_factors=c.discriminator_model_params[ - "uncond_disc_donwsample_factors" - ], - cond_disc_downsample_factors=c.discriminator_model_params[ - "cond_disc_downsample_factors" - ], - cond_disc_out_channels=c.discriminator_model_params[ - "cond_disc_out_channels" - ], - window_sizes=c.discriminator_model_params["window_sizes"], - ) - if c.discriminator_model in "melgan_multiscale_discriminator": + cond_channels=c.audio['num_mels'], + hop_length=c.audio['hop_length'], + uncond_disc_donwsample_factors=c. + discriminator_model_params['uncond_disc_donwsample_factors'], + cond_disc_downsample_factors=c. + discriminator_model_params['cond_disc_downsample_factors'], + cond_disc_out_channels=c. + discriminator_model_params['cond_disc_out_channels'], + window_sizes=c.discriminator_model_params['window_sizes']) + if c.discriminator_model in 'melgan_multiscale_discriminator': model = MyModel( in_channels=1, out_channels=1, kernel_sizes=(5, 3), - base_channels=c.discriminator_model_params["base_channels"], - max_channels=c.discriminator_model_params["max_channels"], - downsample_factors=c.discriminator_model_params["downsample_factors"], - ) - if c.discriminator_model == "residual_parallel_wavegan_discriminator": + base_channels=c.discriminator_model_params['base_channels'], + max_channels=c.discriminator_model_params['max_channels'], + downsample_factors=c. + discriminator_model_params['downsample_factors']) + if c.discriminator_model == 'residual_parallel_wavegan_discriminator': model = MyModel( in_channels=1, out_channels=1, kernel_size=3, - num_layers=c.discriminator_model_params["num_layers"], - stacks=c.discriminator_model_params["stacks"], + num_layers=c.discriminator_model_params['num_layers'], + stacks=c.discriminator_model_params['stacks'], res_channels=64, gate_channels=128, skip_channels=64, @@ -170,17 +161,17 @@ def setup_discriminator(c): nonlinear_activation="LeakyReLU", nonlinear_activation_params={"negative_slope": 0.2}, ) - if c.discriminator_model == "parallel_wavegan_discriminator": + if c.discriminator_model == 'parallel_wavegan_discriminator': model = MyModel( in_channels=1, out_channels=1, kernel_size=3, - num_layers=c.discriminator_model_params["num_layers"], + num_layers=c.discriminator_model_params['num_layers'], conv_channels=64, dilation_factor=1, nonlinear_activation="LeakyReLU", nonlinear_activation_params={"negative_slope": 0.2}, - bias=True, + bias=True ) return model From b7f9ebd32be758c59649f9ec489bc2bd49840ff4 Mon Sep 17 00:00:00 2001 From: Edresson Date: Mon, 19 Oct 2020 17:17:58 -0300 Subject: [PATCH 09/98] add check arguments for GlowTTS and multispeaker training bug fix --- TTS/bin/train_glow_tts.py | 3 +- TTS/tts/models/glow_tts.py | 12 +++++-- TTS/tts/utils/generic_utils.py | 59 ++++++++++++++++++---------------- 3 files changed, 44 insertions(+), 30 deletions(-) diff --git a/TTS/bin/train_glow_tts.py b/TTS/bin/train_glow_tts.py index 3d34d978..c5e570e5 100644 --- a/TTS/bin/train_glow_tts.py +++ b/TTS/bin/train_glow_tts.py @@ -15,7 +15,7 @@ from TTS.tts.datasets.TTSDataset import MyDataset from TTS.tts.layers.losses import GlowTTSLoss from TTS.tts.utils.distribute import (DistributedSampler, init_distributed, reduce_tensor) -from 
TTS.tts.utils.generic_utils import setup_model +from TTS.tts.utils.generic_utils import setup_model, check_config_tts from TTS.tts.utils.io import save_best_model, save_checkpoint from TTS.tts.utils.measures import alignment_diagonal_score from TTS.tts.utils.speakers import (get_speakers, load_speaker_mapping, @@ -602,6 +602,7 @@ if __name__ == '__main__': # setup output paths and read configs c = load_config(args.config_path) # check_config(c) + check_config_tts(c) _ = os.path.dirname(os.path.realpath(__file__)) if c.apex_amp_level: diff --git a/TTS/tts/models/glow_tts.py b/TTS/tts/models/glow_tts.py index 902de699..a9b6f8c0 100644 --- a/TTS/tts/models/glow_tts.py +++ b/TTS/tts/models/glow_tts.py @@ -37,7 +37,8 @@ class GlowTts(nn.Module): hidden_channels_enc=None, hidden_channels_dec=None, use_encoder_prenet=False, - encoder_type="transformer"): + encoder_type="transformer", + external_speaker_embedding_dim=None): super().__init__() self.num_chars = num_chars @@ -68,6 +69,13 @@ class GlowTts(nn.Module): self.noise_scale = 0.66 self.length_scale = 1. + # if is a multispeaker and c_in_channels is 0, set to 256 + if num_speakers > 1: + if self.c_in_channels == 0 and not external_speaker_embedding_dim: + self.c_in_channels = 256 + elif external_speaker_embedding_dim: + self.c_in_channels = external_speaker_embedding_dim + self.encoder = Encoder(num_chars, out_channels=out_channels, hidden_channels=hidden_channels, @@ -94,7 +102,7 @@ class GlowTts(nn.Module): sigmoid_scale=sigmoid_scale, c_in_channels=c_in_channels) - if num_speakers > 1: + if num_speakers > 1 and not external_speaker_embedding_dim: self.emb_g = nn.Embedding(num_speakers, c_in_channels) nn.init.uniform_(self.emb_g.weight, -0.1, 0.1) diff --git a/TTS/tts/utils/generic_utils.py b/TTS/tts/utils/generic_utils.py index 5480cbcd..aacac898 100644 --- a/TTS/tts/utils/generic_utils.py +++ b/TTS/tts/utils/generic_utils.py @@ -129,10 +129,11 @@ def setup_model(num_chars, num_speakers, c, speaker_embedding_dim=None): use_encoder_prenet=True) return model - +def is_tacotron(c): + return False if c['model'] == 'glow_tts' else True def check_config_tts(c): - check_argument('model', c, enum_list=['tacotron', 'tacotron2'], restricted=True, val_type=str) + check_argument('model', c, enum_list=['tacotron', 'tacotron2', 'glow_tts'], restricted=True, val_type=str) check_argument('run_name', c, restricted=True, val_type=str) check_argument('run_description', c, val_type=str) @@ -195,27 +196,30 @@ def check_config_tts(c): check_argument('seq_len_norm', c, restricted=True, val_type=bool) # tacotron prenet - check_argument('memory_size', c, restricted=True, val_type=int, min_val=-1) - check_argument('prenet_type', c, restricted=True, val_type=str, enum_list=['original', 'bn']) - check_argument('prenet_dropout', c, restricted=True, val_type=bool) + check_argument('memory_size', c, restricted=is_tacotron(c), val_type=int, min_val=-1) + check_argument('prenet_type', c, restricted=is_tacotron(c), val_type=str, enum_list=['original', 'bn']) + check_argument('prenet_dropout', c, restricted=is_tacotron(c), val_type=bool) # attention - check_argument('attention_type', c, restricted=True, val_type=str, enum_list=['graves', 'original']) - check_argument('attention_heads', c, restricted=True, val_type=int) - check_argument('attention_norm', c, restricted=True, val_type=str, enum_list=['sigmoid', 'softmax']) - check_argument('windowing', c, restricted=True, val_type=bool) - check_argument('use_forward_attn', c, restricted=True, val_type=bool) - 
check_argument('forward_attn_mask', c, restricted=True, val_type=bool) - check_argument('transition_agent', c, restricted=True, val_type=bool) - check_argument('transition_agent', c, restricted=True, val_type=bool) - check_argument('location_attn', c, restricted=True, val_type=bool) - check_argument('bidirectional_decoder', c, restricted=True, val_type=bool) - check_argument('double_decoder_consistency', c, restricted=True, val_type=bool) + check_argument('attention_type', c, restricted=is_tacotron(c), val_type=str, enum_list=['graves', 'original']) + check_argument('attention_heads', c, restricted=is_tacotron(c), val_type=int) + check_argument('attention_norm', c, restricted=is_tacotron(c), val_type=str, enum_list=['sigmoid', 'softmax']) + check_argument('windowing', c, restricted=is_tacotron(c), val_type=bool) + check_argument('use_forward_attn', c, restricted=is_tacotron(c), val_type=bool) + check_argument('forward_attn_mask', c, restricted=is_tacotron(c), val_type=bool) + check_argument('transition_agent', c, restricted=is_tacotron(c), val_type=bool) + check_argument('transition_agent', c, restricted=is_tacotron(c), val_type=bool) + check_argument('location_attn', c, restricted=is_tacotron(c), val_type=bool) + check_argument('bidirectional_decoder', c, restricted=is_tacotron(c), val_type=bool) + check_argument('double_decoder_consistency', c, restricted=is_tacotron(c), val_type=bool) check_argument('ddc_r', c, restricted='double_decoder_consistency' in c.keys(), min_val=1, max_val=7, val_type=int) # stopnet - check_argument('stopnet', c, restricted=True, val_type=bool) - check_argument('separate_stopnet', c, restricted=True, val_type=bool) + check_argument('stopnet', c, restricted=is_tacotron(c), val_type=bool) + check_argument('separate_stopnet', c, restricted=is_tacotron(c), val_type=bool) + + # GlowTTS parameters + check_argument('encoder_type', c, restricted=not is_tacotron(c), val_type=str) # tensorboard check_argument('print_step', c, restricted=True, val_type=int, min_val=1) @@ -240,15 +244,16 @@ def check_config_tts(c): # multi-speaker and gst check_argument('use_speaker_embedding', c, restricted=True, val_type=bool) - check_argument('use_external_speaker_embedding_file', c, restricted=True, val_type=bool) - check_argument('external_speaker_embedding_file', c, restricted=True, val_type=str) - check_argument('use_gst', c, restricted=True, val_type=bool) - check_argument('gst', c, restricted=True, val_type=dict) - check_argument('gst_style_input', c['gst'], restricted=True, val_type=[str, dict]) - check_argument('gst_embedding_dim', c['gst'], restricted=True, val_type=int, min_val=0, max_val=1000) - check_argument('gst_use_speaker_embedding', c['gst'], restricted=True, val_type=bool) - check_argument('gst_num_heads', c['gst'], restricted=True, val_type=int, min_val=2, max_val=10) - check_argument('gst_style_tokens', c['gst'], restricted=True, val_type=int, min_val=1, max_val=1000) + check_argument('use_external_speaker_embedding_file', c, restricted=True if c['use_speaker_embedding'] else False, val_type=bool) + check_argument('external_speaker_embedding_file', c, restricted=True if c['use_external_speaker_embedding_file'] else False, val_type=str) + check_argument('use_gst', c, restricted=is_tacotron(c), val_type=bool) + if c['use_gst']: + check_argument('gst', c, restricted=is_tacotron(c), val_type=dict) + check_argument('gst_style_input', c['gst'], restricted=is_tacotron(c), val_type=[str, dict]) + check_argument('gst_embedding_dim', c['gst'], restricted=is_tacotron(c), 
val_type=int, min_val=0, max_val=1000) + check_argument('gst_use_speaker_embedding', c['gst'], restricted=is_tacotron(c), val_type=bool) + check_argument('gst_num_heads', c['gst'], restricted=is_tacotron(c), val_type=int, min_val=2, max_val=10) + check_argument('gst_style_tokens', c['gst'], restricted=is_tacotron(c), val_type=int, min_val=1, max_val=1000) # datasets - checking only the first entry check_argument('datasets', c, restricted=True, val_type=list) From 9270e27cd7df82f5967174da18e0e92967674120 Mon Sep 17 00:00:00 2001 From: sanjaesc Date: Thu, 22 Oct 2020 10:39:20 +0200 Subject: [PATCH 10/98] add wavernn tests + name refactoring --- tests/inputs/test_vocoder_wavernn_config.json | 94 +++++++++++++++++++ ...tasets.py => test_vocoder_gan_datasets.py} | 0 ...der_train.sh => test_vocoder_gan_train.sh} | 4 +- tests/test_vocoder_wavernn.py | 31 ++++++ tests/test_vocoder_wavernn_datasets.py | 91 ++++++++++++++++++ tests/test_vocoder_wavernn_train.sh | 15 +++ 6 files changed, 233 insertions(+), 2 deletions(-) create mode 100644 tests/inputs/test_vocoder_wavernn_config.json rename tests/{test_vocoder_datasets.py => test_vocoder_gan_datasets.py} (100%) rename tests/{test_vocoder_train.sh => test_vocoder_gan_train.sh} (57%) create mode 100644 tests/test_vocoder_wavernn.py create mode 100644 tests/test_vocoder_wavernn_datasets.py create mode 100755 tests/test_vocoder_wavernn_train.sh diff --git a/tests/inputs/test_vocoder_wavernn_config.json b/tests/inputs/test_vocoder_wavernn_config.json new file mode 100644 index 00000000..28c0f059 --- /dev/null +++ b/tests/inputs/test_vocoder_wavernn_config.json @@ -0,0 +1,94 @@ +{ + "run_name": "wavernn_test", + "run_description": "wavernn_test training", + + // AUDIO PARAMETERS + "audio":{ + "fft_size": 1024, // number of stft frequency levels. Size of the linear spectogram frame. + "win_length": 1024, // stft window length in ms. + "hop_length": 256, // stft window hop-lengh in ms. + "frame_length_ms": null, // stft window length in ms.If null, 'win_length' is used. + "frame_shift_ms": null, // stft window hop-lengh in ms. If null, 'hop_length' is used. + + // Audio processing parameters + "sample_rate": 22050, // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled. + "preemphasis": 0.0, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis. + "ref_level_db": 0, // reference level db, theoretically 20db is the sound of air. + + // Silence trimming + "do_trim_silence": true,// enable trimming of slience of audio as you load it. LJspeech (false), TWEB (false), Nancy (true) + "trim_db": 60, // threshold for timming silence. Set this according to your dataset. + + // MelSpectrogram parameters + "num_mels": 80, // size of the mel spec frame. + "mel_fmin": 0.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!! + "mel_fmax": 8000.0, // maximum freq level for mel-spec. Tune for dataset!! + "spec_gain": 20.0, // scaler value appplied after log transform of spectrogram. + + // Normalization parameters + "signal_norm": true, // normalize spec values. Mean-Var normalization if 'stats_path' is defined otherwise range normalization defined by the other params. + "min_level_db": -100, // lower bound for normalization + "symmetric_norm": true, // move normalization to range [-1, 1] + "max_norm": 4.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm] + "clip_norm": true, // clip normalized values into the range. 
+ "stats_path": null // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based notmalization is used and other normalization params are ignored + }, + + // Generating / Synthesizing + "batched": true, + "target_samples": 11000, // target number of samples to be generated in each batch entry + "overlap_samples": 550, // number of samples for crossfading between batches + + // DISTRIBUTED TRAINING + // "distributed":{ + // "backend": "nccl", + // "url": "tcp:\/\/localhost:54321" + // }, + + // MODEL PARAMETERS + "use_aux_net": true, + "use_upsample_net": true, + "upsample_factors": [4, 8, 8], // this needs to correctly factorise hop_length + "seq_len": 1280, // has to be devideable by hop_length + "mode": "mold", // mold [string], gauss [string], bits [int] + "mulaw": false, // apply mulaw if mode is bits + "padding": 2, // pad the input for resnet to see wider input length + + // DATASET + //"use_gta": true, // use computed gta features from the tts model + "data_path": "tests/data/ljspeech/wavs/", // path containing training wav files + "feature_path": null, // path containing computed features from wav files if null compute them + + // TRAINING + "batch_size": 4, // Batch size for training. Lower values than 32 might cause hard to learn attention. + "epochs": 1, // total number of epochs to train. + + // VALIDATION + "run_eval": true, + "test_every_epochs": 10, // Test after set number of epochs (Test every 20 epochs for example) + + // OPTIMIZER + "grad_clip": 4, // apply gradient clipping if > 0 + "lr_scheduler": "MultiStepLR", // one of the schedulers from https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate + "lr_scheduler_params": { + "gamma": 0.5, + "milestones": [200000, 400000, 600000] + }, + "lr": 1e-4, // initial learning rate + + // TENSORBOARD and LOGGING + "print_step": 25, // Number of steps to log traning on console. + "print_eval": false, // If True, it prints loss values for each step in eval run. + "save_step": 25000, // Number of training steps expected to plot training stats on TB and save model checkpoints. + "checkpoint": true, // If true, it saves checkpoints per "save_step" + "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging. + + // DATA LOADING + "num_loader_workers": 4, // number of training data loader processes. Don't set it too big. 4-8 are good values. + "num_val_loader_workers": 4, // number of evaluation data loader processes. 
+ "eval_split_size": 10, // number of samples for testing + + // PATHS + "output_path": "tests/train_outputs/" +} + diff --git a/tests/test_vocoder_datasets.py b/tests/test_vocoder_gan_datasets.py similarity index 100% rename from tests/test_vocoder_datasets.py rename to tests/test_vocoder_gan_datasets.py diff --git a/tests/test_vocoder_train.sh b/tests/test_vocoder_gan_train.sh similarity index 57% rename from tests/test_vocoder_train.sh rename to tests/test_vocoder_gan_train.sh index fa99b4bd..75773cc3 100755 --- a/tests/test_vocoder_train.sh +++ b/tests/test_vocoder_gan_train.sh @@ -5,11 +5,11 @@ echo "$BASEDIR" # create run dir mkdir $BASEDIR/train_outputs # run training -CUDA_VISIBLE_DEVICES="" python TTS/bin/train_vocoder.py --config_path $BASEDIR/inputs/test_vocoder_multiband_melgan_config.json +CUDA_VISIBLE_DEVICES="" python TTS/bin/train_gan_vocoder.py --config_path $BASEDIR/inputs/test_vocoder_multiband_melgan_config.json # find the training folder LATEST_FOLDER=$(ls $BASEDIR/train_outputs/| sort | tail -1) echo $LATEST_FOLDER # continue the previous training -CUDA_VISIBLE_DEVICES="" python TTS/bin/train_vocoder.py --continue_path $BASEDIR/train_outputs/$LATEST_FOLDER +CUDA_VISIBLE_DEVICES="" python TTS/bin/train_gan_vocoder.py --continue_path $BASEDIR/train_outputs/$LATEST_FOLDER # remove all the outputs rm -rf $BASEDIR/train_outputs/$LATEST_FOLDER diff --git a/tests/test_vocoder_wavernn.py b/tests/test_vocoder_wavernn.py new file mode 100644 index 00000000..fdb338f9 --- /dev/null +++ b/tests/test_vocoder_wavernn.py @@ -0,0 +1,31 @@ +import numpy as np +import torch +import random +from TTS.vocoder.models.wavernn import WaveRNN + + +def test_wavernn(): + model = WaveRNN( + rnn_dims=512, + fc_dims=512, + mode=10, + mulaw=False, + pad=2, + use_aux_net=True, + use_upsample_net=True, + upsample_factors=[4, 8, 8], + feat_dims=80, + compute_dims=128, + res_out_dims=128, + res_blocks=10, + hop_length=256, + sample_rate=22050, + ) + dummy_x = torch.rand((2, 1280)) + dummy_m = torch.rand((2, 80, 9)) + y_size = random.randrange(20, 60) + dummy_y = torch.rand((80, y_size)) + output = model(dummy_x, dummy_m) + assert np.all(output.shape == (2, 1280, 4 * 256)), output.shape + output = model.generate(dummy_y, True, 5500, 550, False) + assert np.all(output.shape == (256 * (y_size - 1),)) diff --git a/tests/test_vocoder_wavernn_datasets.py b/tests/test_vocoder_wavernn_datasets.py new file mode 100644 index 00000000..0f4e939a --- /dev/null +++ b/tests/test_vocoder_wavernn_datasets.py @@ -0,0 +1,91 @@ +import os +import shutil + +import numpy as np +from tests import get_tests_path, get_tests_input_path, get_tests_output_path +from torch.utils.data import DataLoader + +from TTS.utils.audio import AudioProcessor +from TTS.utils.io import load_config +from TTS.vocoder.datasets.wavernn_dataset import WaveRNNDataset +from TTS.vocoder.datasets.preprocess import load_wav_feat_data, preprocess_wav_files + +file_path = os.path.dirname(os.path.realpath(__file__)) +OUTPATH = os.path.join(get_tests_output_path(), "loader_tests/") +os.makedirs(OUTPATH, exist_ok=True) + +C = load_config(os.path.join(get_tests_input_path(), + "test_vocoder_wavernn_config.json")) + +test_data_path = os.path.join(get_tests_path(), "data/ljspeech/") +test_mel_feat_path = os.path.join(test_data_path, "mel") +test_quant_feat_path = os.path.join(test_data_path, "quant") +ok_ljspeech = os.path.exists(test_data_path) + + +def wavernn_dataset_case(batch_size, seq_len, hop_len, pad, mode, num_workers): + """ run dataloader with given 
parameters and check conditions """ + ap = AudioProcessor(**C.audio) + + C.batch_size = batch_size + C.mode = mode + C.seq_len = seq_len + C.data_path = test_data_path + + preprocess_wav_files(test_data_path, C, ap) + _, train_items = load_wav_feat_data( + test_data_path, test_mel_feat_path, 5) + + dataset = WaveRNNDataset(ap=ap, + items=train_items, + seq_len=seq_len, + hop_len=hop_len, + pad=pad, + mode=mode, + ) + # sampler = DistributedSampler(dataset) if num_gpus > 1 else None + loader = DataLoader(dataset, + shuffle=True, + collate_fn=dataset.collate, + batch_size=batch_size, + num_workers=num_workers, + pin_memory=True, + ) + + max_iter = 10 + count_iter = 0 + + try: + for data in loader: + x_input, mels, _ = data + expected_feat_shape = (ap.num_mels, + (x_input.shape[-1] // hop_len) + (pad * 2)) + assert np.all( + mels.shape[1:] == expected_feat_shape), f" [!] {mels.shape} vs {expected_feat_shape}" + + assert (mels.shape[2] - pad * 2) * hop_len == x_input.shape[1] + count_iter += 1 + if count_iter == max_iter: + break + # except AssertionError: + # shutil.rmtree(test_mel_feat_path) + # shutil.rmtree(test_quant_feat_path) + finally: + shutil.rmtree(test_mel_feat_path) + shutil.rmtree(test_quant_feat_path) + + +def test_parametrized_wavernn_dataset(): + ''' test dataloader with different parameters ''' + params = [ + [16, C.audio['hop_length'] * 10, C.audio['hop_length'], 2, 10, 0], + [16, C.audio['hop_length'] * 10, C.audio['hop_length'], 2, "mold", 4], + [1, C.audio['hop_length'] * 10, C.audio['hop_length'], 2, 9, 0], + [1, C.audio['hop_length'], C.audio['hop_length'], 2, 10, 0], + [1, C.audio['hop_length'], C.audio['hop_length'], 2, "mold", 0], + [1, C.audio['hop_length'] * 5, C.audio['hop_length'], 4, 10, 2], + [1, C.audio['hop_length'] * 5, C.audio['hop_length'], 2, "mold", 0], + ] + for param in params: + print(param) + wavernn_dataset_case(*param) diff --git a/tests/test_vocoder_wavernn_train.sh b/tests/test_vocoder_wavernn_train.sh new file mode 100755 index 00000000..f2e32116 --- /dev/null +++ b/tests/test_vocoder_wavernn_train.sh @@ -0,0 +1,15 @@ +#!/usr/bin/env bash + +BASEDIR=$(dirname "$0") +echo "$BASEDIR" +# create run dir +mkdir $BASEDIR/train_outputs +# run training +CUDA_VISIBLE_DEVICES="" python TTS/bin/train_wavernn_vocoder.py --config_path $BASEDIR/inputs/test_vocoder_wavernn_config.json +# find the training folder +LATEST_FOLDER=$(ls $BASEDIR/train_outputs/| sort | tail -1) +echo $LATEST_FOLDER +# continue the previous training +CUDA_VISIBLE_DEVICES="" python TTS/bin/train_wavernn_vocoder.py --continue_path $BASEDIR/train_outputs/$LATEST_FOLDER +# remove all the outputs +rm -rf $BASEDIR/train_outputs/$LATEST_FOLDER \ No newline at end of file From 6245dd2b93a1c58215fc73c96274ca99c02ccf33 Mon Sep 17 00:00:00 2001 From: sanjaesc Date: Thu, 22 Oct 2020 10:44:00 +0200 Subject: [PATCH 11/98] added to device cpu/gpu + formatting --- TTS/bin/train_wavernn_vocoder.py | 182 ++++++++++++------------ TTS/vocoder/datasets/wavernn_dataset.py | 34 ++--- TTS/vocoder/models/wavernn.py | 66 +++++---- 3 files changed, 145 insertions(+), 137 deletions(-) diff --git a/TTS/bin/train_wavernn_vocoder.py b/TTS/bin/train_wavernn_vocoder.py index 78984510..66a7c913 100644 --- a/TTS/bin/train_wavernn_vocoder.py +++ b/TTS/bin/train_wavernn_vocoder.py @@ -44,43 +44,41 @@ def setup_loader(ap, is_val=False, verbose=False): if is_val and not CONFIG.run_eval: loader = None else: - dataset = WaveRNNDataset( - ap=ap, - items=eval_data if is_val else train_data, - seq_len=CONFIG.seq_len, - 
hop_len=ap.hop_length, - pad=CONFIG.padding, - mode=CONFIG.mode, - is_training=not is_val, - verbose=verbose, - ) + dataset = WaveRNNDataset(ap=ap, + items=eval_data if is_val else train_data, + seq_len=CONFIG.seq_len, + hop_len=ap.hop_length, + pad=CONFIG.padding, + mode=CONFIG.mode, + is_training=not is_val, + verbose=verbose, + ) # sampler = DistributedSampler(dataset) if num_gpus > 1 else None - loader = DataLoader( - dataset, - shuffle=True, - collate_fn=dataset.collate, - batch_size=CONFIG.batch_size, - num_workers=CONFIG.num_val_loader_workers - if is_val - else CONFIG.num_loader_workers, - pin_memory=True, - ) + loader = DataLoader(dataset, + shuffle=True, + collate_fn=dataset.collate, + batch_size=CONFIG.batch_size, + num_workers=CONFIG.num_val_loader_workers + if is_val + else CONFIG.num_loader_workers, + pin_memory=True, + ) return loader def format_data(data): # setup input data - x = data[0] - m = data[1] - y = data[2] + x_input = data[0] + mels = data[1] + y_coarse = data[2] # dispatch data to GPU if use_cuda: - x = x.cuda(non_blocking=True) - m = m.cuda(non_blocking=True) - y = y.cuda(non_blocking=True) + x_input = x_input.cuda(non_blocking=True) + mels = mels.cuda(non_blocking=True) + y_coarse = y_coarse.cuda(non_blocking=True) - return x, m, y + return x_input, mels, y_coarse def train(model, optimizer, criterion, scheduler, ap, global_step, epoch): @@ -90,7 +88,8 @@ def train(model, optimizer, criterion, scheduler, ap, global_step, epoch): epoch_time = 0 keep_avg = KeepAverage() if use_cuda: - batch_n_iter = int(len(data_loader.dataset) / (CONFIG.batch_size * num_gpus)) + batch_n_iter = int(len(data_loader.dataset) / + (CONFIG.batch_size * num_gpus)) else: batch_n_iter = int(len(data_loader.dataset) / CONFIG.batch_size) end_time = time.time() @@ -99,30 +98,31 @@ def train(model, optimizer, criterion, scheduler, ap, global_step, epoch): print(" > Training", flush=True) for num_iter, data in enumerate(data_loader): start_time = time.time() - x, m, y = format_data(data) + x_input, mels, y_coarse = format_data(data) loader_time = time.time() - end_time global_step += 1 ################## # MODEL TRAINING # ################## - y_hat = model(x, m) + y_hat = model(x_input, mels) if isinstance(model.mode, int): y_hat = y_hat.transpose(1, 2).unsqueeze(-1) else: - y = y.float() - y = y.unsqueeze(-1) + y_coarse = y_coarse.float() + y_coarse = y_coarse.unsqueeze(-1) # m_scaled, _ = model.upsample(m) # compute losses - loss = criterion(y_hat, y) + loss = criterion(y_hat, y_coarse) if loss.item() is None: raise RuntimeError(" [!] None loss. 
Exiting ...") optimizer.zero_grad() loss.backward() if CONFIG.grad_clip > 0: - torch.nn.utils.clip_grad_norm_(model.parameters(), CONFIG.grad_clip) + torch.nn.utils.clip_grad_norm_( + model.parameters(), CONFIG.grad_clip) optimizer.step() if scheduler is not None: @@ -145,19 +145,17 @@ def train(model, optimizer, criterion, scheduler, ap, global_step, epoch): # print training stats if global_step % CONFIG.print_step == 0: - log_dict = { - "step_time": [step_time, 2], - "loader_time": [loader_time, 4], - "current_lr": cur_lr, - } - c_logger.print_train_step( - batch_n_iter, - num_iter, - global_step, - log_dict, - loss_dict, - keep_avg.avg_values, - ) + log_dict = {"step_time": [step_time, 2], + "loader_time": [loader_time, 4], + "current_lr": cur_lr, + } + c_logger.print_train_step(batch_n_iter, + num_iter, + global_step, + log_dict, + loss_dict, + keep_avg.avg_values, + ) # plot step stats if global_step % 10 == 0: @@ -169,40 +167,38 @@ def train(model, optimizer, criterion, scheduler, ap, global_step, epoch): if global_step % CONFIG.save_step == 0: if CONFIG.checkpoint: # save model - save_checkpoint( - model, - optimizer, - scheduler, - None, - None, - None, - global_step, - epoch, - OUT_PATH, - model_losses=loss_dict, - ) + save_checkpoint(model, + optimizer, + scheduler, + None, + None, + None, + global_step, + epoch, + OUT_PATH, + model_losses=loss_dict, + ) # synthesize a full voice wav_path = train_data[random.randrange(0, len(train_data))][0] wav = ap.load_wav(wav_path) ground_mel = ap.melspectrogram(wav) - sample_wav = model.generate( - ground_mel, - CONFIG.batched, - CONFIG.target_samples, - CONFIG.overlap_samples, - ) + sample_wav = model.generate(ground_mel, + CONFIG.batched, + CONFIG.target_samples, + CONFIG.overlap_samples, + ) predict_mel = ap.melspectrogram(sample_wav) # compute spectrograms - figures = { - "train/ground_truth": plot_spectrogram(ground_mel.T), - "train/prediction": plot_spectrogram(predict_mel.T), - } + figures = {"train/ground_truth": plot_spectrogram(ground_mel.T), + "train/prediction": plot_spectrogram(predict_mel.T), + } # Sample audio tb_logger.tb_train_audios( - global_step, {"train/audio": sample_wav}, CONFIG.audio["sample_rate"] + global_step, { + "train/audio": sample_wav}, CONFIG.audio["sample_rate"] ) tb_logger.tb_train_figures(global_step, figures) @@ -234,17 +230,17 @@ def evaluate(model, criterion, ap, global_step, epoch): for num_iter, data in enumerate(data_loader): start_time = time.time() # format data - x, m, y = format_data(data) + x_input, mels, y_coarse = format_data(data) loader_time = time.time() - end_time global_step += 1 - y_hat = model(x, m) + y_hat = model(x_input, mels) if isinstance(model.mode, int): y_hat = y_hat.transpose(1, 2).unsqueeze(-1) else: - y = y.float() - y = y.unsqueeze(-1) - loss = criterion(y_hat, y) + y_coarse = y_coarse.float() + y_coarse = y_coarse.unsqueeze(-1) + loss = criterion(y_hat, y_coarse) # Compute avg loss # if num_gpus > 1: # loss = reduce_tensor(loss.data, num_gpus) @@ -264,30 +260,31 @@ def evaluate(model, criterion, ap, global_step, epoch): # print eval stats if CONFIG.print_eval: - c_logger.print_eval_step(num_iter, loss_dict, keep_avg.avg_values) + c_logger.print_eval_step( + num_iter, loss_dict, keep_avg.avg_values) - if epoch % CONFIG.test_every_epochs == 0: + if epoch % CONFIG.test_every_epochs == 0 and epoch != 0: # synthesize a part of data wav_path = eval_data[random.randrange(0, len(eval_data))][0] wav = ap.load_wav(wav_path) ground_mel = ap.melspectrogram(wav[:22000]) - sample_wav = 
model.generate( - ground_mel, - CONFIG.batched, - CONFIG.target_samples, - CONFIG.overlap_samples, - ) + sample_wav = model.generate(ground_mel, + CONFIG.batched, + CONFIG.target_samples, + CONFIG.overlap_samples, + use_cuda + ) predict_mel = ap.melspectrogram(sample_wav) # compute spectrograms - figures = { - "eval/ground_truth": plot_spectrogram(ground_mel.T), - "eval/prediction": plot_spectrogram(predict_mel.T), - } + figures = {"eval/ground_truth": plot_spectrogram(ground_mel.T), + "eval/prediction": plot_spectrogram(predict_mel.T), + } # Sample audio tb_logger.tb_eval_audios( - global_step, {"eval/audio": sample_wav}, CONFIG.audio["sample_rate"] + global_step, { + "eval/audio": sample_wav}, CONFIG.audio["sample_rate"] ) tb_logger.tb_eval_figures(global_step, figures) @@ -372,7 +369,8 @@ def main(args): # pylint: disable=redefined-outer-name model_dict = set_init_dict(model_dict, checkpoint["model"], CONFIG) model_wavernn.load_state_dict(model_dict) - print(" > Model restored from step %d" % checkpoint["step"], flush=True) + print(" > Model restored from step %d" % + checkpoint["step"], flush=True) args.restore_step = checkpoint["step"] else: args.restore_step = 0 @@ -393,7 +391,8 @@ def main(args): # pylint: disable=redefined-outer-name _, global_step = train( model_wavernn, optimizer, criterion, scheduler, ap, global_step, epoch ) - eval_avg_loss_dict = evaluate(model_wavernn, criterion, ap, global_step, epoch) + eval_avg_loss_dict = evaluate( + model_wavernn, criterion, ap, global_step, epoch) c_logger.print_epoch_end(epoch, eval_avg_loss_dict) target_loss = eval_avg_loss_dict["avg_model_loss"] best_loss = save_best_model( @@ -493,7 +492,8 @@ if __name__ == "__main__": tb_logger = TensorboardLogger(LOG_DIR, model_name="VOCODER") # write model desc to tensorboard - tb_logger.tb_add_text("model-description", CONFIG["run_description"], 0) + tb_logger.tb_add_text("model-description", + CONFIG["run_description"], 0) try: main(args) diff --git a/TTS/vocoder/datasets/wavernn_dataset.py b/TTS/vocoder/datasets/wavernn_dataset.py index 5d5b9f15..194344a9 100644 --- a/TTS/vocoder/datasets/wavernn_dataset.py +++ b/TTS/vocoder/datasets/wavernn_dataset.py @@ -8,17 +8,16 @@ class WaveRNNDataset(Dataset): WaveRNN Dataset searchs for all the wav files under root path. 
""" - def __init__( - self, - ap, - items, - seq_len, - hop_len, - pad, - mode, - is_training=True, - verbose=False, - ): + def __init__(self, + ap, + items, + seq_len, + hop_len, + pad, + mode, + is_training=True, + verbose=False, + ): self.ap = ap self.item_list = items @@ -56,17 +55,19 @@ class WaveRNNDataset(Dataset): def collate(self, batch): mel_win = self.seq_len // self.hop_len + 2 * self.pad - max_offsets = [x[0].shape[-1] - (mel_win + 2 * self.pad) for x in batch] + max_offsets = [x[0].shape[-1] - + (mel_win + 2 * self.pad) for x in batch] mel_offsets = [np.random.randint(0, offset) for offset in max_offsets] - sig_offsets = [(offset + self.pad) * self.hop_len for offset in mel_offsets] + sig_offsets = [(offset + self.pad) * + self.hop_len for offset in mel_offsets] mels = [ - x[0][:, mel_offsets[i] : mel_offsets[i] + mel_win] + x[0][:, mel_offsets[i]: mel_offsets[i] + mel_win] for i, x in enumerate(batch) ] coarse = [ - x[1][sig_offsets[i] : sig_offsets[i] + self.seq_len + 1] + x[1][sig_offsets[i]: sig_offsets[i] + self.seq_len + 1] for i, x in enumerate(batch) ] @@ -79,7 +80,8 @@ class WaveRNNDataset(Dataset): coarse = np.stack(coarse).astype(np.int64) coarse = torch.LongTensor(coarse) x_input = ( - 2 * coarse[:, : self.seq_len].float() / (2 ** self.mode - 1.0) - 1.0 + 2 * coarse[:, : self.seq_len].float() / + (2 ** self.mode - 1.0) - 1.0 ) y_coarse = coarse[:, 1:] mels = torch.FloatTensor(mels) diff --git a/TTS/vocoder/models/wavernn.py b/TTS/vocoder/models/wavernn.py index 4d1a633c..9b151cac 100644 --- a/TTS/vocoder/models/wavernn.py +++ b/TTS/vocoder/models/wavernn.py @@ -39,7 +39,8 @@ class MelResNet(nn.Module): def __init__(self, res_blocks, in_dims, compute_dims, res_out_dims, pad): super().__init__() k_size = pad * 2 + 1 - self.conv_in = nn.Conv1d(in_dims, compute_dims, kernel_size=k_size, bias=False) + self.conv_in = nn.Conv1d( + in_dims, compute_dims, kernel_size=k_size, bias=False) self.batch_norm = nn.BatchNorm1d(compute_dims) self.layers = nn.ModuleList() for _ in range(res_blocks): @@ -94,7 +95,8 @@ class UpsampleNetwork(nn.Module): k_size = (1, scale * 2 + 1) padding = (0, scale) stretch = Stretch2d(scale, 1) - conv = nn.Conv2d(1, 1, kernel_size=k_size, padding=padding, bias=False) + conv = nn.Conv2d(1, 1, kernel_size=k_size, + padding=padding, bias=False) conv.weight.data.fill_(1.0 / k_size[1]) self.up_layers.append(stretch) self.up_layers.append(conv) @@ -110,7 +112,7 @@ class UpsampleNetwork(nn.Module): m = m.unsqueeze(1) for f in self.up_layers: m = f(m) - m = m.squeeze(1)[:, :, self.indent : -self.indent] + m = m.squeeze(1)[:, :, self.indent: -self.indent] return m.transpose(1, 2), aux @@ -123,7 +125,8 @@ class Upsample(nn.Module): self.pad = pad self.indent = pad * scale self.use_aux_net = use_aux_net - self.resnet = MelResNet(res_blocks, feat_dims, compute_dims, res_out_dims, pad) + self.resnet = MelResNet(res_blocks, feat_dims, + compute_dims, res_out_dims, pad) def forward(self, m): if self.use_aux_net: @@ -137,7 +140,7 @@ class Upsample(nn.Module): m = torch.nn.functional.interpolate( m, scale_factor=self.scale, mode="linear", align_corners=True ) - m = m[:, :, self.indent : -self.indent] + m = m[:, :, self.indent: -self.indent] m = m * 0.045 # empirically found return m.transpose(1, 2), aux @@ -207,7 +210,8 @@ class WaveRNN(nn.Module): if self.use_aux_net: self.I = nn.Linear(feat_dims + self.aux_dims + 1, rnn_dims) self.rnn1 = nn.GRU(rnn_dims, rnn_dims, batch_first=True) - self.rnn2 = nn.GRU(rnn_dims + self.aux_dims, rnn_dims, batch_first=True) + 
self.rnn2 = nn.GRU(rnn_dims + self.aux_dims, + rnn_dims, batch_first=True) self.fc1 = nn.Linear(rnn_dims + self.aux_dims, fc_dims) self.fc2 = nn.Linear(fc_dims + self.aux_dims, fc_dims) self.fc3 = nn.Linear(fc_dims, self.n_classes) @@ -221,16 +225,16 @@ class WaveRNN(nn.Module): def forward(self, x, mels): bsize = x.size(0) - h1 = torch.zeros(1, bsize, self.rnn_dims).cuda() - h2 = torch.zeros(1, bsize, self.rnn_dims).cuda() + h1 = torch.zeros(1, bsize, self.rnn_dims).to(x.device) + h2 = torch.zeros(1, bsize, self.rnn_dims).to(x.device) mels, aux = self.upsample(mels) if self.use_aux_net: aux_idx = [self.aux_dims * i for i in range(5)] - a1 = aux[:, :, aux_idx[0] : aux_idx[1]] - a2 = aux[:, :, aux_idx[1] : aux_idx[2]] - a3 = aux[:, :, aux_idx[2] : aux_idx[3]] - a4 = aux[:, :, aux_idx[3] : aux_idx[4]] + a1 = aux[:, :, aux_idx[0]: aux_idx[1]] + a2 = aux[:, :, aux_idx[1]: aux_idx[2]] + a3 = aux[:, :, aux_idx[2]: aux_idx[3]] + a4 = aux[:, :, aux_idx[3]: aux_idx[4]] x = ( torch.cat([x.unsqueeze(-1), mels, a1], dim=2) @@ -256,19 +260,21 @@ class WaveRNN(nn.Module): x = F.relu(self.fc2(x)) return self.fc3(x) - def generate(self, mels, batched, target, overlap): + def generate(self, mels, batched, target, overlap, use_cuda): self.eval() + device = 'cuda' if use_cuda else 'cpu' output = [] start = time.time() rnn1 = self.get_gru_cell(self.rnn1) rnn2 = self.get_gru_cell(self.rnn2) with torch.no_grad(): - - mels = torch.FloatTensor(mels).cuda().unsqueeze(0) + mels = torch.FloatTensor(mels).unsqueeze(0).to(device) + #mels = torch.FloatTensor(mels).cuda().unsqueeze(0) wave_len = (mels.size(-1) - 1) * self.hop_length - mels = self.pad_tensor(mels.transpose(1, 2), pad=self.pad, side="both") + mels = self.pad_tensor(mels.transpose( + 1, 2), pad=self.pad, side="both") mels, aux = self.upsample(mels.transpose(1, 2)) if batched: @@ -278,13 +284,13 @@ class WaveRNN(nn.Module): b_size, seq_len, _ = mels.size() - h1 = torch.zeros(b_size, self.rnn_dims).cuda() - h2 = torch.zeros(b_size, self.rnn_dims).cuda() - x = torch.zeros(b_size, 1).cuda() + h1 = torch.zeros(b_size, self.rnn_dims).to(device) + h2 = torch.zeros(b_size, self.rnn_dims).to(device) + x = torch.zeros(b_size, 1).to(device) if self.use_aux_net: d = self.aux_dims - aux_split = [aux[:, :, d * i : d * (i + 1)] for i in range(4)] + aux_split = [aux[:, :, d * i: d * (i + 1)] for i in range(4)] for i in range(seq_len): @@ -319,11 +325,12 @@ class WaveRNN(nn.Module): logits.unsqueeze(0).transpose(1, 2) ) output.append(sample.view(-1)) - x = sample.transpose(0, 1).cuda() + x = sample.transpose(0, 1).to(device) elif self.mode == "gauss": - sample = sample_from_gaussian(logits.unsqueeze(0).transpose(1, 2)) + sample = sample_from_gaussian( + logits.unsqueeze(0).transpose(1, 2)) output.append(sample.view(-1)) - x = sample.transpose(0, 1).cuda() + x = sample.transpose(0, 1).to(device) elif isinstance(self.mode, int): posterior = F.softmax(logits, dim=1) distrib = torch.distributions.Categorical(posterior) @@ -332,7 +339,8 @@ class WaveRNN(nn.Module): output.append(sample) x = sample.unsqueeze(-1) else: - raise RuntimeError("Unknown model mode value - ", self.mode) + raise RuntimeError( + "Unknown model mode value - ", self.mode) if i % 100 == 0: self.gen_display(i, seq_len, b_size, start) @@ -352,7 +360,7 @@ class WaveRNN(nn.Module): # Fade-out at the end to avoid signal cutting out suddenly fade_out = np.linspace(1, 0, 20 * self.hop_length) output = output[:wave_len] - output[-20 * self.hop_length :] *= fade_out + output[-20 * self.hop_length:] *= fade_out 
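As a usage sketch of the batched generation path (constructor values and lengths mirror tests/test_vocoder_wavernn.py added in this series; they are illustrative, not required): fold_with_overlap() splits the upsampled conditioning into overlapping segments, generate() runs them as one batch, and xfade_and_unfold() crossfades the segments back into a single waveform.

    import torch
    from TTS.vocoder.models.wavernn import WaveRNN

    model = WaveRNN(
        rnn_dims=512, fc_dims=512, mode=10, mulaw=False, pad=2,
        use_aux_net=True, use_upsample_net=True, upsample_factors=[4, 8, 8],
        feat_dims=80, compute_dims=128, res_out_dims=128, res_blocks=10,
        hop_length=256, sample_rate=22050,
    )
    mel = torch.rand(80, 40)  # dummy conditioning: (num_mels, frames)
    # batched=True, target=5500 samples per fold, overlap=550 samples of crossfade,
    # use_cuda=False keeps generation on CPU
    wav = model.generate(mel, True, 5500, 550, False)
    # output length is hop_length * (frames - 1) samples, as asserted in the test
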
self.train() return output @@ -366,7 +374,6 @@ class WaveRNN(nn.Module): ) def fold_with_overlap(self, x, target, overlap): - """Fold the tensor with overlap for quick batched inference. Overlap will be used for crossfading in xfade_and_unfold() Args: @@ -398,7 +405,7 @@ class WaveRNN(nn.Module): padding = target + 2 * overlap - remaining x = self.pad_tensor(x, padding, side="after") - folded = torch.zeros(num_folds, target + 2 * overlap, features).cuda() + folded = torch.zeros(num_folds, target + 2 * overlap, features).to(x.device) # Get the values for the folded tensor for i in range(num_folds): @@ -423,16 +430,15 @@ class WaveRNN(nn.Module): # i.e., it won't generalise to other shapes/dims b, t, c = x.size() total = t + 2 * pad if side == "both" else t + pad - padded = torch.zeros(b, total, c).cuda() + padded = torch.zeros(b, total, c).to(x.device) if side in ("before", "both"): - padded[:, pad : pad + t, :] = x + padded[:, pad: pad + t, :] = x elif side == "after": padded[:, :t, :] = x return padded @staticmethod def xfade_and_unfold(y, target, overlap): - """Applies a crossfade and unfolds into a 1d array. Args: y (ndarry) : Batched sequences of audio samples From 4d5da4b663d7a2210a9fe4965ab942ad7557efb0 Mon Sep 17 00:00:00 2001 From: sanjaesc Date: Thu, 22 Oct 2020 13:22:50 +0200 Subject: [PATCH 12/98] fix travis + pylint tests --- .travis/script | 3 ++- TTS/vocoder/models/wavernn.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/.travis/script b/.travis/script index 0c24a221..0860f9cf 100755 --- a/.travis/script +++ b/.travis/script @@ -17,5 +17,6 @@ fi if [[ "$TEST_SUITE" == "testscripts" ]]; then # test model training scripts ./tests/test_tts_train.sh - ./tests/test_vocoder_train.sh + ./tests/test_vocoder_gan_train.sh + ./tests/test_vocoder_wavernn_train.sh fi diff --git a/TTS/vocoder/models/wavernn.py b/TTS/vocoder/models/wavernn.py index 9b151cac..8a45d9e3 100644 --- a/TTS/vocoder/models/wavernn.py +++ b/TTS/vocoder/models/wavernn.py @@ -225,7 +225,7 @@ class WaveRNN(nn.Module): def forward(self, x, mels): bsize = x.size(0) - h1 = torch.zeros(1, bsize, self.rnn_dims).to(x.device) + h1 = torch.zeros(1, bsize, self.rnn_dims).to(x.device) h2 = torch.zeros(1, bsize, self.rnn_dims).to(x.device) mels, aux = self.upsample(mels) From 07345099ee8ba3b2ca6aa412d53615273177845e Mon Sep 17 00:00:00 2001 From: Edresson Date: Sat, 24 Oct 2020 15:58:39 -0300 Subject: [PATCH 13/98] GlowTTS zero-shot TTS Support --- TTS/bin/train_glow_tts.py | 104 ++++++++++++++++++++++----------- TTS/tts/models/glow_tts.py | 28 ++++++--- TTS/tts/utils/generic_utils.py | 3 +- TTS/tts/utils/synthesis.py | 2 +- 4 files changed, 91 insertions(+), 46 deletions(-) diff --git a/TTS/bin/train_glow_tts.py b/TTS/bin/train_glow_tts.py index c5e570e5..d924b906 100644 --- a/TTS/bin/train_glow_tts.py +++ b/TTS/bin/train_glow_tts.py @@ -9,6 +9,7 @@ import time import traceback import torch +from random import randrange from torch.utils.data import DataLoader from TTS.tts.datasets.preprocess import load_meta_data from TTS.tts.datasets.TTSDataset import MyDataset @@ -36,8 +37,7 @@ from TTS.utils.training import (NoamLR, check_update, use_cuda, num_gpus = setup_torch_training_env(True, False) -def setup_loader(ap, r, is_val=False, verbose=False): - +def setup_loader(ap, r, is_val=False, verbose=False, speaker_mapping=None): if is_val and not c.run_eval: loader = None else: @@ -56,7 +56,8 @@ def setup_loader(ap, r, is_val=False, verbose=False): use_phonemes=c.use_phonemes, phoneme_language=c.phoneme_language, 
enable_eos_bos=c.enable_eos_bos_chars, - verbose=verbose) + verbose=verbose, + speaker_mapping=speaker_mapping if c.use_speaker_embedding and c.use_external_speaker_embedding_file else None) sampler = DistributedSampler(dataset) if num_gpus > 1 else None loader = DataLoader( dataset, @@ -86,10 +87,13 @@ def format_data(data): avg_spec_length = torch.mean(mel_lengths.float()) if c.use_speaker_embedding: - speaker_ids = [ - speaker_mapping[speaker_name] for speaker_name in speaker_names - ] - speaker_ids = torch.LongTensor(speaker_ids) + if c.use_external_speaker_embedding_file: + speaker_ids = data[8] + else: + speaker_ids = [ + speaker_mapping[speaker_name] for speaker_name in speaker_names + ] + speaker_ids = torch.LongTensor(speaker_ids) else: speaker_ids = None @@ -107,7 +111,7 @@ def format_data(data): avg_text_length, avg_spec_length, attn_mask -def data_depended_init(model, ap): +def data_depended_init(model, ap, speaker_mapping=None): """Data depended initialization for activation normalization.""" if hasattr(model, 'module'): for f in model.module.decoder.flows: @@ -118,19 +122,19 @@ def data_depended_init(model, ap): if getattr(f, "set_ddi", False): f.set_ddi(True) - data_loader = setup_loader(ap, 1, is_val=False) + data_loader = setup_loader(ap, 1, is_val=False, speaker_mapping=speaker_mapping) model.train() print(" > Data depended initialization ... ") with torch.no_grad(): for _, data in enumerate(data_loader): # format data - text_input, text_lengths, mel_input, mel_lengths, _,\ + text_input, text_lengths, mel_input, mel_lengths, speaker_ids,\ _, _, attn_mask = format_data(data) # forward pass model _ = model.forward( - text_input, text_lengths, mel_input, mel_lengths, attn_mask) + text_input, text_lengths, mel_input, mel_lengths, attn_mask, g=speaker_ids) break if hasattr(model, 'module'): @@ -145,9 +149,9 @@ def data_depended_init(model, ap): def train(model, criterion, optimizer, scheduler, - ap, global_step, epoch, amp): + ap, global_step, epoch, amp, speaker_mapping=None): data_loader = setup_loader(ap, 1, is_val=False, - verbose=(epoch == 0)) + verbose=(epoch == 0), speaker_mapping=speaker_mapping) model.train() epoch_time = 0 keep_avg = KeepAverage() @@ -162,7 +166,7 @@ def train(model, criterion, optimizer, scheduler, start_time = time.time() # format data - text_input, text_lengths, mel_input, mel_lengths, _,\ + text_input, text_lengths, mel_input, mel_lengths, speaker_ids,\ avg_text_length, avg_spec_length, attn_mask = format_data(data) loader_time = time.time() - end_time @@ -176,7 +180,7 @@ def train(model, criterion, optimizer, scheduler, # forward pass model z, logdet, y_mean, y_log_scale, alignments, o_dur_log, o_total_dur = model.forward( - text_input, text_lengths, mel_input, mel_lengths, attn_mask) + text_input, text_lengths, mel_input, mel_lengths, attn_mask, g=speaker_ids) # compute loss loss_dict = criterion(z, y_mean, y_log_scale, logdet, mel_lengths, @@ -262,7 +266,7 @@ def train(model, criterion, optimizer, scheduler, # Diagnostic visualizations # direct pass on model for spec predictions - spec_pred, *_ = model.inference(text_input[:1], text_lengths[:1]) + spec_pred, *_ = model.inference(text_input[:1], text_lengths[:1], g=speaker_ids[:1]) spec_pred = spec_pred.permute(0, 2, 1) gt_spec = mel_input.permute(0, 2, 1) const_spec = spec_pred[0].data.cpu().numpy() @@ -298,8 +302,8 @@ def train(model, criterion, optimizer, scheduler, @torch.no_grad() -def evaluate(model, criterion, ap, global_step, epoch): - data_loader = setup_loader(ap, 1, is_val=True) +def 
evaluate(model, criterion, ap, global_step, epoch, speaker_mapping): + data_loader = setup_loader(ap, 1, is_val=True, speaker_mapping=speaker_mapping) model.eval() epoch_time = 0 keep_avg = KeepAverage() @@ -309,12 +313,12 @@ def evaluate(model, criterion, ap, global_step, epoch): start_time = time.time() # format data - text_input, text_lengths, mel_input, mel_lengths, _,\ + text_input, text_lengths, mel_input, mel_lengths, speaker_ids,\ _, _, attn_mask = format_data(data) # forward pass model z, logdet, y_mean, y_log_scale, alignments, o_dur_log, o_total_dur = model.forward( - text_input, text_lengths, mel_input, mel_lengths, attn_mask) + text_input, text_lengths, mel_input, mel_lengths, attn_mask, g=speaker_ids) # compute loss loss_dict = criterion(z, y_mean, y_log_scale, logdet, mel_lengths, @@ -356,9 +360,9 @@ def evaluate(model, criterion, ap, global_step, epoch): # Diagnostic visualizations # direct pass on model for spec predictions if hasattr(model, 'module'): - spec_pred, *_ = model.module.inference(text_input[:1], text_lengths[:1]) + spec_pred, *_ = model.module.inference(text_input[:1], text_lengths[:1], g=speaker_ids[:1]) else: - spec_pred, *_ = model.inference(text_input[:1], text_lengths[:1]) + spec_pred, *_ = model.inference(text_input[:1], text_lengths[:1], g=speaker_ids[:1]) spec_pred = spec_pred.permute(0, 2, 1) gt_spec = mel_input.permute(0, 2, 1) @@ -398,7 +402,17 @@ def evaluate(model, criterion, ap, global_step, epoch): test_audios = {} test_figures = {} print(" | > Synthesizing test sentences") - speaker_id = 0 if c.use_speaker_embedding else None + if c.use_speaker_embedding: + if c.use_external_speaker_embedding_file: + speaker_embedding = speaker_mapping[list(speaker_mapping.keys())[randrange(len(speaker_mapping)-1)]]['embedding'] + speaker_id = None + else: + speaker_id = 0 + speaker_embedding = None + else: + speaker_id = None + speaker_embedding = None + style_wav = c.get("style_wav_for_test") for idx, test_sentence in enumerate(test_sentences): try: @@ -409,6 +423,7 @@ def evaluate(model, criterion, ap, global_step, epoch): use_cuda, ap, speaker_id=speaker_id, + speaker_embedding=speaker_embedding, style_wav=style_wav, truncated=False, enable_eos_bos_chars=c.enable_eos_bos_chars, #pylint: disable=unused-argument @@ -462,23 +477,42 @@ def main(args): # pylint: disable=redefined-outer-name if c.use_speaker_embedding: speakers = get_speakers(meta_data_train) if args.restore_path: - prev_out_path = os.path.dirname(args.restore_path) - speaker_mapping = load_speaker_mapping(prev_out_path) - assert all([speaker in speaker_mapping - for speaker in speakers]), "As of now you, you cannot " \ - "introduce new speakers to " \ - "a previously trained model." 
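# A minimal, self-contained sketch of the speakers.json lookup relied on above: the
# file is assumed (as the indexing in this patch implies) to map a clip or speaker key
# to a dict carrying an "embedding" list. The file name, keys and the 256-dim size are
# hypothetical illustration values, not the project's.
import json
import os
import tempfile

toy_mapping = {
    "speaker_a/clip_0001.wav": {"name": "speaker_a", "embedding": [0.01] * 256},
    "speaker_b/clip_0042.wav": {"name": "speaker_b", "embedding": [0.02] * 256},
}

with tempfile.TemporaryDirectory() as tmp:
    json_path = os.path.join(tmp, "speakers.json")
    with open(json_path, "w") as f:
        json.dump(toy_mapping, f)
    with open(json_path) as f:
        speaker_mapping = json.load(f)
    # same expression the training script uses to infer the external embedding size
    speaker_embedding_dim = len(speaker_mapping[list(speaker_mapping.keys())[0]]["embedding"])
    print(speaker_embedding_dim)  # -> 256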
- else: + if c.use_external_speaker_embedding_file: # if restore checkpoint and use External Embedding file + prev_out_path = os.path.dirname(args.restore_path) + speaker_mapping = load_speaker_mapping(prev_out_path) + if not speaker_mapping: + print("WARNING: speakers.json was not found in restore_path, trying to use CONFIG.external_speaker_embedding_file") + speaker_mapping = load_speaker_mapping(c.external_speaker_embedding_file) + if not speaker_mapping: + raise RuntimeError("You must copy the file speakers.json to restore_path, or set a valid file in CONFIG.external_speaker_embedding_file") + speaker_embedding_dim = len(speaker_mapping[list(speaker_mapping.keys())[0]]['embedding']) + elif not c.use_external_speaker_embedding_file: # if restore checkpoint and don't use External Embedding file + prev_out_path = os.path.dirname(args.restore_path) + speaker_mapping = load_speaker_mapping(prev_out_path) + speaker_embedding_dim = None + assert all([speaker in speaker_mapping + for speaker in speakers]), "As of now you, you cannot " \ + "introduce new speakers to " \ + "a previously trained model." + elif c.use_external_speaker_embedding_file and c.external_speaker_embedding_file: # if start new train using External Embedding file + speaker_mapping = load_speaker_mapping(c.external_speaker_embedding_file) + speaker_embedding_dim = len(speaker_mapping[list(speaker_mapping.keys())[0]]['embedding']) + elif c.use_external_speaker_embedding_file and not c.external_speaker_embedding_file: # if start new train using External Embedding file and don't pass external embedding file + raise "use_external_speaker_embedding_file is True, so you need pass a external speaker embedding file, run GE2E-Speaker_Encoder-ExtractSpeakerEmbeddings-by-sample.ipynb or AngularPrototypical-Speaker_Encoder-ExtractSpeakerEmbeddings-by-sample.ipynb notebook in notebooks/ folder" + else: # if start new train and don't use External Embedding file speaker_mapping = {name: i for i, name in enumerate(speakers)} + speaker_embedding_dim = None save_speaker_mapping(OUT_PATH, speaker_mapping) num_speakers = len(speaker_mapping) - print("Training with {} speakers: {}".format(num_speakers, + print("Training with {} speakers: {}".format(len(speakers), ", ".join(speakers))) else: num_speakers = 0 + speaker_embedding_dim = None + speaker_mapping = None # setup model - model = setup_model(num_chars, num_speakers, c) + model = setup_model(num_chars, num_speakers, c, speaker_embedding_dim=speaker_embedding_dim) optimizer = RAdam(model.parameters(), lr=c.lr, weight_decay=0, betas=(0.9, 0.98), eps=1e-9) criterion = GlowTTSLoss() @@ -540,13 +574,13 @@ def main(args): # pylint: disable=redefined-outer-name best_loss = float('inf') global_step = args.restore_step - model = data_depended_init(model, ap) + model = data_depended_init(model, ap, speaker_mapping) for epoch in range(0, c.epochs): c_logger.print_epoch_start(epoch, c.epochs) train_avg_loss_dict, global_step = train(model, criterion, optimizer, scheduler, ap, global_step, - epoch, amp) - eval_avg_loss_dict = evaluate(model, criterion, ap, global_step, epoch) + epoch, amp, speaker_mapping) + eval_avg_loss_dict = evaluate(model, criterion, ap, global_step, epoch, speaker_mapping=speaker_mapping) c_logger.print_epoch_end(epoch, eval_avg_loss_dict) target_loss = train_avg_loss_dict['avg_loss'] if c.run_eval: diff --git a/TTS/tts/models/glow_tts.py b/TTS/tts/models/glow_tts.py index a9b6f8c0..dec8243a 100644 --- a/TTS/tts/models/glow_tts.py +++ b/TTS/tts/models/glow_tts.py @@ -68,13 +68,14 
@@ class GlowTts(nn.Module): self.use_encoder_prenet = use_encoder_prenet self.noise_scale = 0.66 self.length_scale = 1. + self.external_speaker_embedding_dim = external_speaker_embedding_dim # if is a multispeaker and c_in_channels is 0, set to 256 if num_speakers > 1: - if self.c_in_channels == 0 and not external_speaker_embedding_dim: - self.c_in_channels = 256 - elif external_speaker_embedding_dim: - self.c_in_channels = external_speaker_embedding_dim + if self.c_in_channels == 0 and not self.external_speaker_embedding_dim: + self.c_in_channels = 512 + elif self.external_speaker_embedding_dim: + self.c_in_channels = self.external_speaker_embedding_dim self.encoder = Encoder(num_chars, out_channels=out_channels, @@ -88,7 +89,7 @@ class GlowTts(nn.Module): dropout_p=dropout_p, mean_only=mean_only, use_prenet=use_encoder_prenet, - c_in_channels=c_in_channels) + c_in_channels=self.c_in_channels) self.decoder = Decoder(out_channels, hidden_channels_dec or hidden_channels, @@ -100,10 +101,10 @@ class GlowTts(nn.Module): num_splits=num_splits, num_sqz=num_sqz, sigmoid_scale=sigmoid_scale, - c_in_channels=c_in_channels) + c_in_channels=self.c_in_channels) if num_speakers > 1 and not external_speaker_embedding_dim: - self.emb_g = nn.Embedding(num_speakers, c_in_channels) + self.emb_g = nn.Embedding(num_speakers, self.c_in_channels) nn.init.uniform_(self.emb_g.weight, -0.1, 0.1) @staticmethod @@ -130,7 +131,11 @@ class GlowTts(nn.Module): y_max_length = y.size(2) # norm speaker embeddings if g is not None: - g = F.normalize(self.emb_g(g)).unsqueeze(-1) # [b, h] + if self.external_speaker_embedding_dim: + g = F.normalize(g).unsqueeze(-1) + else: + g = F.normalize(self.emb_g(g)).unsqueeze(-1)# [b, h] + # embedding pass o_mean, o_log_scale, o_dur_log, x_mask = self.encoder(x, x_lengths, @@ -165,8 +170,13 @@ class GlowTts(nn.Module): @torch.no_grad() def inference(self, x, x_lengths, g=None): + if g is not None: - g = F.normalize(self.emb_g(g)).unsqueeze(-1) # [b, h] + if self.external_speaker_embedding_dim: + g = F.normalize(g).unsqueeze(-1) + else: + g = F.normalize(self.emb_g(g)).unsqueeze(-1) # [b, h] + # embedding pass o_mean, o_log_scale, o_dur_log, x_mask = self.encoder(x, x_lengths, diff --git a/TTS/tts/utils/generic_utils.py b/TTS/tts/utils/generic_utils.py index aacac898..2361fa85 100644 --- a/TTS/tts/utils/generic_utils.py +++ b/TTS/tts/utils/generic_utils.py @@ -126,7 +126,8 @@ def setup_model(num_chars, num_speakers, c, speaker_embedding_dim=None): mean_only=True, hidden_channels_enc=192, hidden_channels_dec=192, - use_encoder_prenet=True) + use_encoder_prenet=True, + external_speaker_embedding_dim=speaker_embedding_dim) return model def is_tacotron(c): diff --git a/TTS/tts/utils/synthesis.py b/TTS/tts/utils/synthesis.py index f810e213..0dfea5cc 100644 --- a/TTS/tts/utils/synthesis.py +++ b/TTS/tts/utils/synthesis.py @@ -59,7 +59,7 @@ def run_model_torch(model, inputs, CONFIG, truncated, speaker_id=None, style_mel inputs, speaker_ids=speaker_id, speaker_embeddings=speaker_embeddings) elif 'glow' in CONFIG.model.lower(): inputs_lengths = torch.tensor(inputs.shape[1:2]).to(inputs.device) # pylint: disable=not-callable - postnet_output, _, _, _, alignments, _, _ = model.inference(inputs, inputs_lengths) + postnet_output, _, _, _, alignments, _, _ = model.inference(inputs, inputs_lengths, g=speaker_id if speaker_id else speaker_embeddings) postnet_output = postnet_output.permute(0, 2, 1) # these only belong to tacotron models. 
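# A standalone sketch of the two speaker-conditioning paths this patch threads into
# GlowTTS: a learned embedding looked up by integer id versus an externally computed
# d-vector passed straight in; both are L2-normalized and given a trailing channel
# axis. Batch size, the 512/256 dims and the random d-vectors are illustrative
# assumptions, not values taken from the model config.
import torch
import torch.nn as nn
import torch.nn.functional as F

num_speakers, emb_channels, external_dim = 4, 512, 256
emb_g = nn.Embedding(num_speakers, emb_channels)
nn.init.uniform_(emb_g.weight, -0.1, 0.1)

speaker_ids = torch.LongTensor([0, 2])            # path 1: ids -> embedding table
g_internal = F.normalize(emb_g(speaker_ids)).unsqueeze(-1)

d_vectors = torch.randn(2, external_dim)          # path 2: precomputed d-vectors
g_external = F.normalize(d_vectors).unsqueeze(-1)

print(g_internal.shape, g_external.shape)         # [2, 512, 1] and [2, 256, 1]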
decoder_output = None From fbea058c596e4a0b1d0c21c68438d4ce6d85ae60 Mon Sep 17 00:00:00 2001 From: Edresson Date: Sat, 24 Oct 2020 16:10:05 -0300 Subject: [PATCH 14/98] add parse speakers function --- TTS/bin/train_glow_tts.py | 40 ++------------------------------------ TTS/bin/train_tts.py | 40 ++------------------------------------ TTS/tts/utils/speakers.py | 41 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 45 insertions(+), 76 deletions(-) diff --git a/TTS/bin/train_glow_tts.py b/TTS/bin/train_glow_tts.py index d924b906..7ffca36e 100644 --- a/TTS/bin/train_glow_tts.py +++ b/TTS/bin/train_glow_tts.py @@ -19,8 +19,7 @@ from TTS.tts.utils.distribute import (DistributedSampler, init_distributed, from TTS.tts.utils.generic_utils import setup_model, check_config_tts from TTS.tts.utils.io import save_best_model, save_checkpoint from TTS.tts.utils.measures import alignment_diagonal_score -from TTS.tts.utils.speakers import (get_speakers, load_speaker_mapping, - save_speaker_mapping) +from TTS.tts.utils.speakers import parse_speakers, load_speaker_mapping from TTS.tts.utils.synthesis import synthesis from TTS.tts.utils.text.symbols import make_symbols, phonemes, symbols from TTS.tts.utils.visual import plot_alignment, plot_spectrogram @@ -474,42 +473,7 @@ def main(args): # pylint: disable=redefined-outer-name meta_data_eval = meta_data_eval[:int(len(meta_data_eval) * c.eval_portion)] # parse speakers - if c.use_speaker_embedding: - speakers = get_speakers(meta_data_train) - if args.restore_path: - if c.use_external_speaker_embedding_file: # if restore checkpoint and use External Embedding file - prev_out_path = os.path.dirname(args.restore_path) - speaker_mapping = load_speaker_mapping(prev_out_path) - if not speaker_mapping: - print("WARNING: speakers.json was not found in restore_path, trying to use CONFIG.external_speaker_embedding_file") - speaker_mapping = load_speaker_mapping(c.external_speaker_embedding_file) - if not speaker_mapping: - raise RuntimeError("You must copy the file speakers.json to restore_path, or set a valid file in CONFIG.external_speaker_embedding_file") - speaker_embedding_dim = len(speaker_mapping[list(speaker_mapping.keys())[0]]['embedding']) - elif not c.use_external_speaker_embedding_file: # if restore checkpoint and don't use External Embedding file - prev_out_path = os.path.dirname(args.restore_path) - speaker_mapping = load_speaker_mapping(prev_out_path) - speaker_embedding_dim = None - assert all([speaker in speaker_mapping - for speaker in speakers]), "As of now you, you cannot " \ - "introduce new speakers to " \ - "a previously trained model." 
- elif c.use_external_speaker_embedding_file and c.external_speaker_embedding_file: # if start new train using External Embedding file - speaker_mapping = load_speaker_mapping(c.external_speaker_embedding_file) - speaker_embedding_dim = len(speaker_mapping[list(speaker_mapping.keys())[0]]['embedding']) - elif c.use_external_speaker_embedding_file and not c.external_speaker_embedding_file: # if start new train using External Embedding file and don't pass external embedding file - raise "use_external_speaker_embedding_file is True, so you need pass a external speaker embedding file, run GE2E-Speaker_Encoder-ExtractSpeakerEmbeddings-by-sample.ipynb or AngularPrototypical-Speaker_Encoder-ExtractSpeakerEmbeddings-by-sample.ipynb notebook in notebooks/ folder" - else: # if start new train and don't use External Embedding file - speaker_mapping = {name: i for i, name in enumerate(speakers)} - speaker_embedding_dim = None - save_speaker_mapping(OUT_PATH, speaker_mapping) - num_speakers = len(speaker_mapping) - print("Training with {} speakers: {}".format(len(speakers), - ", ".join(speakers))) - else: - num_speakers = 0 - speaker_embedding_dim = None - speaker_mapping = None + num_speakers, speaker_embedding_dim, speaker_mapping = parse_speakers(c, args, meta_data_train, OUT_PATH) # setup model model = setup_model(num_chars, num_speakers, c, speaker_embedding_dim=speaker_embedding_dim) diff --git a/TTS/bin/train_tts.py b/TTS/bin/train_tts.py index 88e10aea..4c615b99 100644 --- a/TTS/bin/train_tts.py +++ b/TTS/bin/train_tts.py @@ -22,8 +22,7 @@ from TTS.tts.utils.distribute import (DistributedSampler, from TTS.tts.utils.generic_utils import setup_model, check_config_tts from TTS.tts.utils.io import save_best_model, save_checkpoint from TTS.tts.utils.measures import alignment_diagonal_score -from TTS.tts.utils.speakers import (get_speakers, load_speaker_mapping, - save_speaker_mapping) +from TTS.tts.utils.speakers import parse_speakers, load_speaker_mapping from TTS.tts.utils.synthesis import synthesis from TTS.tts.utils.text.symbols import make_symbols, phonemes, symbols from TTS.tts.utils.visual import plot_alignment, plot_spectrogram @@ -502,42 +501,7 @@ def main(args): # pylint: disable=redefined-outer-name meta_data_eval = meta_data_eval[:int(len(meta_data_eval) * c.eval_portion)] # parse speakers - if c.use_speaker_embedding: - speakers = get_speakers(meta_data_train) - if args.restore_path: - if c.use_external_speaker_embedding_file: # if restore checkpoint and use External Embedding file - prev_out_path = os.path.dirname(args.restore_path) - speaker_mapping = load_speaker_mapping(prev_out_path) - if not speaker_mapping: - print("WARNING: speakers.json was not found in restore_path, trying to use CONFIG.external_speaker_embedding_file") - speaker_mapping = load_speaker_mapping(c.external_speaker_embedding_file) - if not speaker_mapping: - raise RuntimeError("You must copy the file speakers.json to restore_path, or set a valid file in CONFIG.external_speaker_embedding_file") - speaker_embedding_dim = len(speaker_mapping[list(speaker_mapping.keys())[0]]['embedding']) - elif not c.use_external_speaker_embedding_file: # if restore checkpoint and don't use External Embedding file - prev_out_path = os.path.dirname(args.restore_path) - speaker_mapping = load_speaker_mapping(prev_out_path) - speaker_embedding_dim = None - assert all([speaker in speaker_mapping - for speaker in speakers]), "As of now you, you cannot " \ - "introduce new speakers to " \ - "a previously trained model." 
- elif c.use_external_speaker_embedding_file and c.external_speaker_embedding_file: # if start new train using External Embedding file - speaker_mapping = load_speaker_mapping(c.external_speaker_embedding_file) - speaker_embedding_dim = len(speaker_mapping[list(speaker_mapping.keys())[0]]['embedding']) - elif c.use_external_speaker_embedding_file and not c.external_speaker_embedding_file: # if start new train using External Embedding file and don't pass external embedding file - raise "use_external_speaker_embedding_file is True, so you need pass a external speaker embedding file, run GE2E-Speaker_Encoder-ExtractSpeakerEmbeddings-by-sample.ipynb or AngularPrototypical-Speaker_Encoder-ExtractSpeakerEmbeddings-by-sample.ipynb notebook in notebooks/ folder" - else: # if start new train and don't use External Embedding file - speaker_mapping = {name: i for i, name in enumerate(speakers)} - speaker_embedding_dim = None - save_speaker_mapping(OUT_PATH, speaker_mapping) - num_speakers = len(speaker_mapping) - print("Training with {} speakers: {}".format(num_speakers, - ", ".join(speakers))) - else: - num_speakers = 0 - speaker_embedding_dim = None - speaker_mapping = None + num_speakers, speaker_embedding_dim, speaker_mapping = parse_speakers(c, args, meta_data_train, OUT_PATH) model = setup_model(num_chars, num_speakers, c, speaker_embedding_dim) diff --git a/TTS/tts/utils/speakers.py b/TTS/tts/utils/speakers.py index 156e42af..d507ff3d 100644 --- a/TTS/tts/utils/speakers.py +++ b/TTS/tts/utils/speakers.py @@ -30,3 +30,44 @@ def get_speakers(items): """Returns a sorted, unique list of speakers in a given dataset.""" speakers = {e[2] for e in items} return sorted(speakers) + +def parse_speakers(c, args, meta_data_train, OUT_PATH): + """ Returns number of speakers, speaker embedding shape and speaker mapping""" + if c.use_speaker_embedding: + speakers = get_speakers(meta_data_train) + if args.restore_path: + if c.use_external_speaker_embedding_file: # if restore checkpoint and use External Embedding file + prev_out_path = os.path.dirname(args.restore_path) + speaker_mapping = load_speaker_mapping(prev_out_path) + if not speaker_mapping: + print("WARNING: speakers.json was not found in restore_path, trying to use CONFIG.external_speaker_embedding_file") + speaker_mapping = load_speaker_mapping(c.external_speaker_embedding_file) + if not speaker_mapping: + raise RuntimeError("You must copy the file speakers.json to restore_path, or set a valid file in CONFIG.external_speaker_embedding_file") + speaker_embedding_dim = len(speaker_mapping[list(speaker_mapping.keys())[0]]['embedding']) + elif not c.use_external_speaker_embedding_file: # if restore checkpoint and don't use External Embedding file + prev_out_path = os.path.dirname(args.restore_path) + speaker_mapping = load_speaker_mapping(prev_out_path) + speaker_embedding_dim = None + assert all([speaker in speaker_mapping + for speaker in speakers]), "As of now you, you cannot " \ + "introduce new speakers to " \ + "a previously trained model." 
+ elif c.use_external_speaker_embedding_file and c.external_speaker_embedding_file: # if start new train using External Embedding file + speaker_mapping = load_speaker_mapping(c.external_speaker_embedding_file) + speaker_embedding_dim = len(speaker_mapping[list(speaker_mapping.keys())[0]]['embedding']) + elif c.use_external_speaker_embedding_file and not c.external_speaker_embedding_file: # if start new train using External Embedding file and don't pass external embedding file + raise "use_external_speaker_embedding_file is True, so you need pass a external speaker embedding file, run GE2E-Speaker_Encoder-ExtractSpeakerEmbeddings-by-sample.ipynb or AngularPrototypical-Speaker_Encoder-ExtractSpeakerEmbeddings-by-sample.ipynb notebook in notebooks/ folder" + else: # if start new train and don't use External Embedding file + speaker_mapping = {name: i for i, name in enumerate(speakers)} + speaker_embedding_dim = None + save_speaker_mapping(OUT_PATH, speaker_mapping) + num_speakers = len(speaker_mapping) + print("Training with {} speakers: {}".format(len(speakers), + ", ".join(speakers))) + else: + num_speakers = 0 + speaker_embedding_dim = None + speaker_mapping = None + + return num_speakers, speaker_embedding_dim, speaker_mapping \ No newline at end of file From 4a989e3cebf68ef9ae2ab4f675fcfbbeb983288a Mon Sep 17 00:00:00 2001 From: sanjaesc Date: Sun, 25 Oct 2020 09:45:37 +0100 Subject: [PATCH 15/98] compute audio feat on dataload --- TTS/bin/train_wavernn_vocoder.py | 175 ++++++++++++------------ TTS/vocoder/configs/wavernn_config.json | 143 +++++++++---------- TTS/vocoder/datasets/wavernn_dataset.py | 68 ++++++--- TTS/vocoder/models/wavernn.py | 60 ++++---- 4 files changed, 243 insertions(+), 203 deletions(-) diff --git a/TTS/bin/train_wavernn_vocoder.py b/TTS/bin/train_wavernn_vocoder.py index 66a7c913..91a62cbe 100644 --- a/TTS/bin/train_wavernn_vocoder.py +++ b/TTS/bin/train_wavernn_vocoder.py @@ -29,8 +29,8 @@ from TTS.utils.generic_utils import ( from TTS.vocoder.datasets.wavernn_dataset import WaveRNNDataset from TTS.vocoder.datasets.preprocess import ( find_feat_files, - load_wav_feat_data, - preprocess_wav_files, + load_wav_data, + load_wav_feat_data ) from TTS.vocoder.utils.distribution import discretized_mix_logistic_loss, gaussian_loss from TTS.vocoder.utils.generic_utils import setup_wavernn @@ -41,15 +41,16 @@ use_cuda, num_gpus = setup_torch_training_env(True, True) def setup_loader(ap, is_val=False, verbose=False): - if is_val and not CONFIG.run_eval: + if is_val and not c.run_eval: loader = None else: dataset = WaveRNNDataset(ap=ap, items=eval_data if is_val else train_data, - seq_len=CONFIG.seq_len, + seq_len=c.seq_len, hop_len=ap.hop_length, - pad=CONFIG.padding, - mode=CONFIG.mode, + pad=c.padding, + mode=c.mode, + mulaw=c.mulaw, is_training=not is_val, verbose=verbose, ) @@ -57,10 +58,10 @@ def setup_loader(ap, is_val=False, verbose=False): loader = DataLoader(dataset, shuffle=True, collate_fn=dataset.collate, - batch_size=CONFIG.batch_size, - num_workers=CONFIG.num_val_loader_workers + batch_size=c.batch_size, + num_workers=c.num_val_loader_workers if is_val - else CONFIG.num_loader_workers, + else c.num_loader_workers, pin_memory=True, ) return loader @@ -89,9 +90,9 @@ def train(model, optimizer, criterion, scheduler, ap, global_step, epoch): keep_avg = KeepAverage() if use_cuda: batch_n_iter = int(len(data_loader.dataset) / - (CONFIG.batch_size * num_gpus)) + (c.batch_size * num_gpus)) else: - batch_n_iter = int(len(data_loader.dataset) / CONFIG.batch_size) + 
batch_n_iter = int(len(data_loader.dataset) / c.batch_size) end_time = time.time() c_logger.print_train_start() # train loop @@ -102,9 +103,6 @@ def train(model, optimizer, criterion, scheduler, ap, global_step, epoch): loader_time = time.time() - end_time global_step += 1 - ################## - # MODEL TRAINING # - ################## y_hat = model(x_input, mels) if isinstance(model.mode, int): @@ -112,7 +110,6 @@ def train(model, optimizer, criterion, scheduler, ap, global_step, epoch): else: y_coarse = y_coarse.float() y_coarse = y_coarse.unsqueeze(-1) - # m_scaled, _ = model.upsample(m) # compute losses loss = criterion(y_hat, y_coarse) @@ -120,11 +117,11 @@ def train(model, optimizer, criterion, scheduler, ap, global_step, epoch): raise RuntimeError(" [!] None loss. Exiting ...") optimizer.zero_grad() loss.backward() - if CONFIG.grad_clip > 0: + if c.grad_clip > 0: torch.nn.utils.clip_grad_norm_( - model.parameters(), CONFIG.grad_clip) - + model.parameters(), c.grad_clip) optimizer.step() + if scheduler is not None: scheduler.step() @@ -144,7 +141,7 @@ def train(model, optimizer, criterion, scheduler, ap, global_step, epoch): keep_avg.update_values(update_train_values) # print training stats - if global_step % CONFIG.print_step == 0: + if global_step % c.print_step == 0: log_dict = {"step_time": [step_time, 2], "loader_time": [loader_time, 4], "current_lr": cur_lr, @@ -164,8 +161,8 @@ def train(model, optimizer, criterion, scheduler, ap, global_step, epoch): tb_logger.tb_train_iter_stats(global_step, iter_stats) # save checkpoint - if global_step % CONFIG.save_step == 0: - if CONFIG.checkpoint: + if global_step % c.save_step == 0: + if c.checkpoint: # save model save_checkpoint(model, optimizer, @@ -180,28 +177,30 @@ def train(model, optimizer, criterion, scheduler, ap, global_step, epoch): ) # synthesize a full voice - wav_path = train_data[random.randrange(0, len(train_data))][0] + rand_idx = random.randrange(0, len(train_data)) + wav_path = train_data[rand_idx] if not isinstance( + train_data[rand_idx], (tuple, list)) else train_data[rand_idx][0] wav = ap.load_wav(wav_path) ground_mel = ap.melspectrogram(wav) sample_wav = model.generate(ground_mel, - CONFIG.batched, - CONFIG.target_samples, - CONFIG.overlap_samples, + c.batched, + c.target_samples, + c.overlap_samples, + use_cuda ) predict_mel = ap.melspectrogram(sample_wav) # compute spectrograms figures = {"train/ground_truth": plot_spectrogram(ground_mel.T), - "train/prediction": plot_spectrogram(predict_mel.T), + "train/prediction": plot_spectrogram(predict_mel.T) } + tb_logger.tb_train_figures(global_step, figures) # Sample audio tb_logger.tb_train_audios( global_step, { - "train/audio": sample_wav}, CONFIG.audio["sample_rate"] + "train/audio": sample_wav}, c.audio["sample_rate"] ) - - tb_logger.tb_train_figures(global_step, figures) end_time = time.time() # print epoch stats @@ -259,34 +258,35 @@ def evaluate(model, criterion, ap, global_step, epoch): keep_avg.update_values(update_eval_values) # print eval stats - if CONFIG.print_eval: + if c.print_eval: c_logger.print_eval_step( num_iter, loss_dict, keep_avg.avg_values) - if epoch % CONFIG.test_every_epochs == 0 and epoch != 0: - # synthesize a part of data - wav_path = eval_data[random.randrange(0, len(eval_data))][0] + if epoch % c.test_every_epochs == 0 and epoch != 0: + # synthesize a full voice + rand_idx = random.randrange(0, len(eval_data)) + wav_path = eval_data[rand_idx] if not isinstance( + eval_data[rand_idx], (tuple, list)) else eval_data[rand_idx][0] wav = 
ap.load_wav(wav_path) - ground_mel = ap.melspectrogram(wav[:22000]) + ground_mel = ap.melspectrogram(wav) sample_wav = model.generate(ground_mel, - CONFIG.batched, - CONFIG.target_samples, - CONFIG.overlap_samples, + c.batched, + c.target_samples, + c.overlap_samples, use_cuda ) predict_mel = ap.melspectrogram(sample_wav) - # compute spectrograms - figures = {"eval/ground_truth": plot_spectrogram(ground_mel.T), - "eval/prediction": plot_spectrogram(predict_mel.T), - } - # Sample audio tb_logger.tb_eval_audios( global_step, { - "eval/audio": sample_wav}, CONFIG.audio["sample_rate"] + "eval/audio": sample_wav}, c.audio["sample_rate"] ) + # compute spectrograms + figures = {"eval/ground_truth": plot_spectrogram(ground_mel.T), + "eval/prediction": plot_spectrogram(predict_mel.T) + } tb_logger.tb_eval_figures(global_step, figures) tb_logger.tb_eval_stats(global_step, keep_avg.avg_values) @@ -299,53 +299,62 @@ def main(args): # pylint: disable=redefined-outer-name global train_data, eval_data # setup audio processor - ap = AudioProcessor(**CONFIG.audio) + ap = AudioProcessor(**c.audio) - print(f" > Loading wavs from: {CONFIG.data_path}") - if CONFIG.feature_path is not None: - print(f" > Loading features from: {CONFIG.feature_path}") + # print(f" > Loading wavs from: {c.data_path}") + # if c.feature_path is not None: + # print(f" > Loading features from: {c.feature_path}") + # eval_data, train_data = load_wav_feat_data( + # c.data_path, c.feature_path, c.eval_split_size + # ) + # else: + # mel_feat_path = os.path.join(OUT_PATH, "mel") + # feat_data = find_feat_files(mel_feat_path) + # if feat_data: + # print(f" > Loading features from: {mel_feat_path}") + # eval_data, train_data = load_wav_feat_data( + # c.data_path, mel_feat_path, c.eval_split_size + # ) + # else: + # print(" > No feature data found. Preprocessing...") + # # preprocessing feature data from given wav files + # preprocess_wav_files(OUT_PATH, CONFIG, ap) + # eval_data, train_data = load_wav_feat_data( + # c.data_path, mel_feat_path, c.eval_split_size + # ) + + print(f" > Loading wavs from: {c.data_path}") + if c.feature_path is not None: + print(f" > Loading features from: {c.feature_path}") eval_data, train_data = load_wav_feat_data( - CONFIG.data_path, CONFIG.feature_path, CONFIG.eval_split_size - ) + c.data_path, c.feature_path, c.eval_split_size) else: - mel_feat_path = os.path.join(OUT_PATH, "mel") - feat_data = find_feat_files(mel_feat_path) - if feat_data: - print(f" > Loading features from: {mel_feat_path}") - eval_data, train_data = load_wav_feat_data( - CONFIG.data_path, mel_feat_path, CONFIG.eval_split_size - ) - else: - print(" > No feature data found. 
Preprocessing...") - # preprocessing feature data from given wav files - preprocess_wav_files(OUT_PATH, CONFIG, ap) - eval_data, train_data = load_wav_feat_data( - CONFIG.data_path, mel_feat_path, CONFIG.eval_split_size - ) + eval_data, train_data = load_wav_data( + c.data_path, c.eval_split_size) # setup model - model_wavernn = setup_wavernn(CONFIG) + model_wavernn = setup_wavernn(c) # define train functions - if CONFIG.mode == "mold": + if c.mode == "mold": criterion = discretized_mix_logistic_loss - elif CONFIG.mode == "gauss": + elif c.mode == "gauss": criterion = gaussian_loss - elif isinstance(CONFIG.mode, int): + elif isinstance(c.mode, int): criterion = torch.nn.CrossEntropyLoss() if use_cuda: model_wavernn.cuda() - if isinstance(CONFIG.mode, int): + if isinstance(c.mode, int): criterion.cuda() - optimizer = RAdam(model_wavernn.parameters(), lr=CONFIG.lr, weight_decay=0) + optimizer = RAdam(model_wavernn.parameters(), lr=c.lr, weight_decay=0) scheduler = None - if "lr_scheduler" in CONFIG: - scheduler = getattr(torch.optim.lr_scheduler, CONFIG.lr_scheduler) - scheduler = scheduler(optimizer, **CONFIG.lr_scheduler_params) + if "lr_scheduler" in c: + scheduler = getattr(torch.optim.lr_scheduler, c.lr_scheduler) + scheduler = scheduler(optimizer, **c.lr_scheduler_params) # slow start for the first 5 epochs - # lr_lambda = lambda epoch: min(epoch / CONFIG.warmup_steps, 1) + # lr_lambda = lambda epoch: min(epoch / c.warmup_steps, 1) # scheduler = optim.lr_scheduler.LambdaLR(optimizer, lr_lambda) # restore any checkpoint @@ -366,7 +375,7 @@ def main(args): # pylint: disable=redefined-outer-name # retore only matching layers. print(" > Partial model initialization...") model_dict = model_wavernn.state_dict() - model_dict = set_init_dict(model_dict, checkpoint["model"], CONFIG) + model_dict = set_init_dict(model_dict, checkpoint["model"], c) model_wavernn.load_state_dict(model_dict) print(" > Model restored from step %d" % @@ -386,11 +395,10 @@ def main(args): # pylint: disable=redefined-outer-name best_loss = float("inf") global_step = args.restore_step - for epoch in range(0, CONFIG.epochs): - c_logger.print_epoch_start(epoch, CONFIG.epochs) - _, global_step = train( - model_wavernn, optimizer, criterion, scheduler, ap, global_step, epoch - ) + for epoch in range(0, c.epochs): + c_logger.print_epoch_start(epoch, c.epochs) + _, global_step = train(model_wavernn, optimizer, + criterion, scheduler, ap, global_step, epoch) eval_avg_loss_dict = evaluate( model_wavernn, criterion, ap, global_step, epoch) c_logger.print_epoch_end(epoch, eval_avg_loss_dict) @@ -462,14 +470,14 @@ if __name__ == "__main__": print(f" > Training continues for {args.restore_path}") # setup output paths and read configs - CONFIG = load_config(args.config_path) + c = load_config(args.config_path) # check_config(c) _ = os.path.dirname(os.path.realpath(__file__)) OUT_PATH = args.continue_path if args.continue_path == "": OUT_PATH = create_experiment_folder( - CONFIG.output_path, CONFIG.run_name, args.debug + c.output_path, c.run_name, args.debug ) AUDIO_PATH = os.path.join(OUT_PATH, "test_audios") @@ -483,7 +491,7 @@ if __name__ == "__main__": new_fields["restore_path"] = args.restore_path new_fields["github_branch"] = get_git_branch() copy_config_file( - args.config_path, os.path.join(OUT_PATH, "config.json"), new_fields + args.config_path, os.path.join(OUT_PATH, "c.json"), new_fields ) os.chmod(AUDIO_PATH, 0o775) os.chmod(OUT_PATH, 0o775) @@ -492,8 +500,7 @@ if __name__ == "__main__": tb_logger = 
TensorboardLogger(LOG_DIR, model_name="VOCODER") # write model desc to tensorboard - tb_logger.tb_add_text("model-description", - CONFIG["run_description"], 0) + tb_logger.tb_add_text("model-description", c["run_description"], 0) try: main(args) diff --git a/TTS/vocoder/configs/wavernn_config.json b/TTS/vocoder/configs/wavernn_config.json index 8e6a8c32..9a9fbdae 100644 --- a/TTS/vocoder/configs/wavernn_config.json +++ b/TTS/vocoder/configs/wavernn_config.json @@ -1,94 +1,97 @@ { "run_name": "wavernn_test", "run_description": "wavernn_test training", - - // AUDIO PARAMETERS - "audio":{ - "fft_size": 1024, // number of stft frequency levels. Size of the linear spectogram frame. - "win_length": 1024, // stft window length in ms. - "hop_length": 256, // stft window hop-lengh in ms. + +// AUDIO PARAMETERS + "audio": { + "fft_size": 1024, // number of stft frequency levels. Size of the linear spectogram frame. + "win_length": 1024, // stft window length in ms. + "hop_length": 256, // stft window hop-lengh in ms. "frame_length_ms": null, // stft window length in ms.If null, 'win_length' is used. - "frame_shift_ms": null, // stft window hop-lengh in ms. If null, 'hop_length' is used. - + "frame_shift_ms": null, // stft window hop-lengh in ms. If null, 'hop_length' is used. // Audio processing parameters - "sample_rate": 22050, // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled. - "preemphasis": 0.98, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis. - "ref_level_db": 20, // reference level db, theoretically 20db is the sound of air. - + "sample_rate": 22050, // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled. + "preemphasis": 0.98, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis. + "ref_level_db": 20, // reference level db, theoretically 20db is the sound of air. // Silence trimming - "do_trim_silence": false,// enable trimming of slience of audio as you load it. LJspeech (false), TWEB (false), Nancy (true) - "trim_db": 60, // threshold for timming silence. Set this according to your dataset. - + "do_trim_silence": false, // enable trimming of slience of audio as you load it. LJspeech (false), TWEB (false), Nancy (true) + "trim_db": 60, // threshold for timming silence. Set this according to your dataset. // MelSpectrogram parameters - "num_mels": 80, // size of the mel spec frame. - "mel_fmin": 40.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!! - "mel_fmax": 8000.0, // maximum freq level for mel-spec. Tune for dataset!! - "spec_gain": 20.0, // scaler value appplied after log transform of spectrogram. - + "num_mels": 80, // size of the mel spec frame. + "mel_fmin": 40.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!! + "mel_fmax": 8000.0, // maximum freq level for mel-spec. Tune for dataset!! + "spec_gain": 20.0, // scaler value appplied after log transform of spectrogram. // Normalization parameters - "signal_norm": true, // normalize spec values. Mean-Var normalization if 'stats_path' is defined otherwise range normalization defined by the other params. - "min_level_db": -100, // lower bound for normalization + "signal_norm": true, // normalize spec values. Mean-Var normalization if 'stats_path' is defined otherwise range normalization defined by the other params. 
+ "min_level_db": -100, // lower bound for normalization "symmetric_norm": true, // move normalization to range [-1, 1] - "max_norm": 4.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm] - "clip_norm": true, // clip normalized values into the range. - "stats_path": null // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based notmalization is used and other normalization params are ignored + "max_norm": 4.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm] + "clip_norm": true, // clip normalized values into the range. + "stats_path": null // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based notmalization is used and other normalization params are ignored }, - - // Generating / Synthesizing - "batched": true, - "target_samples": 11000, // target number of samples to be generated in each batch entry - "overlap_samples": 550, // number of samples for crossfading between batches - + +// Generating / Synthesizing + "batched": true, + "target_samples": 11000, // target number of samples to be generated in each batch entry + "overlap_samples": 550, // number of samples for crossfading between batches // DISTRIBUTED TRAINING // "distributed":{ // "backend": "nccl", // "url": "tcp:\/\/localhost:54321" // }, - - // MODEL PARAMETERS - "use_aux_net": true, - "use_upsample_net": true, - "upsample_factors": [4, 8, 8], // this needs to correctly factorise hop_length - "seq_len": 1280, // has to be devideable by hop_length - "mode": "mold", // mold [string], gauss [string], bits [int] - "mulaw": false, // apply mulaw if mode is bits - "padding": 2, // pad the input for resnet to see wider input length - // DATASET - //"use_gta": true, // use computed gta features from the tts model - "data_path": "path/to/wav/files", // path containing training wav files - "feature_path": null, // path containing computed features from wav files if null compute them +// MODEL MODE + "mode": 10, // mold [string], gauss [string], bits [int] + "mulaw": true, // apply mulaw if mode is bits + +// MODEL PARAMETERS + "wavernn_model_params": { + "rnn_dims": 512, + "fc_dims": 512, + "compute_dims": 128, + "res_out_dims": 128, + "num_res_blocks": 10, + "use_aux_net": true, + "use_upsample_net": true, + "upsample_factors": [4, 8, 8] // this needs to correctly factorise hop_length + }, + +// DATASET + //"use_gta": true, // use computed gta features from the tts model + "data_path": "/media/alexander/LinuxFS/SpeechData/GothicSpeech/NPC_Speech", // path containing training wav files + "feature_path": null, // path containing computed features from wav files if null compute them + "seq_len": 1280, // has to be devideable by hop_length + "padding": 2, // pad the input for resnet to see wider input length + +// TRAINING + "batch_size": 64, // Batch size for training. + "epochs": 10000, // total number of epochs to train. - // TRAINING - "batch_size": 64, // Batch size for training. Lower values than 32 might cause hard to learn attention. - "epochs": 10000, // total number of epochs to train. 
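# The model/dataset comments above amount to simple arithmetic constraints:
# upsample_factors must multiply out to hop_length, and seq_len must be an integer
# number of hops. A quick sanity check using the example values from this config:
from functools import reduce

hop_length = 256
upsample_factors = [4, 8, 8]
seq_len = 1280

assert reduce(lambda a, b: a * b, upsample_factors) == hop_length  # 4 * 8 * 8 == 256
assert seq_len % hop_length == 0                                   # 1280 == 5 frames * 256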
- - // VALIDATION +// VALIDATION "run_eval": true, - "test_every_epochs": 10, // Test after set number of epochs (Test every 20 epochs for example) - - // OPTIMIZER - "grad_clip": 4, // apply gradient clipping if > 0 - "lr_scheduler": "MultiStepLR", // one of the schedulers from https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate + "test_every_epochs": 10, // Test after set number of epochs (Test every 10 epochs for example) + +// OPTIMIZER + "grad_clip": 4, // apply gradient clipping if > 0 + "lr_scheduler": "MultiStepLR", // one of the schedulers from https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate "lr_scheduler_params": { "gamma": 0.5, "milestones": [200000, 400000, 600000] }, - "lr": 1e-4, // initial learning rate - - // TENSORBOARD and LOGGING - "print_step": 25, // Number of steps to log traning on console. - "print_eval": false, // If True, it prints loss values for each step in eval run. - "save_step": 25000, // Number of training steps expected to plot training stats on TB and save model checkpoints. - "checkpoint": true, // If true, it saves checkpoints per "save_step" - "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging. - - // DATA LOADING - "num_loader_workers": 4, // number of training data loader processes. Don't set it too big. 4-8 are good values. - "num_val_loader_workers": 4, // number of evaluation data loader processes. - "eval_split_size": 50, // number of samples for testing - - // PATHS + "lr": 1e-4, // initial learning rate + +// TENSORBOARD and LOGGING + "print_step": 25, // Number of steps to log traning on console. + "print_eval": false, // If True, it prints loss values for each step in eval run. + "save_step": 25000, // Number of training steps expected to plot training stats on TB and save model checkpoints. + "checkpoint": true, // If true, it saves checkpoints per "save_step" + "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging. + +// DATA LOADING + "num_loader_workers": 4, // number of training data loader processes. Don't set it too big. 4-8 are good values. + "num_val_loader_workers": 4, // number of evaluation data loader processes. + "eval_split_size": 50, // number of samples for testing + +// PATHS "output_path": "output/training/path" } - diff --git a/TTS/vocoder/datasets/wavernn_dataset.py b/TTS/vocoder/datasets/wavernn_dataset.py index 194344a9..3dbb2194 100644 --- a/TTS/vocoder/datasets/wavernn_dataset.py +++ b/TTS/vocoder/datasets/wavernn_dataset.py @@ -1,11 +1,13 @@ import torch import numpy as np from torch.utils.data import Dataset +from multiprocessing import Manager class WaveRNNDataset(Dataset): """ - WaveRNN Dataset searchs for all the wav files under root path. + WaveRNN Dataset searchs for all the wav files under root path + and converts them to acoustic features on the fly. 
""" def __init__(self, @@ -15,16 +17,19 @@ class WaveRNNDataset(Dataset): hop_len, pad, mode, + mulaw, is_training=True, verbose=False, ): self.ap = ap + self.compute_feat = not isinstance(items[0], (tuple, list)) self.item_list = items self.seq_len = seq_len self.hop_len = hop_len self.pad = pad self.mode = mode + self.mulaw = mulaw self.is_training = is_training self.verbose = verbose @@ -36,22 +41,47 @@ class WaveRNNDataset(Dataset): return item def load_item(self, index): - wavpath, feat_path = self.item_list[index] - m = np.load(feat_path.replace("/quant/", "/mel/")) - # x = self.wav_cache[index] - if m.shape[-1] < 5: - print(" [!] Instance is too short! : {}".format(wavpath)) - self.item_list[index] = self.item_list[index + 1] - feat_path = self.item_list[index] - m = np.load(feat_path.replace("/quant/", "/mel/")) - if self.mode in ["gauss", "mold"]: - # x = np.load(feat_path.replace("/mel/", "/quant/")) - x = self.ap.load_wav(wavpath) - elif isinstance(self.mode, int): - x = np.load(feat_path.replace("/mel/", "/quant/")) + """ + load (audio, feat) couple if feature_path is set + else compute it on the fly + """ + if self.compute_feat: + + wavpath = self.item_list[index] + audio = self.ap.load_wav(wavpath) + mel = self.ap.melspectrogram(audio) + + if mel.shape[-1] < 5: + print(" [!] Instance is too short! : {}".format(wavpath)) + self.item_list[index] = self.item_list[index + 1] + audio = self.ap.load_wav(wavpath) + mel = self.ap.melspectrogram(audio) + if self.mode in ["gauss", "mold"]: + x_input = audio + elif isinstance(self.mode, int): + x_input = (self.ap.mulaw_encode(audio, qc=self.mode) + if self.mulaw else self.ap.quantize(audio, bits=self.mode)) + else: + raise RuntimeError("Unknown dataset mode - ", self.mode) + else: - raise RuntimeError("Unknown dataset mode - ", self.mode) - return m, x + + wavpath, feat_path = self.item_list[index] + mel = np.load(feat_path.replace("/quant/", "/mel/")) + + if mel.shape[-1] < 5: + print(" [!] Instance is too short! 
: {}".format(wavpath)) + self.item_list[index] = self.item_list[index + 1] + feat_path = self.item_list[index] + mel = np.load(feat_path.replace("/quant/", "/mel/")) + if self.mode in ["gauss", "mold"]: + x_input = self.ap.load_wav(wavpath) + elif isinstance(self.mode, int): + x_input = np.load(feat_path.replace("/mel/", "/quant/")) + else: + raise RuntimeError("Unknown dataset mode - ", self.mode) + + return mel, x_input def collate(self, batch): mel_win = self.seq_len // self.hop_len + 2 * self.pad @@ -79,10 +109,8 @@ class WaveRNNDataset(Dataset): elif isinstance(self.mode, int): coarse = np.stack(coarse).astype(np.int64) coarse = torch.LongTensor(coarse) - x_input = ( - 2 * coarse[:, : self.seq_len].float() / - (2 ** self.mode - 1.0) - 1.0 - ) + x_input = (2 * coarse[:, : self.seq_len].float() / + (2 ** self.mode - 1.0) - 1.0) y_coarse = coarse[:, 1:] mels = torch.FloatTensor(mels) return x_input, mels, y_coarse diff --git a/TTS/vocoder/models/wavernn.py b/TTS/vocoder/models/wavernn.py index 8a45d9e3..f771175c 100644 --- a/TTS/vocoder/models/wavernn.py +++ b/TTS/vocoder/models/wavernn.py @@ -36,14 +36,14 @@ class ResBlock(nn.Module): class MelResNet(nn.Module): - def __init__(self, res_blocks, in_dims, compute_dims, res_out_dims, pad): + def __init__(self, num_res_blocks, in_dims, compute_dims, res_out_dims, pad): super().__init__() k_size = pad * 2 + 1 self.conv_in = nn.Conv1d( in_dims, compute_dims, kernel_size=k_size, bias=False) self.batch_norm = nn.BatchNorm1d(compute_dims) self.layers = nn.ModuleList() - for _ in range(res_blocks): + for _ in range(num_res_blocks): self.layers.append(ResBlock(compute_dims)) self.conv_out = nn.Conv1d(compute_dims, res_out_dims, kernel_size=1) @@ -76,7 +76,7 @@ class UpsampleNetwork(nn.Module): feat_dims, upsample_scales, compute_dims, - res_blocks, + num_res_blocks, res_out_dims, pad, use_aux_net, @@ -87,7 +87,7 @@ class UpsampleNetwork(nn.Module): self.use_aux_net = use_aux_net if use_aux_net: self.resnet = MelResNet( - res_blocks, feat_dims, compute_dims, res_out_dims, pad + num_res_blocks, feat_dims, compute_dims, res_out_dims, pad ) self.resnet_stretch = Stretch2d(self.total_scale, 1) self.up_layers = nn.ModuleList() @@ -118,14 +118,14 @@ class UpsampleNetwork(nn.Module): class Upsample(nn.Module): def __init__( - self, scale, pad, res_blocks, feat_dims, compute_dims, res_out_dims, use_aux_net + self, scale, pad, num_res_blocks, feat_dims, compute_dims, res_out_dims, use_aux_net ): super().__init__() self.scale = scale self.pad = pad self.indent = pad * scale self.use_aux_net = use_aux_net - self.resnet = MelResNet(res_blocks, feat_dims, + self.resnet = MelResNet(num_res_blocks, feat_dims, compute_dims, res_out_dims, pad) def forward(self, m): @@ -147,23 +147,22 @@ class Upsample(nn.Module): class WaveRNN(nn.Module): - def __init__( - self, - rnn_dims, - fc_dims, - mode, - mulaw, - pad, - use_aux_net, - use_upsample_net, - upsample_factors, - feat_dims, - compute_dims, - res_out_dims, - res_blocks, - hop_length, - sample_rate, - ): + def __init__(self, + rnn_dims, + fc_dims, + mode, + mulaw, + pad, + use_aux_net, + use_upsample_net, + upsample_factors, + feat_dims, + compute_dims, + res_out_dims, + num_res_blocks, + hop_length, + sample_rate, + ): super().__init__() self.mode = mode self.mulaw = mulaw @@ -177,7 +176,7 @@ class WaveRNN(nn.Module): elif self.mode == "gauss": self.n_classes = 2 else: - raise RuntimeError(" > Unknown training mode") + raise RuntimeError("Unknown model mode value - ", self.mode) self.rnn_dims = rnn_dims 
self.aux_dims = res_out_dims // 4 @@ -192,7 +191,7 @@ class WaveRNN(nn.Module): feat_dims, upsample_factors, compute_dims, - res_blocks, + num_res_blocks, res_out_dims, pad, use_aux_net, @@ -201,7 +200,7 @@ class WaveRNN(nn.Module): self.upsample = Upsample( hop_length, pad, - res_blocks, + num_res_blocks, feat_dims, compute_dims, res_out_dims, @@ -260,7 +259,7 @@ class WaveRNN(nn.Module): x = F.relu(self.fc2(x)) return self.fc3(x) - def generate(self, mels, batched, target, overlap, use_cuda): + def generate(self, mels, batched, target, overlap, use_cuda=False): self.eval() device = 'cuda' if use_cuda else 'cpu' @@ -360,7 +359,9 @@ class WaveRNN(nn.Module): # Fade-out at the end to avoid signal cutting out suddenly fade_out = np.linspace(1, 0, 20 * self.hop_length) output = output[:wave_len] - output[-20 * self.hop_length:] *= fade_out + + if wave_len > len(fade_out): + output[-20 * self.hop_length:] *= fade_out self.train() return output @@ -405,7 +406,8 @@ class WaveRNN(nn.Module): padding = target + 2 * overlap - remaining x = self.pad_tensor(x, padding, side="after") - folded = torch.zeros(num_folds, target + 2 * overlap, features).to(x.device) + folded = torch.zeros(num_folds, target + 2 * + overlap, features).to(x.device) # Get the values for the folded tensor for i in range(num_folds): From 80f5e39e56fe862eba0248398d4c755232a70d60 Mon Sep 17 00:00:00 2001 From: sanjaesc Date: Sun, 25 Oct 2020 09:47:04 +0100 Subject: [PATCH 16/98] add model params to config --- TTS/vocoder/utils/generic_utils.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/TTS/vocoder/utils/generic_utils.py b/TTS/vocoder/utils/generic_utils.py index c16fa1ae..f9fbba52 100644 --- a/TTS/vocoder/utils/generic_utils.py +++ b/TTS/vocoder/utils/generic_utils.py @@ -47,18 +47,18 @@ def setup_wavernn(c): MyModel = importlib.import_module("TTS.vocoder.models.wavernn") MyModel = getattr(MyModel, "WaveRNN") model = MyModel( - rnn_dims=512, - fc_dims=512, + rnn_dims=c.wavernn_model_params['rnn_dims'], + fc_dims=c.wavernn_model_params['fc_dims'], mode=c.mode, mulaw=c.mulaw, pad=c.padding, - use_aux_net=c.use_aux_net, - use_upsample_net=c.use_upsample_net, - upsample_factors=c.upsample_factors, - feat_dims=80, - compute_dims=128, - res_out_dims=128, - res_blocks=10, + use_aux_net=c.wavernn_model_params['use_aux_net'], + use_upsample_net=c.wavernn_model_params['use_upsample_net'], + upsample_factors=c.wavernn_model_params['upsample_factors'], + feat_dims=c.audio['num_mels'], + compute_dims=c.wavernn_model_params['compute_dims'], + res_out_dims=c.wavernn_model_params['res_out_dims'], + num_res_blocks=c.wavernn_model_params['num_res_blocks'], hop_length=c.audio["hop_length"], sample_rate=c.audio["sample_rate"], ) From d158ec0806d545d7a053542670e0c02969c89503 Mon Sep 17 00:00:00 2001 From: sanjaesc Date: Sun, 25 Oct 2020 10:04:24 +0100 Subject: [PATCH 17/98] fix pylint once again --- TTS/bin/train_wavernn_vocoder.py | 1 - TTS/vocoder/datasets/wavernn_dataset.py | 1 - tests/test_vocoder_wavernn.py | 2 +- tests/test_vocoder_wavernn_datasets.py | 17 +++++++++-------- 4 files changed, 10 insertions(+), 11 deletions(-) diff --git a/TTS/bin/train_wavernn_vocoder.py b/TTS/bin/train_wavernn_vocoder.py index 91a62cbe..61664a65 100644 --- a/TTS/bin/train_wavernn_vocoder.py +++ b/TTS/bin/train_wavernn_vocoder.py @@ -28,7 +28,6 @@ from TTS.utils.generic_utils import ( ) from TTS.vocoder.datasets.wavernn_dataset import WaveRNNDataset from TTS.vocoder.datasets.preprocess import ( - find_feat_files, 
load_wav_data, load_wav_feat_data ) diff --git a/TTS/vocoder/datasets/wavernn_dataset.py b/TTS/vocoder/datasets/wavernn_dataset.py index 3dbb2194..9c1ded96 100644 --- a/TTS/vocoder/datasets/wavernn_dataset.py +++ b/TTS/vocoder/datasets/wavernn_dataset.py @@ -1,7 +1,6 @@ import torch import numpy as np from torch.utils.data import Dataset -from multiprocessing import Manager class WaveRNNDataset(Dataset): diff --git a/tests/test_vocoder_wavernn.py b/tests/test_vocoder_wavernn.py index fdb338f9..ccd71c56 100644 --- a/tests/test_vocoder_wavernn.py +++ b/tests/test_vocoder_wavernn.py @@ -17,7 +17,7 @@ def test_wavernn(): feat_dims=80, compute_dims=128, res_out_dims=128, - res_blocks=10, + num_res_blocks=10, hop_length=256, sample_rate=22050, ) diff --git a/tests/test_vocoder_wavernn_datasets.py b/tests/test_vocoder_wavernn_datasets.py index 0f4e939a..a95e247a 100644 --- a/tests/test_vocoder_wavernn_datasets.py +++ b/tests/test_vocoder_wavernn_datasets.py @@ -23,7 +23,7 @@ test_quant_feat_path = os.path.join(test_data_path, "quant") ok_ljspeech = os.path.exists(test_data_path) -def wavernn_dataset_case(batch_size, seq_len, hop_len, pad, mode, num_workers): +def wavernn_dataset_case(batch_size, seq_len, hop_len, pad, mode, mulaw, num_workers): """ run dataloader with given parameters and check conditions """ ap = AudioProcessor(**C.audio) @@ -42,6 +42,7 @@ def wavernn_dataset_case(batch_size, seq_len, hop_len, pad, mode, num_workers): hop_len=hop_len, pad=pad, mode=mode, + mulaw=mulaw ) # sampler = DistributedSampler(dataset) if num_gpus > 1 else None loader = DataLoader(dataset, @@ -78,13 +79,13 @@ def wavernn_dataset_case(batch_size, seq_len, hop_len, pad, mode, num_workers): def test_parametrized_wavernn_dataset(): ''' test dataloader with different parameters ''' params = [ - [16, C.audio['hop_length'] * 10, C.audio['hop_length'], 2, 10, 0], - [16, C.audio['hop_length'] * 10, C.audio['hop_length'], 2, "mold", 4], - [1, C.audio['hop_length'] * 10, C.audio['hop_length'], 2, 9, 0], - [1, C.audio['hop_length'], C.audio['hop_length'], 2, 10, 0], - [1, C.audio['hop_length'], C.audio['hop_length'], 2, "mold", 0], - [1, C.audio['hop_length'] * 5, C.audio['hop_length'], 4, 10, 2], - [1, C.audio['hop_length'] * 5, C.audio['hop_length'], 2, "mold", 0], + [16, C.audio['hop_length'] * 10, C.audio['hop_length'], 2, 10, True, 0], + [16, C.audio['hop_length'] * 10, C.audio['hop_length'], 2, "mold", False, 4], + [1, C.audio['hop_length'] * 10, C.audio['hop_length'], 2, 9, False, 0], + [1, C.audio['hop_length'], C.audio['hop_length'], 2, 10, True, 0], + [1, C.audio['hop_length'], C.audio['hop_length'], 2, "mold", False, 0], + [1, C.audio['hop_length'] * 5, C.audio['hop_length'], 4, 10, False, 2], + [1, C.audio['hop_length'] * 5, C.audio['hop_length'], 2, "mold", False, 0], ] for param in params: print(param) From d9540a5857d79dcfd260c776988cd03ad6d02b2a Mon Sep 17 00:00:00 2001 From: Edresson Date: Sun, 25 Oct 2020 15:08:28 -0300 Subject: [PATCH 18/98] add blank token in sequence for encrease glowtts results --- TTS/bin/train_glow_tts.py | 1 + TTS/bin/train_tts.py | 1 + TTS/tts/configs/glow_tts_gated_conv.json | 2 ++ TTS/tts/configs/glow_tts_tdsep.json | 2 ++ TTS/tts/datasets/TTSDataset.py | 6 ++++-- TTS/tts/utils/synthesis.py | 7 +++++-- TTS/tts/utils/text/__init__.py | 14 +++++++++++--- 7 files changed, 26 insertions(+), 7 deletions(-) diff --git a/TTS/bin/train_glow_tts.py b/TTS/bin/train_glow_tts.py index 7ffca36e..f4d04abb 100644 --- a/TTS/bin/train_glow_tts.py +++ b/TTS/bin/train_glow_tts.py @@ -47,6 
+47,7 @@ def setup_loader(ap, r, is_val=False, verbose=False, speaker_mapping=None): meta_data=meta_data_eval if is_val else meta_data_train, ap=ap, tp=c.characters if 'characters' in c.keys() else None, + add_blank=c['add_blank'] if 'add_blank' in c.keys() else False, batch_group_size=0 if is_val else c.batch_group_size * c.batch_size, min_seq_len=c.min_seq_len, diff --git a/TTS/bin/train_tts.py b/TTS/bin/train_tts.py index 4c615b99..e4f8bf7a 100644 --- a/TTS/bin/train_tts.py +++ b/TTS/bin/train_tts.py @@ -51,6 +51,7 @@ def setup_loader(ap, r, is_val=False, verbose=False, speaker_mapping=None): meta_data=meta_data_eval if is_val else meta_data_train, ap=ap, tp=c.characters if 'characters' in c.keys() else None, + add_blank=c['add_blank'] if 'add_blank' in c.keys() else False, batch_group_size=0 if is_val else c.batch_group_size * c.batch_size, min_seq_len=c.min_seq_len, diff --git a/TTS/tts/configs/glow_tts_gated_conv.json b/TTS/tts/configs/glow_tts_gated_conv.json index 696bdaf7..5c30e0bc 100644 --- a/TTS/tts/configs/glow_tts_gated_conv.json +++ b/TTS/tts/configs/glow_tts_gated_conv.json @@ -51,6 +51,8 @@ // "phonemes":"iyɨʉɯuɪʏʊeøɘəɵɤoɛœɜɞʌɔæɐaɶɑɒᵻʘɓǀɗǃʄǂɠǁʛpbtdʈɖcɟkɡqɢʔɴŋɲɳnɱmʙrʀⱱɾɽɸβfvθðszʃʒʂʐçʝxɣχʁħʕhɦɬɮʋɹɻjɰlɭʎʟˈˌːˑʍwɥʜʢʡɕʑɺɧɚ˞ɫ" // }, + "add_blank": false, // if true add a new token after each token of the sentence. This increases the size of the input sequence, but has considerably improved the prosody of the GlowTTS model. + // DISTRIBUTED TRAINING "distributed":{ "backend": "nccl", diff --git a/TTS/tts/configs/glow_tts_tdsep.json b/TTS/tts/configs/glow_tts_tdsep.json index 67047523..25d41291 100644 --- a/TTS/tts/configs/glow_tts_tdsep.json +++ b/TTS/tts/configs/glow_tts_tdsep.json @@ -51,6 +51,8 @@ // "phonemes":"iyɨʉɯuɪʏʊeøɘəɵɤoɛœɜɞʌɔæɐaɶɑɒᵻʘɓǀɗǃʄǂɠǁʛpbtdʈɖcɟkɡqɢʔɴŋɲɳnɱmʙrʀⱱɾɽɸβfvθðszʃʒʂʐçʝxɣχʁħʕhɦɬɮʋɹɻjɰlɭʎʟˈˌːˑʍwɥʜʢʡɕʑɺɧɚ˞ɫ" // }, + "add_blank": false, // if true add a new token after each token of the sentence. This increases the size of the input sequence, but has considerably improved the prosody of the GlowTTS model. 
+ // DISTRIBUTED TRAINING "distributed":{ "backend": "nccl", diff --git a/TTS/tts/datasets/TTSDataset.py b/TTS/tts/datasets/TTSDataset.py index ab8f3f88..7b671397 100644 --- a/TTS/tts/datasets/TTSDataset.py +++ b/TTS/tts/datasets/TTSDataset.py @@ -17,6 +17,7 @@ class MyDataset(Dataset): ap, meta_data, tp=None, + add_blank=False, batch_group_size=0, min_seq_len=0, max_seq_len=float("inf"), @@ -55,6 +56,7 @@ class MyDataset(Dataset): self.max_seq_len = max_seq_len self.ap = ap self.tp = tp + self.add_blank = add_blank self.use_phonemes = use_phonemes self.phoneme_cache_path = phoneme_cache_path self.phoneme_language = phoneme_language @@ -88,7 +90,7 @@ class MyDataset(Dataset): phonemes = phoneme_to_sequence(text, [self.cleaners], language=self.phoneme_language, enable_eos_bos=False, - tp=self.tp) + tp=self.tp, add_blank=self.add_blank) phonemes = np.asarray(phonemes, dtype=np.int32) np.save(cache_path, phonemes) return phonemes @@ -127,7 +129,7 @@ class MyDataset(Dataset): text = self._load_or_generate_phoneme_sequence(wav_file, text) else: text = np.asarray(text_to_sequence(text, [self.cleaners], - tp=self.tp), + tp=self.tp, add_blank=self.add_blank), dtype=np.int32) assert text.size > 0, self.items[idx][1] diff --git a/TTS/tts/utils/synthesis.py b/TTS/tts/utils/synthesis.py index 0dfea5cc..3d2dd13c 100644 --- a/TTS/tts/utils/synthesis.py +++ b/TTS/tts/utils/synthesis.py @@ -14,10 +14,13 @@ def text_to_seqvec(text, CONFIG): seq = np.asarray( phoneme_to_sequence(text, text_cleaner, CONFIG.phoneme_language, CONFIG.enable_eos_bos_chars, - tp=CONFIG.characters if 'characters' in CONFIG.keys() else None), + tp=CONFIG.characters if 'characters' in CONFIG.keys() else None, + add_blank=CONFIG['add_blank'] if 'add_blank' in CONFIG.keys() else False), dtype=np.int32) else: - seq = np.asarray(text_to_sequence(text, text_cleaner, tp=CONFIG.characters if 'characters' in CONFIG.keys() else None), dtype=np.int32) + seq = np.asarray( + text_to_sequence(text, text_cleaner, tp=CONFIG.characters if 'characters' in CONFIG.keys() else None, + add_blank=CONFIG['add_blank'] if 'add_blank' in CONFIG.keys() else False), dtype=np.int32) return seq diff --git a/TTS/tts/utils/text/__init__.py b/TTS/tts/utils/text/__init__.py index 33972f25..eab7a689 100644 --- a/TTS/tts/utils/text/__init__.py +++ b/TTS/tts/utils/text/__init__.py @@ -57,6 +57,10 @@ def text2phone(text, language): return ph +def intersperse(sequence, token): + result = [token] * (len(sequence) * 2 + 1) + result[1::2] = sequence + return result def pad_with_eos_bos(phoneme_sequence, tp=None): # pylint: disable=global-statement @@ -69,8 +73,7 @@ def pad_with_eos_bos(phoneme_sequence, tp=None): return [_phonemes_to_id[_bos]] + list(phoneme_sequence) + [_phonemes_to_id[_eos]] - -def phoneme_to_sequence(text, cleaner_names, language, enable_eos_bos=False, tp=None): +def phoneme_to_sequence(text, cleaner_names, language, enable_eos_bos=False, tp=None, add_blank=False): # pylint: disable=global-statement global _phonemes_to_id if tp: @@ -88,6 +91,8 @@ def phoneme_to_sequence(text, cleaner_names, language, enable_eos_bos=False, tp= # Append EOS char if enable_eos_bos: sequence = pad_with_eos_bos(sequence, tp=tp) + if add_blank: + sequence = intersperse(sequence, len(_phonemes)) # add a blank token (new), whose id number is len(_phonemes) return sequence @@ -107,7 +112,7 @@ def sequence_to_phoneme(sequence, tp=None): return result.replace('}{', ' ') -def text_to_sequence(text, cleaner_names, tp=None): +def text_to_sequence(text, cleaner_names, tp=None, 
add_blank=False): '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text. The text can optionally have ARPAbet sequences enclosed in curly braces embedded @@ -137,6 +142,9 @@ def text_to_sequence(text, cleaner_names, tp=None): _clean_text(m.group(1), cleaner_names)) sequence += _arpabet_to_sequence(m.group(2)) text = m.group(3) + + if add_blank: + sequence = intersperse(sequence, len(_symbols)) # add a blank token (new), whose id number is len(_symbols) return sequence From e98215600919b915631c39b079537be18930bf67 Mon Sep 17 00:00:00 2001 From: erogol Date: Mon, 26 Oct 2020 17:16:16 +0100 Subject: [PATCH 19/98] small updates --- TTS/bin/train_wavernn_vocoder.py | 1 - TTS/vocoder/configs/wavernn_config.json | 30 ++++++++++++------------- 2 files changed, 15 insertions(+), 16 deletions(-) diff --git a/TTS/bin/train_wavernn_vocoder.py b/TTS/bin/train_wavernn_vocoder.py index 61664a65..90e30256 100644 --- a/TTS/bin/train_wavernn_vocoder.py +++ b/TTS/bin/train_wavernn_vocoder.py @@ -95,7 +95,6 @@ def train(model, optimizer, criterion, scheduler, ap, global_step, epoch): end_time = time.time() c_logger.print_train_start() # train loop - print(" > Training", flush=True) for num_iter, data in enumerate(data_loader): start_time = time.time() x_input, mels, y_coarse = format_data(data) diff --git a/TTS/vocoder/configs/wavernn_config.json b/TTS/vocoder/configs/wavernn_config.json index 9a9fbdae..8f290b80 100644 --- a/TTS/vocoder/configs/wavernn_config.json +++ b/TTS/vocoder/configs/wavernn_config.json @@ -1,7 +1,7 @@ { "run_name": "wavernn_test", "run_description": "wavernn_test training", - + // AUDIO PARAMETERS "audio": { "fft_size": 1024, // number of stft frequency levels. Size of the linear spectogram frame. @@ -29,7 +29,7 @@ "clip_norm": true, // clip normalized values into the range. "stats_path": null // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based notmalization is used and other normalization params are ignored }, - + // Generating / Synthesizing "batched": true, "target_samples": 11000, // target number of samples to be generated in each batch entry @@ -39,11 +39,11 @@ // "backend": "nccl", // "url": "tcp:\/\/localhost:54321" // }, - -// MODEL MODE - "mode": 10, // mold [string], gauss [string], bits [int] + +// MODEL MODE + "mode": "mold", // mold [string], gauss [string], bits [int] "mulaw": true, // apply mulaw if mode is bits - + // MODEL PARAMETERS "wavernn_model_params": { "rnn_dims": 512, @@ -55,14 +55,14 @@ "use_upsample_net": true, "upsample_factors": [4, 8, 8] // this needs to correctly factorise hop_length }, - + // DATASET //"use_gta": true, // use computed gta features from the tts model - "data_path": "/media/alexander/LinuxFS/SpeechData/GothicSpeech/NPC_Speech", // path containing training wav files + "data_path": "/home/erogol/Data/LJSpeech-1.1/wavs/", // path containing training wav files "feature_path": null, // path containing computed features from wav files if null compute them "seq_len": 1280, // has to be devideable by hop_length "padding": 2, // pad the input for resnet to see wider input length - + // TRAINING "batch_size": 64, // Batch size for training. "epochs": 10000, // total number of epochs to train. 
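
Two of the config fields above tie directly into the WaveRNN model added later in this series: "mode" selects the output distribution (a mixture of 10 logistics for "mold", a single Gaussian for "gauss", or a 2**bits softmax when an integer is given), and "upsample_factors" must multiply out to hop_length so that one mel frame is stretched to exactly one hop of samples. A quick illustrative sanity check, assuming the usual hop_length of 256 from the audio section; this sketch is not code from the repository:

    import numpy as np

    audio = {"hop_length": 256}                              # assumed from the audio config
    cfg = {"mode": "mold", "upsample_factors": [4, 8, 8]}    # values from the file above

    # The upsample factors have to factorise hop_length exactly: 4 * 8 * 8 == 256.
    total_scale = int(np.prod(cfg["upsample_factors"]))
    assert total_scale == audio["hop_length"], "upsample_factors must multiply to hop_length"

    # "mode" decides how many values the network emits per audio sample.
    mode = cfg["mode"]
    if isinstance(mode, int):        # e.g. 10 -> softmax over 2**10 quantisation levels
        n_out = 2 ** mode
    elif mode == "mold":             # 10 logistic mixtures -> weight, mean, scale for each
        n_out = 3 * 10
    elif mode == "gauss":            # single Gaussian -> mean and log-scale
        n_out = 2
    print(total_scale, n_out)        # 256 30
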
@@ -70,7 +70,7 @@ // VALIDATION "run_eval": true, "test_every_epochs": 10, // Test after set number of epochs (Test every 10 epochs for example) - + // OPTIMIZER "grad_clip": 4, // apply gradient clipping if > 0 "lr_scheduler": "MultiStepLR", // one of the schedulers from https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate @@ -79,19 +79,19 @@ "milestones": [200000, 400000, 600000] }, "lr": 1e-4, // initial learning rate - + // TENSORBOARD and LOGGING "print_step": 25, // Number of steps to log traning on console. "print_eval": false, // If True, it prints loss values for each step in eval run. "save_step": 25000, // Number of training steps expected to plot training stats on TB and save model checkpoints. "checkpoint": true, // If true, it saves checkpoints per "save_step" "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging. - + // DATA LOADING "num_loader_workers": 4, // number of training data loader processes. Don't set it too big. 4-8 are good values. "num_val_loader_workers": 4, // number of evaluation data loader processes. - "eval_split_size": 50, // number of samples for testing - + "eval_split_size": 50, // number of samples for testing + // PATHS - "output_path": "output/training/path" + "output_path": "/home/erogol/Models/LJSpeech/" } From 89e9bfe3a2a5e0cac2180078fa79fa934d8cc4ba Mon Sep 17 00:00:00 2001 From: Edresson Date: Mon, 26 Oct 2020 17:41:23 -0300 Subject: [PATCH 20/98] add text processing blank token test --- TTS/tts/utils/text/__init__.py | 19 +++++--- tests/test_text_processing.py | 81 +++++++++++++++++++++++++++++++++- 2 files changed, 93 insertions(+), 7 deletions(-) diff --git a/TTS/tts/utils/text/__init__.py b/TTS/tts/utils/text/__init__.py index eab7a689..29f4af1d 100644 --- a/TTS/tts/utils/text/__init__.py +++ b/TTS/tts/utils/text/__init__.py @@ -16,6 +16,8 @@ _id_to_symbol = {i: s for i, s in enumerate(symbols)} _phonemes_to_id = {s: i for i, s in enumerate(phonemes)} _id_to_phonemes = {i: s for i, s in enumerate(phonemes)} +_symbols = symbols +_phonemes = phonemes # Regular expression matching text enclosed in curly braces: _CURLY_RE = re.compile(r'(.*?)\{(.+?)\}(.*)') @@ -75,7 +77,7 @@ def pad_with_eos_bos(phoneme_sequence, tp=None): def phoneme_to_sequence(text, cleaner_names, language, enable_eos_bos=False, tp=None, add_blank=False): # pylint: disable=global-statement - global _phonemes_to_id + global _phonemes_to_id, _phonemes if tp: _, _phonemes = make_symbols(**tp) _phonemes_to_id = {s: i for i, s in enumerate(_phonemes)} @@ -96,10 +98,12 @@ def phoneme_to_sequence(text, cleaner_names, language, enable_eos_bos=False, tp= return sequence -def sequence_to_phoneme(sequence, tp=None): +def sequence_to_phoneme(sequence, tp=None, add_blank=False): # pylint: disable=global-statement '''Converts a sequence of IDs back to a string''' - global _id_to_phonemes + global _id_to_phonemes, _phonemes + if add_blank: + sequence = list(filter(lambda x: x != len(_phonemes), sequence)) result = '' if tp: _, _phonemes = make_symbols(**tp) @@ -126,7 +130,7 @@ def text_to_sequence(text, cleaner_names, tp=None, add_blank=False): List of integers corresponding to the symbols in the text ''' # pylint: disable=global-statement - global _symbol_to_id + global _symbol_to_id, _symbols if tp: _symbols, _ = make_symbols(**tp) _symbol_to_id = {s: i for i, s in enumerate(_symbols)} @@ -148,10 +152,13 @@ def text_to_sequence(text, cleaner_names, tp=None, add_blank=False): return sequence 
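
Taken together, the changes above mean that with add_blank enabled a sequence of N symbol IDs grows to 2N+1 IDs, the blank taking the first unused ID (len(_symbols) or len(_phonemes)), and decoding simply filters those IDs back out. A minimal, self-contained illustration of the intersperse logic from this patch, using a toy symbol table rather than the real character set:

    def intersperse(sequence, token):
        # Put `token` before, between and after every item: length becomes 2 * len + 1.
        result = [token] * (len(sequence) * 2 + 1)
        result[1::2] = sequence
        return result

    symbols = ["a", "b", "c"]        # toy symbol set, not the repository's
    blank_id = len(symbols)          # the blank gets the first unused ID, here 3

    ids = [0, 2, 1]                  # "a c b"
    with_blanks = intersperse(ids, blank_id)
    assert with_blanks == [3, 0, 3, 2, 3, 1, 3]

    # Decoding drops the blanks again, mirroring sequence_to_text(..., add_blank=True).
    decoded = [symbols[i] for i in with_blanks if i != blank_id]
    assert decoded == ["a", "c", "b"]

The doubled input length is the price paid for the prosody improvement the GlowTTS config comment refers to.
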
-def sequence_to_text(sequence, tp=None): +def sequence_to_text(sequence, tp=None, add_blank=False): '''Converts a sequence of IDs back to a string''' # pylint: disable=global-statement - global _id_to_symbol + global _id_to_symbol, _symbols + if add_blank: + sequence = list(filter(lambda x: x != len(_symbols), sequence)) + if tp: _symbols, _ = make_symbols(**tp) _id_to_symbol = {i: s for i, s in enumerate(_symbols)} diff --git a/tests/test_text_processing.py b/tests/test_text_processing.py index 1eb9f9a8..ae3250a8 100644 --- a/tests/test_text_processing.py +++ b/tests/test_text_processing.py @@ -11,6 +11,7 @@ from TTS.utils.io import load_config conf = load_config(os.path.join(get_tests_input_path(), 'test_config.json')) def test_phoneme_to_sequence(): + text = "Recent research at Harvard has shown meditating for as little as 8 weeks can actually increase, the grey matter in the parts of the brain responsible for emotional regulation and learning!" text_cleaner = ["phoneme_cleaners"] lang = "en-us" @@ -20,7 +21,7 @@ def test_phoneme_to_sequence(): text_hat_with_params = sequence_to_phoneme(sequence, tp=conf.characters) gt = "ɹiːsənt ɹɪsɜːtʃ æt hɑːɹvɚd hɐz ʃoʊn mɛdᵻteɪɾɪŋ fɔːɹ æz lɪɾəl æz eɪt wiːks kæn æktʃuːəli ɪnkɹiːs, ðə ɡɹeɪ mæɾɚɹ ɪnðə pɑːɹts ʌvðə bɹeɪn ɹɪspɑːnsəbəl fɔːɹ ɪmoʊʃənəl ɹɛɡjuːleɪʃən ænd lɜːnɪŋ!" assert text_hat == text_hat_with_params == gt - + # multiple punctuations text = "Be a voice, not an! echo?" sequence = phoneme_to_sequence(text, text_cleaner, lang) @@ -87,6 +88,84 @@ def test_phoneme_to_sequence(): print(len(sequence)) assert text_hat == text_hat_with_params == gt +def test_phoneme_to_sequence_with_blank_token(): + + text = "Recent research at Harvard has shown meditating for as little as 8 weeks can actually increase, the grey matter in the parts of the brain responsible for emotional regulation and learning!" + text_cleaner = ["phoneme_cleaners"] + lang = "en-us" + sequence = phoneme_to_sequence(text, text_cleaner, lang) + text_hat = sequence_to_phoneme(sequence) + _ = phoneme_to_sequence(text, text_cleaner, lang, tp=conf.characters, add_blank=True) + text_hat_with_params = sequence_to_phoneme(sequence, tp=conf.characters, add_blank=True) + gt = "ɹiːsənt ɹɪsɜːtʃ æt hɑːɹvɚd hɐz ʃoʊn mɛdᵻteɪɾɪŋ fɔːɹ æz lɪɾəl æz eɪt wiːks kæn æktʃuːəli ɪnkɹiːs, ðə ɡɹeɪ mæɾɚɹ ɪnðə pɑːɹts ʌvðə bɹeɪn ɹɪspɑːnsəbəl fɔːɹ ɪmoʊʃənəl ɹɛɡjuːleɪʃən ænd lɜːnɪŋ!" + assert text_hat == text_hat_with_params == gt + + # multiple punctuations + text = "Be a voice, not an! echo?" + sequence = phoneme_to_sequence(text, text_cleaner, lang) + text_hat = sequence_to_phoneme(sequence) + _ = phoneme_to_sequence(text, text_cleaner, lang, tp=conf.characters, add_blank=True) + text_hat_with_params = sequence_to_phoneme(sequence, tp=conf.characters, add_blank=True) + gt = "biː ɐ vɔɪs, nɑːt ɐn! ɛkoʊ?" + print(text_hat) + print(len(sequence)) + assert text_hat == text_hat_with_params == gt + + # not ending with punctuation + text = "Be a voice, not an! echo" + sequence = phoneme_to_sequence(text, text_cleaner, lang) + text_hat = sequence_to_phoneme(sequence) + _ = phoneme_to_sequence(text, text_cleaner, lang, tp=conf.characters, add_blank=True) + text_hat_with_params = sequence_to_phoneme(sequence, tp=conf.characters, add_blank=True) + gt = "biː ɐ vɔɪs, nɑːt ɐn! ɛkoʊ" + print(text_hat) + print(len(sequence)) + assert text_hat == text_hat_with_params == gt + + # original + text = "Be a voice, not an echo!" 
+ sequence = phoneme_to_sequence(text, text_cleaner, lang) + text_hat = sequence_to_phoneme(sequence) + _ = phoneme_to_sequence(text, text_cleaner, lang, tp=conf.characters, add_blank=True) + text_hat_with_params = sequence_to_phoneme(sequence, tp=conf.characters, add_blank=True) + gt = "biː ɐ vɔɪs, nɑːt ɐn ɛkoʊ!" + print(text_hat) + print(len(sequence)) + assert text_hat == text_hat_with_params == gt + + # extra space after the sentence + text = "Be a voice, not an! echo. " + sequence = phoneme_to_sequence(text, text_cleaner, lang) + text_hat = sequence_to_phoneme(sequence) + _ = phoneme_to_sequence(text, text_cleaner, lang, tp=conf.characters, add_blank=True) + text_hat_with_params = sequence_to_phoneme(sequence, tp=conf.characters, add_blank=True) + gt = "biː ɐ vɔɪs, nɑːt ɐn! ɛkoʊ." + print(text_hat) + print(len(sequence)) + assert text_hat == text_hat_with_params == gt + + # extra space after the sentence + text = "Be a voice, not an! echo. " + sequence = phoneme_to_sequence(text, text_cleaner, lang, True) + text_hat = sequence_to_phoneme(sequence) + _ = phoneme_to_sequence(text, text_cleaner, lang, tp=conf.characters, add_blank=True) + text_hat_with_params = sequence_to_phoneme(sequence, tp=conf.characters, add_blank=True) + gt = "^biː ɐ vɔɪs, nɑːt ɐn! ɛkoʊ.~" + print(text_hat) + print(len(sequence)) + assert text_hat == text_hat_with_params == gt + + # padding char + text = "_Be a _voice, not an! echo_" + sequence = phoneme_to_sequence(text, text_cleaner, lang) + text_hat = sequence_to_phoneme(sequence) + _ = phoneme_to_sequence(text, text_cleaner, lang, tp=conf.characters, add_blank=True) + text_hat_with_params = sequence_to_phoneme(sequence, tp=conf.characters, add_blank=True) + gt = "biː ɐ vɔɪs, nɑːt ɐn! ɛkoʊ" + print(text_hat) + print(len(sequence)) + assert text_hat == text_hat_with_params == gt + def test_text2phone(): text = "Recent research at Harvard has shown meditating for as little as 8 weeks can actually increase, the grey matter in the parts of the brain responsible for emotional regulation and learning!" gt = "ɹ|iː|s|ə|n|t| |ɹ|ɪ|s|ɜː|tʃ| |æ|t| |h|ɑːɹ|v|ɚ|d| |h|ɐ|z| |ʃ|oʊ|n| |m|ɛ|d|ᵻ|t|eɪ|ɾ|ɪ|ŋ| |f|ɔː|ɹ| |æ|z| |l|ɪ|ɾ|əl| |æ|z| |eɪ|t| |w|iː|k|s| |k|æ|n| |æ|k|tʃ|uː|əl|i| |ɪ|n|k|ɹ|iː|s|,| |ð|ə| |ɡ|ɹ|eɪ| |m|æ|ɾ|ɚ|ɹ| |ɪ|n|ð|ə| |p|ɑːɹ|t|s| |ʌ|v|ð|ə| |b|ɹ|eɪ|n| |ɹ|ɪ|s|p|ɑː|n|s|ə|b|əl| |f|ɔː|ɹ| |ɪ|m|oʊ|ʃ|ə|n|əl| |ɹ|ɛ|ɡ|j|uː|l|eɪ|ʃ|ə|n| |æ|n|d| |l|ɜː|n|ɪ|ŋ|!" From 3894d44dc5ad7498394cc0222e1b8035b3ee76c5 Mon Sep 17 00:00:00 2001 From: erogol Date: Tue, 27 Oct 2020 12:14:27 +0100 Subject: [PATCH 21/98] update version --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 0127e84d..2893036a 100644 --- a/setup.py +++ b/setup.py @@ -33,7 +33,7 @@ args, unknown_args = parser.parse_known_args() # Remove our arguments from argv so that setuptools doesn't see them sys.argv = [sys.argv[0]] + unknown_args -version = '0.0.5' +version = '0.0.6' # Adapted from https://github.com/pytorch/pytorch cwd = os.path.dirname(os.path.abspath(__file__)) From 5ce04832ce750f029bbcc11076bae9674afa4adb Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 25 Sep 2020 19:19:13 +0000 Subject: [PATCH 22/98] Bump tensorflow from 2.3.0 to 2.3.1 Bumps [tensorflow](https://github.com/tensorflow/tensorflow) from 2.3.0 to 2.3.1. 
- [Release notes](https://github.com/tensorflow/tensorflow/releases) - [Changelog](https://github.com/tensorflow/tensorflow/blob/master/RELEASE.md) - [Commits](https://github.com/tensorflow/tensorflow/compare/v2.3.0...v2.3.1) Signed-off-by: dependabot[bot] --- requirements.txt | 2 +- requirements_tests.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 36387e4d..dda9dcb5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ torch>=1.5 -tensorflow==2.3.0 +tensorflow==2.3.1 numpy>=1.16.0 scipy>=0.19.0 numba==0.48 diff --git a/requirements_tests.txt b/requirements_tests.txt index 4e7f3435..5b833858 100644 --- a/requirements_tests.txt +++ b/requirements_tests.txt @@ -1,5 +1,5 @@ torch>=1.5 -tensorflow==2.3.0 +tensorflow==2.3.1 numpy>=1.16.0 scipy>=0.19.0 numba==0.48 From 6378fa2b075bb9d37220350b7aca2cfb2f74d3b0 Mon Sep 17 00:00:00 2001 From: Alex K Date: Thu, 15 Oct 2020 19:14:50 +0200 Subject: [PATCH 23/98] add initial wavernn support --- TTS/bin/compute_statistics.py | 53 +- ...{train_vocoder.py => train_gan_vocoder.py} | 344 ++++++------ TTS/bin/train_wavernn_vocoder.py | 493 ++++++++++++++++++ TTS/vocoder/configs/wavernn_config.json | 95 ++++ TTS/vocoder/datasets/preprocess.py | 8 +- TTS/vocoder/utils/generic_utils.py | 20 + 6 files changed, 838 insertions(+), 175 deletions(-) rename TTS/bin/{train_vocoder.py => train_gan_vocoder.py} (68%) create mode 100644 TTS/bin/train_wavernn_vocoder.py create mode 100644 TTS/vocoder/configs/wavernn_config.json diff --git a/TTS/bin/compute_statistics.py b/TTS/bin/compute_statistics.py index 1c6ef94d..9177c75b 100755 --- a/TTS/bin/compute_statistics.py +++ b/TTS/bin/compute_statistics.py @@ -11,20 +11,27 @@ from TTS.tts.datasets.preprocess import load_meta_data from TTS.utils.io import load_config from TTS.utils.audio import AudioProcessor + def main(): """Run preprocessing process.""" parser = argparse.ArgumentParser( - description="Compute mean and variance of spectrogtram features.") - parser.add_argument("--config_path", type=str, required=True, - help="TTS config file path to define audio processin parameters.") - parser.add_argument("--out_path", default=None, type=str, - help="directory to save the output file.") + description="Compute mean and variance of spectrogtram features." + ) + parser.add_argument( + "--config_path", + type=str, + required=True, + help="TTS config file path to define audio processin parameters.", + ) + parser.add_argument( + "--out_path", default=None, type=str, help="directory to save the output file." 
+ ) args = parser.parse_args() # load config CONFIG = load_config(args.config_path) - CONFIG.audio['signal_norm'] = False # do not apply earlier normalization - CONFIG.audio['stats_path'] = None # discard pre-defined stats + CONFIG.audio["signal_norm"] = False # do not apply earlier normalization + CONFIG.audio["stats_path"] = None # discard pre-defined stats # load audio processor ap = AudioProcessor(**CONFIG.audio) @@ -58,27 +65,27 @@ def main(): output_file_path = os.path.join(args.out_path, "scale_stats.npy") stats = {} - stats['mel_mean'] = mel_mean - stats['mel_std'] = mel_scale - stats['linear_mean'] = linear_mean - stats['linear_std'] = linear_scale + stats["mel_mean"] = mel_mean + stats["mel_std"] = mel_scale + stats["linear_mean"] = linear_mean + stats["linear_std"] = linear_scale - print(f' > Avg mel spec mean: {mel_mean.mean()}') - print(f' > Avg mel spec scale: {mel_scale.mean()}') - print(f' > Avg linear spec mean: {linear_mean.mean()}') - print(f' > Avg lienar spec scale: {linear_scale.mean()}') + print(f" > Avg mel spec mean: {mel_mean.mean()}") + print(f" > Avg mel spec scale: {mel_scale.mean()}") + print(f" > Avg linear spec mean: {linear_mean.mean()}") + print(f" > Avg lienar spec scale: {linear_scale.mean()}") # set default config values for mean-var scaling - CONFIG.audio['stats_path'] = output_file_path - CONFIG.audio['signal_norm'] = True + CONFIG.audio["stats_path"] = output_file_path + CONFIG.audio["signal_norm"] = True # remove redundant values - del CONFIG.audio['max_norm'] - del CONFIG.audio['min_level_db'] - del CONFIG.audio['symmetric_norm'] - del CONFIG.audio['clip_norm'] - stats['audio_config'] = CONFIG.audio + del CONFIG.audio["max_norm"] + del CONFIG.audio["min_level_db"] + del CONFIG.audio["symmetric_norm"] + del CONFIG.audio["clip_norm"] + stats["audio_config"] = CONFIG.audio np.save(output_file_path, stats, allow_pickle=True) - print(f' > scale_stats.npy is saved to {output_file_path}') + print(f" > scale_stats.npy is saved to {output_file_path}") if __name__ == "__main__": diff --git a/TTS/bin/train_vocoder.py b/TTS/bin/train_gan_vocoder.py similarity index 68% rename from TTS/bin/train_vocoder.py rename to TTS/bin/train_gan_vocoder.py index b51a55a3..7689c930 100644 --- a/TTS/bin/train_vocoder.py +++ b/TTS/bin/train_gan_vocoder.py @@ -10,20 +10,29 @@ import torch from torch.utils.data import DataLoader from TTS.utils.audio import AudioProcessor from TTS.utils.console_logger import ConsoleLogger -from TTS.utils.generic_utils import (KeepAverage, count_parameters, - create_experiment_folder, get_git_branch, - remove_experiment_folder, set_init_dict) +from TTS.utils.generic_utils import ( + KeepAverage, + count_parameters, + create_experiment_folder, + get_git_branch, + remove_experiment_folder, + set_init_dict, +) from TTS.utils.io import copy_config_file, load_config from TTS.utils.radam import RAdam from TTS.utils.tensorboard_logger import TensorboardLogger from TTS.utils.training import setup_torch_training_env from TTS.vocoder.datasets.gan_dataset import GANDataset from TTS.vocoder.datasets.preprocess import load_wav_data, load_wav_feat_data + # from distribute import (DistributedSampler, apply_gradient_allreduce, # init_distributed, reduce_tensor) from TTS.vocoder.layers.losses import DiscriminatorLoss, GeneratorLoss -from TTS.vocoder.utils.generic_utils import (plot_results, setup_discriminator, - setup_generator) +from TTS.vocoder.utils.generic_utils import ( + plot_results, + setup_discriminator, + setup_generator, +) from TTS.vocoder.utils.io 
import save_best_model, save_checkpoint use_cuda, num_gpus = setup_torch_training_env(True, True) @@ -33,27 +42,30 @@ def setup_loader(ap, is_val=False, verbose=False): if is_val and not c.run_eval: loader = None else: - dataset = GANDataset(ap=ap, - items=eval_data if is_val else train_data, - seq_len=c.seq_len, - hop_len=ap.hop_length, - pad_short=c.pad_short, - conv_pad=c.conv_pad, - is_training=not is_val, - return_segments=not is_val, - use_noise_augment=c.use_noise_augment, - use_cache=c.use_cache, - verbose=verbose) + dataset = GANDataset( + ap=ap, + items=eval_data if is_val else train_data, + seq_len=c.seq_len, + hop_len=ap.hop_length, + pad_short=c.pad_short, + conv_pad=c.conv_pad, + is_training=not is_val, + return_segments=not is_val, + use_noise_augment=c.use_noise_augment, + use_cache=c.use_cache, + verbose=verbose, + ) dataset.shuffle_mapping() # sampler = DistributedSampler(dataset) if num_gpus > 1 else None - loader = DataLoader(dataset, - batch_size=1 if is_val else c.batch_size, - shuffle=True, - drop_last=False, - sampler=None, - num_workers=c.num_val_loader_workers - if is_val else c.num_loader_workers, - pin_memory=False) + loader = DataLoader( + dataset, + batch_size=1 if is_val else c.batch_size, + shuffle=True, + drop_last=False, + sampler=None, + num_workers=c.num_val_loader_workers if is_val else c.num_loader_workers, + pin_memory=False, + ) return loader @@ -80,16 +92,26 @@ def format_data(data): return co, x, None, None -def train(model_G, criterion_G, optimizer_G, model_D, criterion_D, optimizer_D, - scheduler_G, scheduler_D, ap, global_step, epoch): +def train( + model_G, + criterion_G, + optimizer_G, + model_D, + criterion_D, + optimizer_D, + scheduler_G, + scheduler_D, + ap, + global_step, + epoch, +): data_loader = setup_loader(ap, is_val=False, verbose=(epoch == 0)) model_G.train() model_D.train() epoch_time = 0 keep_avg = KeepAverage() if use_cuda: - batch_n_iter = int( - len(data_loader.dataset) / (c.batch_size * num_gpus)) + batch_n_iter = int(len(data_loader.dataset) / (c.batch_size * num_gpus)) else: batch_n_iter = int(len(data_loader.dataset) / c.batch_size) end_time = time.time() @@ -145,16 +167,16 @@ def train(model_G, criterion_G, optimizer_G, model_D, criterion_D, optimizer_D, scores_fake = D_out_fake # compute losses - loss_G_dict = criterion_G(y_hat, y_G, scores_fake, feats_fake, - feats_real, y_hat_sub, y_G_sub) - loss_G = loss_G_dict['G_loss'] + loss_G_dict = criterion_G( + y_hat, y_G, scores_fake, feats_fake, feats_real, y_hat_sub, y_G_sub + ) + loss_G = loss_G_dict["G_loss"] # optimizer generator optimizer_G.zero_grad() loss_G.backward() if c.gen_clip_grad > 0: - torch.nn.utils.clip_grad_norm_(model_G.parameters(), - c.gen_clip_grad) + torch.nn.utils.clip_grad_norm_(model_G.parameters(), c.gen_clip_grad) optimizer_G.step() if scheduler_G is not None: scheduler_G.step() @@ -199,14 +221,13 @@ def train(model_G, criterion_G, optimizer_G, model_D, criterion_D, optimizer_D, # compute losses loss_D_dict = criterion_D(scores_fake, scores_real) - loss_D = loss_D_dict['D_loss'] + loss_D = loss_D_dict["D_loss"] # optimizer discriminator optimizer_D.zero_grad() loss_D.backward() if c.disc_clip_grad > 0: - torch.nn.utils.clip_grad_norm_(model_D.parameters(), - c.disc_clip_grad) + torch.nn.utils.clip_grad_norm_(model_D.parameters(), c.disc_clip_grad) optimizer_D.step() if scheduler_D is not None: scheduler_D.step() @@ -221,34 +242,40 @@ def train(model_G, criterion_G, optimizer_G, model_D, criterion_D, optimizer_D, epoch_time += step_time # get current 
learning rates - current_lr_G = list(optimizer_G.param_groups)[0]['lr'] - current_lr_D = list(optimizer_D.param_groups)[0]['lr'] + current_lr_G = list(optimizer_G.param_groups)[0]["lr"] + current_lr_D = list(optimizer_D.param_groups)[0]["lr"] # update avg stats update_train_values = dict() for key, value in loss_dict.items(): - update_train_values['avg_' + key] = value - update_train_values['avg_loader_time'] = loader_time - update_train_values['avg_step_time'] = step_time + update_train_values["avg_" + key] = value + update_train_values["avg_loader_time"] = loader_time + update_train_values["avg_step_time"] = step_time keep_avg.update_values(update_train_values) # print training stats if global_step % c.print_step == 0: log_dict = { - 'step_time': [step_time, 2], - 'loader_time': [loader_time, 4], + "step_time": [step_time, 2], + "loader_time": [loader_time, 4], "current_lr_G": current_lr_G, - "current_lr_D": current_lr_D + "current_lr_D": current_lr_D, } - c_logger.print_train_step(batch_n_iter, num_iter, global_step, - log_dict, loss_dict, keep_avg.avg_values) + c_logger.print_train_step( + batch_n_iter, + num_iter, + global_step, + log_dict, + loss_dict, + keep_avg.avg_values, + ) # plot step stats if global_step % 10 == 0: iter_stats = { "lr_G": current_lr_G, "lr_D": current_lr_D, - "step_time": step_time + "step_time": step_time, } iter_stats.update(loss_dict) tb_logger.tb_train_iter_stats(global_step, iter_stats) @@ -257,27 +284,28 @@ def train(model_G, criterion_G, optimizer_G, model_D, criterion_D, optimizer_D, if global_step % c.save_step == 0: if c.checkpoint: # save model - save_checkpoint(model_G, - optimizer_G, - scheduler_G, - model_D, - optimizer_D, - scheduler_D, - global_step, - epoch, - OUT_PATH, - model_losses=loss_dict) + save_checkpoint( + model_G, + optimizer_G, + scheduler_G, + model_D, + optimizer_D, + scheduler_D, + global_step, + epoch, + OUT_PATH, + model_losses=loss_dict, + ) # compute spectrograms - figures = plot_results(y_hat_vis, y_G, ap, global_step, - 'train') + figures = plot_results(y_hat_vis, y_G, ap, global_step, "train") tb_logger.tb_train_figures(global_step, figures) # Sample audio sample_voice = y_hat_vis[0].squeeze(0).detach().cpu().numpy() - tb_logger.tb_train_audios(global_step, - {'train/audio': sample_voice}, - c.audio["sample_rate"]) + tb_logger.tb_train_audios( + global_step, {"train/audio": sample_voice}, c.audio["sample_rate"] + ) end_time = time.time() # print epoch stats @@ -326,7 +354,6 @@ def evaluate(model_G, criterion_G, model_D, criterion_D, ap, global_step, epoch) y_hat = model_G.pqmf_synthesis(y_hat) y_G_sub = model_G.pqmf_analysis(y_G) - scores_fake, feats_fake, feats_real = None, None, None if global_step > c.steps_to_start_discriminator: @@ -352,8 +379,9 @@ def evaluate(model_G, criterion_G, model_D, criterion_D, ap, global_step, epoch) feats_fake, feats_real = None, None # compute losses - loss_G_dict = criterion_G(y_hat, y_G, scores_fake, feats_fake, - feats_real, y_hat_sub, y_G_sub) + loss_G_dict = criterion_G( + y_hat, y_G, scores_fake, feats_fake, feats_real, y_hat_sub, y_G_sub + ) loss_dict = dict() for key, value in loss_G_dict.items(): @@ -403,16 +431,15 @@ def evaluate(model_G, criterion_G, model_D, criterion_D, ap, global_step, epoch) else: loss_dict[key] = value.item() - step_time = time.time() - start_time epoch_time += step_time # update avg stats update_eval_values = dict() for key, value in loss_dict.items(): - update_eval_values['avg_' + key] = value - update_eval_values['avg_loader_time'] = loader_time - 
update_eval_values['avg_step_time'] = step_time + update_eval_values["avg_" + key] = value + update_eval_values["avg_loader_time"] = loader_time + update_eval_values["avg_step_time"] = step_time keep_avg.update_values(update_eval_values) # print eval stats @@ -420,13 +447,14 @@ def evaluate(model_G, criterion_G, model_D, criterion_D, ap, global_step, epoch) c_logger.print_eval_step(num_iter, loss_dict, keep_avg.avg_values) # compute spectrograms - figures = plot_results(y_hat, y_G, ap, global_step, 'eval') + figures = plot_results(y_hat, y_G, ap, global_step, "eval") tb_logger.tb_eval_figures(global_step, figures) # Sample audio sample_voice = y_hat[0].squeeze(0).detach().cpu().numpy() - tb_logger.tb_eval_audios(global_step, {'eval/audio': sample_voice}, - c.audio["sample_rate"]) + tb_logger.tb_eval_audios( + global_step, {"eval/audio": sample_voice}, c.audio["sample_rate"] + ) # synthesize a full voice data_loader.return_segments = False @@ -443,7 +471,9 @@ def main(args): # pylint: disable=redefined-outer-name print(f" > Loading wavs from: {c.data_path}") if c.feature_path is not None: print(f" > Loading features from: {c.feature_path}") - eval_data, train_data = load_wav_feat_data(c.data_path, c.feature_path, c.eval_split_size) + eval_data, train_data = load_wav_feat_data( + c.data_path, c.feature_path, c.eval_split_size + ) else: eval_data, train_data = load_wav_data(c.data_path, c.eval_split_size) @@ -461,17 +491,15 @@ def main(args): # pylint: disable=redefined-outer-name # setup optimizers optimizer_gen = RAdam(model_gen.parameters(), lr=c.lr_gen, weight_decay=0) - optimizer_disc = RAdam(model_disc.parameters(), - lr=c.lr_disc, - weight_decay=0) + optimizer_disc = RAdam(model_disc.parameters(), lr=c.lr_disc, weight_decay=0) # schedulers scheduler_gen = None scheduler_disc = None - if 'lr_scheduler_gen' in c: + if "lr_scheduler_gen" in c: scheduler_gen = getattr(torch.optim.lr_scheduler, c.lr_scheduler_gen) scheduler_gen = scheduler_gen(optimizer_gen, **c.lr_scheduler_gen_params) - if 'lr_scheduler_disc' in c: + if "lr_scheduler_disc" in c: scheduler_disc = getattr(torch.optim.lr_scheduler, c.lr_scheduler_disc) scheduler_disc = scheduler_disc(optimizer_disc, **c.lr_scheduler_disc_params) @@ -480,47 +508,46 @@ def main(args): # pylint: disable=redefined-outer-name criterion_disc = DiscriminatorLoss(c) if args.restore_path: - checkpoint = torch.load(args.restore_path, map_location='cpu') + checkpoint = torch.load(args.restore_path, map_location="cpu") try: print(" > Restoring Generator Model...") - model_gen.load_state_dict(checkpoint['model']) + model_gen.load_state_dict(checkpoint["model"]) print(" > Restoring Generator Optimizer...") - optimizer_gen.load_state_dict(checkpoint['optimizer']) + optimizer_gen.load_state_dict(checkpoint["optimizer"]) print(" > Restoring Discriminator Model...") - model_disc.load_state_dict(checkpoint['model_disc']) + model_disc.load_state_dict(checkpoint["model_disc"]) print(" > Restoring Discriminator Optimizer...") - optimizer_disc.load_state_dict(checkpoint['optimizer_disc']) - if 'scheduler' in checkpoint: + optimizer_disc.load_state_dict(checkpoint["optimizer_disc"]) + if "scheduler" in checkpoint: print(" > Restoring Generator LR Scheduler...") - scheduler_gen.load_state_dict(checkpoint['scheduler']) + scheduler_gen.load_state_dict(checkpoint["scheduler"]) # NOTE: Not sure if necessary scheduler_gen.optimizer = optimizer_gen - if 'scheduler_disc' in checkpoint: + if "scheduler_disc" in checkpoint: print(" > Restoring Discriminator LR Scheduler...") 
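
The restore block above also pins down the checkpoint layout that save_checkpoint and save_best_model are expected to write: a single dict carrying both networks plus their optimizer and scheduler state and the global step. A rough sketch of that layout with a hypothetical save_gan_checkpoint helper; the field names are inferred from the checkpoint[...] reads above, and the real save functions may store more:

    import torch

    def save_gan_checkpoint(path, model_G, optimizer_G, scheduler_G,
                            model_D, optimizer_D, scheduler_D, step):
        # Field names mirror the restore code above; anything beyond them is guesswork.
        state = {
            "model": model_G.state_dict(),
            "optimizer": optimizer_G.state_dict(),
            "model_disc": model_D.state_dict(),
            "optimizer_disc": optimizer_D.state_dict(),
            "step": step,
        }
        if scheduler_G is not None:
            state["scheduler"] = scheduler_G.state_dict()
        if scheduler_D is not None:
            state["scheduler_disc"] = scheduler_D.state_dict()
        torch.save(state, path)

    # Loading then mirrors the code above:
    #   checkpoint = torch.load(path, map_location="cpu")
    #   model_G.load_state_dict(checkpoint["model"])
    #   optimizer_G.load_state_dict(checkpoint["optimizer"])
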
- scheduler_disc.load_state_dict(checkpoint['scheduler_disc']) + scheduler_disc.load_state_dict(checkpoint["scheduler_disc"]) scheduler_disc.optimizer = optimizer_disc except RuntimeError: # retore only matching layers. print(" > Partial model initialization...") model_dict = model_gen.state_dict() - model_dict = set_init_dict(model_dict, checkpoint['model'], c) + model_dict = set_init_dict(model_dict, checkpoint["model"], c) model_gen.load_state_dict(model_dict) model_dict = model_disc.state_dict() - model_dict = set_init_dict(model_dict, checkpoint['model_disc'], c) + model_dict = set_init_dict(model_dict, checkpoint["model_disc"], c) model_disc.load_state_dict(model_dict) del model_dict # reset lr if not countinuining training. for group in optimizer_gen.param_groups: - group['lr'] = c.lr_gen + group["lr"] = c.lr_gen for group in optimizer_disc.param_groups: - group['lr'] = c.lr_disc + group["lr"] = c.lr_disc - print(" > Model restored from step %d" % checkpoint['step'], - flush=True) - args.restore_step = checkpoint['step'] + print(" > Model restored from step %d" % checkpoint["step"], flush=True) + args.restore_step = checkpoint["step"] else: args.restore_step = 0 @@ -539,75 +566,92 @@ def main(args): # pylint: disable=redefined-outer-name num_params = count_parameters(model_disc) print(" > Discriminator has {} parameters".format(num_params), flush=True) - if 'best_loss' not in locals(): - best_loss = float('inf') + if "best_loss" not in locals(): + best_loss = float("inf") global_step = args.restore_step for epoch in range(0, c.epochs): c_logger.print_epoch_start(epoch, c.epochs) - _, global_step = train(model_gen, criterion_gen, optimizer_gen, - model_disc, criterion_disc, optimizer_disc, - scheduler_gen, scheduler_disc, ap, global_step, - epoch) - eval_avg_loss_dict = evaluate(model_gen, criterion_gen, model_disc, criterion_disc, ap, - global_step, epoch) + _, global_step = train( + model_gen, + criterion_gen, + optimizer_gen, + model_disc, + criterion_disc, + optimizer_disc, + scheduler_gen, + scheduler_disc, + ap, + global_step, + epoch, + ) + eval_avg_loss_dict = evaluate( + model_gen, criterion_gen, model_disc, criterion_disc, ap, global_step, epoch + ) c_logger.print_epoch_end(epoch, eval_avg_loss_dict) target_loss = eval_avg_loss_dict[c.target_loss] - best_loss = save_best_model(target_loss, - best_loss, - model_gen, - optimizer_gen, - scheduler_gen, - model_disc, - optimizer_disc, - scheduler_disc, - global_step, - epoch, - OUT_PATH, - model_losses=eval_avg_loss_dict) + best_loss = save_best_model( + target_loss, + best_loss, + model_gen, + optimizer_gen, + scheduler_gen, + model_disc, + optimizer_disc, + scheduler_disc, + global_step, + epoch, + OUT_PATH, + model_losses=eval_avg_loss_dict, + ) -if __name__ == '__main__': +if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument( - '--continue_path', + "--continue_path", type=str, - help= - 'Training output folder to continue training. Use to continue a training. If it is used, "config_path" is ignored.', - default='', - required='--config_path' not in sys.argv) + help='Training output folder to continue training. Use to continue a training. If it is used, "config_path" is ignored.', + default="", + required="--config_path" not in sys.argv, + ) parser.add_argument( - '--restore_path', + "--restore_path", type=str, - help='Model file to be restored. 
Use to finetune a model.', - default='') - parser.add_argument('--config_path', - type=str, - help='Path to config file for training.', - required='--continue_path' not in sys.argv) - parser.add_argument('--debug', - type=bool, - default=False, - help='Do not verify commit integrity to run training.') + help="Model file to be restored. Use to finetune a model.", + default="", + ) + parser.add_argument( + "--config_path", + type=str, + help="Path to config file for training.", + required="--continue_path" not in sys.argv, + ) + parser.add_argument( + "--debug", + type=bool, + default=False, + help="Do not verify commit integrity to run training.", + ) # DISTRUBUTED parser.add_argument( - '--rank', + "--rank", type=int, default=0, - help='DISTRIBUTED: process rank for distributed training.') - parser.add_argument('--group_id', - type=str, - default="", - help='DISTRIBUTED: process group id.') + help="DISTRIBUTED: process rank for distributed training.", + ) + parser.add_argument( + "--group_id", type=str, default="", help="DISTRIBUTED: process group id." + ) args = parser.parse_args() - if args.continue_path != '': + if args.continue_path != "": args.output_path = args.continue_path - args.config_path = os.path.join(args.continue_path, 'config.json') + args.config_path = os.path.join(args.continue_path, "config.json") list_of_files = glob.glob( - args.continue_path + - "/*.pth.tar") # * means all if need specific format then *.csv + args.continue_path + "/*.pth.tar" + ) # * means all if need specific format then *.csv latest_model_file = max(list_of_files, key=os.path.getctime) args.restore_path = latest_model_file print(f" > Training continues for {args.restore_path}") @@ -618,11 +662,10 @@ if __name__ == '__main__': _ = os.path.dirname(os.path.realpath(__file__)) OUT_PATH = args.continue_path - if args.continue_path == '': - OUT_PATH = create_experiment_folder(c.output_path, c.run_name, - args.debug) + if args.continue_path == "": + OUT_PATH = create_experiment_folder(c.output_path, c.run_name, args.debug) - AUDIO_PATH = os.path.join(OUT_PATH, 'test_audios') + AUDIO_PATH = os.path.join(OUT_PATH, "test_audios") c_logger = ConsoleLogger() @@ -632,16 +675,17 @@ if __name__ == '__main__': if args.restore_path: new_fields["restore_path"] = args.restore_path new_fields["github_branch"] = get_git_branch() - copy_config_file(args.config_path, - os.path.join(OUT_PATH, 'config.json'), new_fields) + copy_config_file( + args.config_path, os.path.join(OUT_PATH, "config.json"), new_fields + ) os.chmod(AUDIO_PATH, 0o775) os.chmod(OUT_PATH, 0o775) LOG_DIR = OUT_PATH - tb_logger = TensorboardLogger(LOG_DIR, model_name='VOCODER') + tb_logger = TensorboardLogger(LOG_DIR, model_name="VOCODER") # write model desc to tensorboard - tb_logger.tb_add_text('model-description', c['run_description'], 0) + tb_logger.tb_add_text("model-description", c["run_description"], 0) try: main(args) @@ -654,4 +698,4 @@ if __name__ == '__main__': except Exception: # pylint: disable=broad-except remove_experiment_folder(OUT_PATH) traceback.print_exc() - sys.exit(1) + sys.exit(1) \ No newline at end of file diff --git a/TTS/bin/train_wavernn_vocoder.py b/TTS/bin/train_wavernn_vocoder.py new file mode 100644 index 00000000..2f77ab57 --- /dev/null +++ b/TTS/bin/train_wavernn_vocoder.py @@ -0,0 +1,493 @@ +import argparse +import math +import os +import pickle +import shutil +import sys +import traceback +import time +import glob +import random + +import torch +from torch.utils.data import DataLoader +from torch.utils.data.distributed 
import DistributedSampler + + +from TTS.utils.audio import AudioProcessor +from TTS.tts.utils.visual import plot_spectrogram +from TTS.utils.io import copy_config_file, load_config +from TTS.vocoder.datasets.wavernn_dataset import WaveRNNDataset +from TTS.utils.tensorboard_logger import TensorboardLogger +from TTS.vocoder.datasets.preprocess import load_wav_data, load_wav_feat_data +from TTS.vocoder.utils.distribution import discretized_mix_logistic_loss, gaussian_loss +from TTS.vocoder.utils.generic_utils import setup_wavernn +from TTS.utils.training import setup_torch_training_env +from TTS.utils.console_logger import ConsoleLogger +from TTS.utils.generic_utils import ( + KeepAverage, + count_parameters, + create_experiment_folder, + get_git_branch, + remove_experiment_folder, + set_init_dict, +) +from TTS.vocoder.utils.io import save_best_model, save_checkpoint + + +use_cuda, num_gpus = setup_torch_training_env(True, True) + + +def setup_loader(ap, is_val=False, verbose=False): + if is_val and not CONFIG.run_eval: + loader = None + else: + dataset = WaveRNNDataset( + ap=ap, + items=eval_data if is_val else train_data, + seq_len=CONFIG.seq_len, + hop_len=ap.hop_length, + pad=CONFIG.padding, + mode=CONFIG.mode, + is_training=not is_val, + verbose=verbose, + ) + # sampler = DistributedSampler(dataset) if num_gpus > 1 else None + loader = DataLoader( + dataset, + shuffle=True, + collate_fn=dataset.collate, + batch_size=CONFIG.batch_size, + num_workers=CONFIG.num_val_loader_workers + if is_val + else CONFIG.num_loader_workers, + pin_memory=True, + ) + return loader + + +def format_data(data): + # setup input data + x = data[0] + m = data[1] + y = data[2] + + # dispatch data to GPU + if use_cuda: + x = x.cuda(non_blocking=True) + m = m.cuda(non_blocking=True) + y = y.cuda(non_blocking=True) + + return x, m, y + + +def train(model, optimizer, criterion, scheduler, ap, global_step, epoch): + # create train loader + data_loader = setup_loader(ap, is_val=False, verbose=(epoch == 0)) + model.train() + epoch_time = 0 + keep_avg = KeepAverage() + if use_cuda: + batch_n_iter = int(len(data_loader.dataset) / (CONFIG.batch_size * num_gpus)) + else: + batch_n_iter = int(len(data_loader.dataset) / CONFIG.batch_size) + end_time = time.time() + c_logger.print_train_start() + # train loop + print(" > Training", flush=True) + for num_iter, data in enumerate(data_loader): + start_time = time.time() + x, m, y = format_data(data) + loader_time = time.time() - end_time + global_step += 1 + + ################## + # MODEL TRAINING # + ################## + y_hat = model(x, m) + y_hat_vis = y_hat # for visualization + + # y_hat = y_hat.transpose(1, 2) + if isinstance(model.mode, int): + y_hat = y_hat.transpose(1, 2).unsqueeze(-1) + else: + y = y.float() + y = y.unsqueeze(-1) + # m_scaled, _ = model.upsample(m) + + # compute losses + loss = criterion(y_hat, y) + if loss.item() is None: + raise RuntimeError(" [!] None loss. 
Exiting ...") + optimizer.zero_grad() + loss.backward() + if CONFIG.grad_clip > 0: + torch.nn.utils.clip_grad_norm_(model.parameters(), CONFIG.grad_clip) + + optimizer.step() + if scheduler is not None: + scheduler.step() + + # get the current learning rate + cur_lr = list(optimizer.param_groups)[0]["lr"] + + step_time = time.time() - start_time + epoch_time += step_time + + update_train_values = dict() + loss_dict = dict() + loss_dict["model_loss"] = loss.item() + for key, value in loss_dict.items(): + update_train_values["avg_" + key] = value + update_train_values["avg_loader_time"] = loader_time + update_train_values["avg_step_time"] = step_time + keep_avg.update_values(update_train_values) + + # print training stats + if global_step % CONFIG.print_step == 0: + log_dict = { + "step_time": [step_time, 2], + "loader_time": [loader_time, 4], + "current_lr": cur_lr, + } + c_logger.print_train_step( + batch_n_iter, + num_iter, + global_step, + log_dict, + loss_dict, + keep_avg.avg_values, + ) + + # plot step stats + if global_step % 10 == 0: + iter_stats = {"lr": cur_lr, "step_time": step_time} + iter_stats.update(loss_dict) + tb_logger.tb_train_iter_stats(global_step, iter_stats) + + # save checkpoint + if global_step % CONFIG.save_step == 0: + if CONFIG.checkpoint: + # save model + save_checkpoint( + model, + optimizer, + scheduler, + None, + None, + None, + global_step, + epoch, + OUT_PATH, + model_losses=loss_dict, + ) + + # synthesize a full voice + wav_path = train_data[random.randrange(0, len(train_data))][0] + wav = ap.load_wav(wav_path) + ground_mel = ap.melspectrogram(wav) + sample_wav = model.generate( + ground_mel, + CONFIG.batched, + CONFIG.target_samples, + CONFIG.overlap_samples, + ) + predict_mel = ap.melspectrogram(sample_wav) + + # Sample audio + tb_logger.tb_train_audios( + global_step, {"eval/audio": sample_wav}, CONFIG.audio["sample_rate"] + ) + # compute spectrograms + figures = { + "prediction": plot_spectrogram(predict_mel.T, ap, output_fig=False), + "ground_truth": plot_spectrogram(ground_mel.T, ap, output_fig=False), + } + tb_logger.tb_train_figures(global_step, figures) + end_time = time.time() + + # print epoch stats + c_logger.print_train_epoch_end(global_step, epoch, epoch_time, keep_avg) + + # Plot Training Epoch Stats + epoch_stats = {"epoch_time": epoch_time} + epoch_stats.update(keep_avg.avg_values) + tb_logger.tb_train_epoch_stats(global_step, epoch_stats) + # TODO: plot model stats + # if c.tb_model_param_stats: + # tb_logger.tb_model_weights(model, global_step) + return keep_avg.avg_values, global_step + + +@torch.no_grad() +def evaluate(model, criterion, ap, global_step, epoch): + # create train loader + data_loader = setup_loader(ap, is_val=True, verbose=(epoch == 0)) + model.eval() + epoch_time = 0 + keep_avg = KeepAverage() + end_time = time.time() + c_logger.print_eval_start() + with torch.no_grad(): + for num_iter, data in enumerate(data_loader): + start_time = time.time() + # format data + x, m, y = format_data(data) + loader_time = time.time() - end_time + global_step += 1 + + y_hat = model(x, m) + if isinstance(model.mode, int): + y_hat = y_hat.transpose(1, 2).unsqueeze(-1) + else: + y = y.float() + y = y.unsqueeze(-1) + loss = criterion(y_hat, y) + # Compute avg loss + # if num_gpus > 1: + # loss = reduce_tensor(loss.data, num_gpus) + loss_dict = dict() + loss_dict["model_loss"] = loss.item() + + step_time = time.time() - start_time + epoch_time += step_time + + # update avg stats + update_eval_values = dict() + for key, value in 
loss_dict.items(): + update_eval_values["avg_" + key] = value + update_eval_values["avg_loader_time"] = loader_time + update_eval_values["avg_step_time"] = step_time + keep_avg.update_values(update_eval_values) + + # print eval stats + if CONFIG.print_eval: + c_logger.print_eval_step(num_iter, loss_dict, keep_avg.avg_values) + + if epoch > CONFIG.test_delay_epochs: + # synthesize a full voice + wav_path = eval_data[random.randrange(0, len(eval_data))][0] + wav = ap.load_wav(wav_path) + ground_mel = ap.melspectrogram(wav) + sample_wav = model.generate( + ground_mel, + CONFIG.batched, + CONFIG.target_samples, + CONFIG.overlap_samples, + ) + predict_mel = ap.melspectrogram(sample_wav) + + # Sample audio + tb_logger.tb_eval_audios( + global_step, {"eval/audio": sample_wav}, CONFIG.audio["sample_rate"] + ) + # compute spectrograms + figures = { + "prediction": plot_spectrogram(predict_mel.T, ap, output_fig=False), + "ground_truth": plot_spectrogram(ground_mel.T, ap, output_fig=False), + } + tb_logger.tb_eval_figures(global_step, figures) + + tb_logger.tb_eval_stats(global_step, keep_avg.avg_values) + return keep_avg.avg_values + + +# FIXME: move args definition/parsing inside of main? +def main(args): # pylint: disable=redefined-outer-name + # pylint: disable=global-variable-undefined + global train_data, eval_data + + print(f" > Loading wavs from: {CONFIG.data_path}") + if CONFIG.feature_path is not None: + print(f" > Loading features from: {CONFIG.feature_path}") + eval_data, train_data = load_wav_feat_data( + CONFIG.data_path, CONFIG.feature_path, CONFIG.eval_split_size + ) + eval_data, train_data = eval_data, train_data + else: + eval_data, train_data = load_wav_data(CONFIG.data_path, CONFIG.eval_split_size) + + # setup audio processor + ap = AudioProcessor(**CONFIG.audio) + + # setup model + model_wavernn = setup_wavernn(CONFIG) + + # define train functions + if CONFIG.mode == "mold": + criterion = discretized_mix_logistic_loss + elif CONFIG.mode == "gauss": + criterion = gaussian_loss + elif isinstance(CONFIG.mode, int): + criterion = torch.nn.CrossEntropyLoss() + + if use_cuda: + model_wavernn.cuda() + if isinstance(CONFIG.mode, int): + criterion.cuda() + + optimizer = optim.Adam(model_wavernn.parameters(), lr=CONFIG.lr, weight_decay=0) + scheduler = None + if "lr_scheduler" in CONFIG: + scheduler = getattr(torch.optim.lr_scheduler, CONFIG.lr_scheduler) + scheduler = scheduler(optimizer, **CONFIG.lr_scheduler_params) + # slow start for the first 5 epochs + # lr_lambda = lambda epoch: min(epoch / CONFIG.warmup_steps, 1) + # scheduler = optim.lr_scheduler.LambdaLR(optimizer, lr_lambda) + + # restore any checkpoint + if args.restore_path: + checkpoint = torch.load(args.restore_path, map_location="cpu") + try: + print(" > Restoring Model...") + model_wavernn.load_state_dict(checkpoint["model"]) + print(" > Restoring Optimizer...") + optimizer.load_state_dict(checkpoint["optimizer"]) + if "scheduler" in checkpoint: + print(" > Restoring Generator LR Scheduler...") + scheduler.load_state_dict(checkpoint["scheduler"]) + scheduler.optimizer = optimizer + # TODO: fix resetting restored optimizer lr + # optimizer.load_state_dict(checkpoint["optimizer"]) + except RuntimeError: + # retore only matching layers. 
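
As in the GAN trainer earlier in the series, a strict load that fails falls back to a partial restore which keeps only the checkpoint tensors whose names and shapes still match the freshly built model. A sketch of that idea with a hypothetical partial_restore helper; set_init_dict's exact behaviour may differ (it also takes the config into account):

    import torch

    def partial_restore(model: torch.nn.Module, checkpoint_state: dict) -> None:
        model_state = model.state_dict()
        matched = {
            name: tensor
            for name, tensor in checkpoint_state.items()
            if name in model_state and model_state[name].shape == tensor.shape
        }
        model_state.update(matched)      # unmatched entries keep their fresh initialisation
        model.load_state_dict(model_state)
        print(" > Re-used %d of %d tensors from the checkpoint" % (len(matched), len(model_state)))

    # e.g. partial_restore(model_wavernn, torch.load("best_model.pth.tar", map_location="cpu")["model"])
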
+ print(" > Partial model initialization...") + model_dict = model_wavernn.state_dict() + model_dict = set_init_dict(model_dict, checkpoint["model"], CONFIG) + model_wavernn.load_state_dict(model_dict) + + print(" > Model restored from step %d" % checkpoint["step"], flush=True) + args.restore_step = checkpoint["step"] + else: + args.restore_step = 0 + + # DISTRIBUTED + # if num_gpus > 1: + # model = apply_gradient_allreduce(model) + + num_parameters = count_parameters(model_wavernn) + print(" > Model has {} parameters".format(num_parameters), flush=True) + + if "best_loss" not in locals(): + best_loss = float("inf") + + global_step = args.restore_step + for epoch in range(0, CONFIG.epochs): + c_logger.print_epoch_start(epoch, CONFIG.epochs) + _, global_step = train( + model_wavernn, optimizer, criterion, scheduler, ap, global_step, epoch + ) + eval_avg_loss_dict = evaluate(model_wavernn, criterion, ap, global_step, epoch) + c_logger.print_epoch_end(epoch, eval_avg_loss_dict) + target_loss = eval_avg_loss_dict["avg_model_loss"] + best_loss = save_best_model( + target_loss, + best_loss, + model_wavernn, + optimizer, + scheduler, + None, + None, + None, + global_step, + epoch, + OUT_PATH, + model_losses=eval_avg_loss_dict, + ) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--continue_path", + type=str, + help='Training output folder to continue training. Use to continue a training. If it is used, "config_path" is ignored.', + default="", + required="--config_path" not in sys.argv, + ) + parser.add_argument( + "--restore_path", + type=str, + help="Model file to be restored. Use to finetune a model.", + default="", + ) + parser.add_argument( + "--config_path", + type=str, + help="Path to config file for training.", + required="--continue_path" not in sys.argv, + ) + parser.add_argument( + "--debug", + type=bool, + default=False, + help="Do not verify commit integrity to run training.", + ) + + # DISTRUBUTED + parser.add_argument( + "--rank", + type=int, + default=0, + help="DISTRIBUTED: process rank for distributed training.", + ) + parser.add_argument( + "--group_id", type=str, default="", help="DISTRIBUTED: process group id." 
+ ) + args = parser.parse_args() + + if args.continue_path != "": + args.output_path = args.continue_path + args.config_path = os.path.join(args.continue_path, "config.json") + list_of_files = glob.glob( + args.continue_path + "/*.pth.tar" + ) # * means all if need specific format then *.csv + latest_model_file = max(list_of_files, key=os.path.getctime) + args.restore_path = latest_model_file + print(f" > Training continues for {args.restore_path}") + + # setup output paths and read configs + CONFIG = load_config(args.config_path) + # check_config(c) + _ = os.path.dirname(os.path.realpath(__file__)) + + OUT_PATH = args.continue_path + if args.continue_path == "": + OUT_PATH = create_experiment_folder( + CONFIG.output_path, CONFIG.run_name, args.debug + ) + + AUDIO_PATH = os.path.join(OUT_PATH, "test_audios") + + c_logger = ConsoleLogger() + + if args.rank == 0: + os.makedirs(AUDIO_PATH, exist_ok=True) + new_fields = {} + if args.restore_path: + new_fields["restore_path"] = args.restore_path + new_fields["github_branch"] = get_git_branch() + copy_config_file( + args.config_path, os.path.join(OUT_PATH, "config.json"), new_fields + ) + os.chmod(AUDIO_PATH, 0o775) + os.chmod(OUT_PATH, 0o775) + + LOG_DIR = OUT_PATH + tb_logger = TensorboardLogger(LOG_DIR, model_name="VOCODER") + + # write model desc to tensorboard + tb_logger.tb_add_text("model-description", CONFIG["run_description"], 0) + + try: + main(args) + except KeyboardInterrupt: + remove_experiment_folder(OUT_PATH) + try: + sys.exit(0) + except SystemExit: + os._exit(0) # pylint: disable=protected-access + except Exception: # pylint: disable=broad-except + remove_experiment_folder(OUT_PATH) + traceback.print_exc() + sys.exit(1) diff --git a/TTS/vocoder/configs/wavernn_config.json b/TTS/vocoder/configs/wavernn_config.json new file mode 100644 index 00000000..f7e5d99f --- /dev/null +++ b/TTS/vocoder/configs/wavernn_config.json @@ -0,0 +1,95 @@ +{ + "model": "wavernn", + "run_name": "wavernn_test", + "run_description": "wavernn_test training", + + // AUDIO PARAMETERS + "audio":{ + "fft_size": 1024, // number of stft frequency levels. Size of the linear spectogram frame. + "win_length": 1024, // stft window length in ms. + "hop_length": 256, // stft window hop-lengh in ms. + "frame_length_ms": null, // stft window length in ms.If null, 'win_length' is used. + "frame_shift_ms": null, // stft window hop-lengh in ms. If null, 'hop_length' is used. + + // Audio processing parameters + "sample_rate": 22050, // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled. + "preemphasis": 0.98, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis. + "ref_level_db": 20, // reference level db, theoretically 20db is the sound of air. + + // Silence trimming + "do_trim_silence": false,// enable trimming of slience of audio as you load it. LJspeech (false), TWEB (false), Nancy (true) + "trim_db": 60, // threshold for timming silence. Set this according to your dataset. + + // MelSpectrogram parameters + "num_mels": 80, // size of the mel spec frame. + "mel_fmin": 40.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!! + "mel_fmax": 8000.0, // maximum freq level for mel-spec. Tune for dataset!! + "spec_gain": 20.0, // scaler value appplied after log transform of spectrogram. + + // Normalization parameters + "signal_norm": true, // normalize spec values. 
Mean-Var normalization if 'stats_path' is defined otherwise range normalization defined by the other params. + "min_level_db": -100, // lower bound for normalization + "symmetric_norm": true, // move normalization to range [-1, 1] + "max_norm": 4.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm] + "clip_norm": true, // clip normalized values into the range. + "stats_path": null // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based notmalization is used and other normalization params are ignored + }, + + // Generating / Synthesizing + "batched": true, + "target_samples": 11000, // target number of samples to be generated in each batch entry + "overlap_samples": 550, // number of samples for crossfading between batches + + // DISTRIBUTED TRAINING + // "distributed":{ + // "backend": "nccl", + // "url": "tcp:\/\/localhost:54321" + // }, + + // MODEL PARAMETERS + "use_aux_net": true, + "use_upsample_net": true, + "upsample_factors": [4, 8, 8], // this needs to correctly factorise hop_length + "seq_len": 1280, // has to be devideable by hop_length + "mode": "mold", // mold [string], gauss [string], bits [int] + "mulaw": false, // apply mulaw if mode is bits + "padding": 2, // pad the input for resnet to see wider input length + + // DATASET + "data_path": "/media/alexander/LinuxFS/SpeechData/GothicSpeech/NPC_Speech/", // path containing training wav files + "feature_path": "/media/alexander/LinuxFS/SpeechData/GothicSpeech/NPC_Speech_Computed/mel/", // path containing extracted features .npy (mels / quant) + + // TRAINING + "batch_size": 32, // Batch size for training. Lower values than 32 might cause hard to learn attention. + "epochs": 10000, // total number of epochs to train. + "warmup_steps": 10, + + // VALIDATION + "run_eval": true, + "test_delay_epochs": 10, // early testing only wastes computation time. + + // OPTIMIZER + "grad_clip": 4, // apply gradient clipping if > 0 + "lr_scheduler": "MultiStepLR", // one of the schedulers from https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate + "lr_scheduler_params": { + "gamma": 0.5, + "milestones": [200000, 400000, 600000] + }, + "lr": 1e-4, // initial learning rate + + // TENSORBOARD and LOGGING + "print_step": 25, // Number of steps to log traning on console. + "print_eval": false, // If True, it prints loss values for each step in eval run. + "save_step": 25000, // Number of training steps expected to plot training stats on TB and save model checkpoints. + "checkpoint": true, // If true, it saves checkpoints per "save_step" + "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging. + + // DATA LOADING + "num_loader_workers": 4, // number of training data loader processes. Don't set it too big. 4-8 are good values. + "num_val_loader_workers": 4, // number of evaluation data loader processes. 
+ "eval_split_size": 50, // number of samples for testing + + // PATHS + "output_path": "/media/alexander/LinuxFS/Projects/wavernn/Trainings/" +} + diff --git a/TTS/vocoder/datasets/preprocess.py b/TTS/vocoder/datasets/preprocess.py index be60c13a..a5365686 100644 --- a/TTS/vocoder/datasets/preprocess.py +++ b/TTS/vocoder/datasets/preprocess.py @@ -23,8 +23,12 @@ def load_wav_data(data_path, eval_split_size): def load_wav_feat_data(data_path, feat_path, eval_split_size): - wav_paths = sorted(find_wav_files(data_path)) - feat_paths = sorted(find_feat_files(feat_path)) + wav_paths = find_wav_files(data_path) + feat_paths = find_feat_files(feat_path) + + wav_paths.sort(key=lambda x: Path(x).stem) + feat_paths.sort(key=lambda x: Path(x).stem) + assert len(wav_paths) == len(feat_paths) for wav, feat in zip(wav_paths, feat_paths): wav_name = Path(wav).stem diff --git a/TTS/vocoder/utils/generic_utils.py b/TTS/vocoder/utils/generic_utils.py index 89dc68fb..365d0e11 100644 --- a/TTS/vocoder/utils/generic_utils.py +++ b/TTS/vocoder/utils/generic_utils.py @@ -41,6 +41,26 @@ def to_camel(text): text = text.capitalize() return re.sub(r'(?!^)_([a-zA-Z])', lambda m: m.group(1).upper(), text) +def setup_wavernn(c): + print(" > Model: {}".format(c.model)) + MyModel = importlib.import_module('TTS.vocoder.models.wavernn') + MyModel = getattr(MyModel, "WaveRNN") + model = MyModel( + rnn_dims=512, + fc_dims=512, + mode=c.mode, + mulaw=c.mulaw, + pad=c.padding, + use_aux_net=c.use_aux_net, + use_upsample_net=c.use_upsample_net, + upsample_factors=c.upsample_factors, + feat_dims=80, + compute_dims=128, + res_out_dims=128, + res_blocks=10, + hop_length=c.audio['hop_length'], + sample_rate=c.audio['sample_rate']) + return model def setup_generator(c): print(" > Generator Model: {}".format(c.generator_model)) From 9c3c7ce2f8452ee835b2b13162c008278406fe33 Mon Sep 17 00:00:00 2001 From: Alex K Date: Thu, 15 Oct 2020 19:15:53 +0200 Subject: [PATCH 24/98] wavernn stuff... --- TTS/vocoder/datasets/wavernn_dataset.py | 96 +++++ TTS/vocoder/models/wavernn.py | 485 ++++++++++++++++++++++++ TTS/vocoder/utils/distribution.py | 155 ++++++++ 3 files changed, 736 insertions(+) create mode 100644 TTS/vocoder/datasets/wavernn_dataset.py create mode 100644 TTS/vocoder/models/wavernn.py create mode 100644 TTS/vocoder/utils/distribution.py diff --git a/TTS/vocoder/datasets/wavernn_dataset.py b/TTS/vocoder/datasets/wavernn_dataset.py new file mode 100644 index 00000000..b5a7fdad --- /dev/null +++ b/TTS/vocoder/datasets/wavernn_dataset.py @@ -0,0 +1,96 @@ +import os +import glob +import torch +import numpy as np +from torch.utils.data import Dataset + + +class WaveRNNDataset(Dataset): + """ + WaveRNN Dataset searchs for all the wav files under root path + and converts them to acoustic features on the fly. 
+ """ + + def __init__( + self, + ap, + items, + seq_len, + hop_len, + pad, + mode, + is_training=True, + return_segments=True, + use_cache=False, + verbose=False, + ): + + self.ap = ap + self.item_list = items + self.seq_len = seq_len + self.hop_len = hop_len + self.pad = pad + self.mode = mode + self.is_training = is_training + self.return_segments = return_segments + self.use_cache = use_cache + self.verbose = verbose + + # wav_files = [f"{self.path}wavs/{file}.wav" for file in self.metadata] + # with Pool(4) as pool: + # self.wav_cache = pool.map(self.ap.load_wav, wav_files) + + def __len__(self): + return len(self.item_list) + + def __getitem__(self, index): + item = self.load_item(index) + return item + + def load_item(self, index): + wavpath, feat_path = self.item_list[index] + m = np.load(feat_path.replace("/quant/", "/mel/")) + # x = self.wav_cache[index] + if 5 > m.shape[-1]: + print(" [!] Instance is too short! : {}".format(wavpath)) + self.item_list[index] = self.item_list[index + 1] + feat_path = self.item_list[index] + m = np.load(feat_path.replace("/quant/", "/mel/")) + if self.mode in ["gauss", "mold"]: + x = self.ap.load_wav(wavpath) + elif isinstance(self.mode, int): + x = np.load(feat_path.replace("/mel/", "/quant/")) + else: + raise RuntimeError("Unknown dataset mode - ", self.mode) + return m, x + + def collate(self, batch): + mel_win = self.seq_len // self.hop_len + 2 * self.pad + max_offsets = [x[0].shape[-1] - (mel_win + 2 * self.pad) for x in batch] + mel_offsets = [np.random.randint(0, offset) for offset in max_offsets] + sig_offsets = [(offset + self.pad) * self.hop_len for offset in mel_offsets] + + mels = [ + x[0][:, mel_offsets[i] : mel_offsets[i] + mel_win] + for i, x in enumerate(batch) + ] + + coarse = [ + x[1][sig_offsets[i] : sig_offsets[i] + self.seq_len + 1] + for i, x in enumerate(batch) + ] + + mels = np.stack(mels).astype(np.float32) + if self.mode in ["gauss", "mold"]: + coarse = np.stack(coarse).astype(np.float32) + coarse = torch.FloatTensor(coarse) + x_input = coarse[:, : self.seq_len] + elif isinstance(self.mode, int): + coarse = np.stack(coarse).astype(np.int64) + coarse = torch.LongTensor(coarse) + x_input = ( + 2 * coarse[:, : self.seq_len].float() / (2 ** self.mode - 1.0) - 1.0 + ) + y_coarse = coarse[:, 1:] + mels = torch.FloatTensor(mels) + return x_input, mels, y_coarse diff --git a/TTS/vocoder/models/wavernn.py b/TTS/vocoder/models/wavernn.py new file mode 100644 index 00000000..e1c4365f --- /dev/null +++ b/TTS/vocoder/models/wavernn.py @@ -0,0 +1,485 @@ +import sys +import torch +import torch.nn as nn +import numpy as np +import torch.nn.functional as F +import time + +# fix this +from TTS.utils.audio import AudioProcessor as ap +from TTS.vocoder.utils.distribution import ( + sample_from_gaussian, + sample_from_discretized_mix_logistic, +) + + +def stream(string, variables): + sys.stdout.write(f"\r{string}" % variables) + + +class ResBlock(nn.Module): + def __init__(self, dims): + super().__init__() + self.conv1 = nn.Conv1d(dims, dims, kernel_size=1, bias=False) + self.conv2 = nn.Conv1d(dims, dims, kernel_size=1, bias=False) + self.batch_norm1 = nn.BatchNorm1d(dims) + self.batch_norm2 = nn.BatchNorm1d(dims) + + def forward(self, x): + residual = x + x = self.conv1(x) + x = self.batch_norm1(x) + x = F.relu(x) + x = self.conv2(x) + x = self.batch_norm2(x) + return x + residual + + +class MelResNet(nn.Module): + def __init__(self, res_blocks, in_dims, compute_dims, res_out_dims, pad): + super().__init__() + k_size = pad * 2 + 1 + 
self.conv_in = nn.Conv1d(in_dims, compute_dims, kernel_size=k_size, bias=False) + self.batch_norm = nn.BatchNorm1d(compute_dims) + self.layers = nn.ModuleList() + for i in range(res_blocks): + self.layers.append(ResBlock(compute_dims)) + self.conv_out = nn.Conv1d(compute_dims, res_out_dims, kernel_size=1) + + def forward(self, x): + x = self.conv_in(x) + x = self.batch_norm(x) + x = F.relu(x) + for f in self.layers: + x = f(x) + x = self.conv_out(x) + return x + + +class Stretch2d(nn.Module): + def __init__(self, x_scale, y_scale): + super().__init__() + self.x_scale = x_scale + self.y_scale = y_scale + + def forward(self, x): + b, c, h, w = x.size() + x = x.unsqueeze(-1).unsqueeze(3) + x = x.repeat(1, 1, 1, self.y_scale, 1, self.x_scale) + return x.view(b, c, h * self.y_scale, w * self.x_scale) + + +class UpsampleNetwork(nn.Module): + def __init__( + self, + feat_dims, + upsample_scales, + compute_dims, + res_blocks, + res_out_dims, + pad, + use_aux_net, + ): + super().__init__() + self.total_scale = np.cumproduct(upsample_scales)[-1] + self.indent = pad * self.total_scale + self.use_aux_net = use_aux_net + if use_aux_net: + self.resnet = MelResNet( + res_blocks, feat_dims, compute_dims, res_out_dims, pad + ) + self.resnet_stretch = Stretch2d(self.total_scale, 1) + self.up_layers = nn.ModuleList() + for scale in upsample_scales: + k_size = (1, scale * 2 + 1) + padding = (0, scale) + stretch = Stretch2d(scale, 1) + conv = nn.Conv2d(1, 1, kernel_size=k_size, padding=padding, bias=False) + conv.weight.data.fill_(1.0 / k_size[1]) + self.up_layers.append(stretch) + self.up_layers.append(conv) + + def forward(self, m): + if self.use_aux_net: + aux = self.resnet(m).unsqueeze(1) + aux = self.resnet_stretch(aux) + aux = aux.squeeze(1) + aux = aux.transpose(1, 2) + else: + aux = None + m = m.unsqueeze(1) + for f in self.up_layers: + m = f(m) + m = m.squeeze(1)[:, :, self.indent : -self.indent] + return m.transpose(1, 2), aux + + +class Upsample(nn.Module): + def __init__( + self, scale, pad, res_blocks, feat_dims, compute_dims, res_out_dims, use_aux_net + ): + super().__init__() + self.scale = scale + self.pad = pad + self.indent = pad * scale + self.use_aux_net = use_aux_net + self.resnet = MelResNet(res_blocks, feat_dims, compute_dims, res_out_dims, pad) + + def forward(self, m): + if self.use_aux_net: + aux = self.resnet(m) + aux = torch.nn.functional.interpolate( + aux, scale_factor=self.scale, mode="linear", align_corners=True + ) + aux = aux.transpose(1, 2) + else: + aux = None + m = torch.nn.functional.interpolate( + m, scale_factor=self.scale, mode="linear", align_corners=True + ) + m = m[:, :, self.indent : -self.indent] + m = m * 0.045 # empirically found + + return m.transpose(1, 2), aux + + +class WaveRNN(nn.Module): + def __init__( + self, + rnn_dims, + fc_dims, + mode, + mulaw, + pad, + use_aux_net, + use_upsample_net, + upsample_factors, + feat_dims, + compute_dims, + res_out_dims, + res_blocks, + hop_length, + sample_rate, + ): + super().__init__() + self.mode = mode + self.mulaw = mulaw + self.pad = pad + self.use_upsample_net = use_upsample_net + self.use_aux_net = use_aux_net + if isinstance(self.mode, int): + self.n_classes = 2 ** self.mode + elif self.mode == "mold": + self.n_classes = 3 * 10 + elif self.mode == "gauss": + self.n_classes = 2 + else: + raise RuntimeError(" > Unknown training mode") + + self.rnn_dims = rnn_dims + self.aux_dims = res_out_dims // 4 + self.hop_length = hop_length + self.sample_rate = sample_rate + + if self.use_upsample_net: + assert ( + 
np.cumproduct(upsample_factors)[-1] == self.hop_length + ), " [!] upsample scales needs to be equal to hop_length" + self.upsample = UpsampleNetwork( + feat_dims, + upsample_factors, + compute_dims, + res_blocks, + res_out_dims, + pad, + use_aux_net, + ) + else: + self.upsample = Upsample( + hop_length, + pad, + res_blocks, + feat_dims, + compute_dims, + res_out_dims, + use_aux_net, + ) + if self.use_aux_net: + self.I = nn.Linear(feat_dims + self.aux_dims + 1, rnn_dims) + self.rnn1 = nn.GRU(rnn_dims, rnn_dims, batch_first=True) + self.rnn2 = nn.GRU(rnn_dims + self.aux_dims, rnn_dims, batch_first=True) + self.fc1 = nn.Linear(rnn_dims + self.aux_dims, fc_dims) + self.fc2 = nn.Linear(fc_dims + self.aux_dims, fc_dims) + self.fc3 = nn.Linear(fc_dims, self.n_classes) + else: + self.I = nn.Linear(feat_dims + 1, rnn_dims) + self.rnn1 = nn.GRU(rnn_dims, rnn_dims, batch_first=True) + self.rnn2 = nn.GRU(rnn_dims, rnn_dims, batch_first=True) + self.fc1 = nn.Linear(rnn_dims, fc_dims) + self.fc2 = nn.Linear(fc_dims, fc_dims) + self.fc3 = nn.Linear(fc_dims, self.n_classes) + + def forward(self, x, mels): + bsize = x.size(0) + h1 = torch.zeros(1, bsize, self.rnn_dims).cuda() + h2 = torch.zeros(1, bsize, self.rnn_dims).cuda() + mels, aux = self.upsample(mels) + + if self.use_aux_net: + aux_idx = [self.aux_dims * i for i in range(5)] + a1 = aux[:, :, aux_idx[0] : aux_idx[1]] + a2 = aux[:, :, aux_idx[1] : aux_idx[2]] + a3 = aux[:, :, aux_idx[2] : aux_idx[3]] + a4 = aux[:, :, aux_idx[3] : aux_idx[4]] + + x = ( + torch.cat([x.unsqueeze(-1), mels, a1], dim=2) + if self.use_aux_net + else torch.cat([x.unsqueeze(-1), mels], dim=2) + ) + x = self.I(x) + res = x + self.rnn1.flatten_parameters() + x, _ = self.rnn1(x, h1) + + x = x + res + res = x + x = torch.cat([x, a2], dim=2) if self.use_aux_net else x + self.rnn2.flatten_parameters() + x, _ = self.rnn2(x, h2) + + x = x + res + x = torch.cat([x, a3], dim=2) if self.use_aux_net else x + x = F.relu(self.fc1(x)) + + x = torch.cat([x, a4], dim=2) if self.use_aux_net else x + x = F.relu(self.fc2(x)) + return self.fc3(x) + + def generate(self, mels, batched, target, overlap): + + self.eval() + output = [] + start = time.time() + rnn1 = self.get_gru_cell(self.rnn1) + rnn2 = self.get_gru_cell(self.rnn2) + + with torch.no_grad(): + + mels = torch.FloatTensor(mels).cuda().unsqueeze(0) + wave_len = (mels.size(-1) - 1) * self.hop_length + mels = self.pad_tensor(mels.transpose(1, 2), pad=self.pad, side="both") + mels, aux = self.upsample(mels.transpose(1, 2)) + + if batched: + mels = self.fold_with_overlap(mels, target, overlap) + if aux is not None: + aux = self.fold_with_overlap(aux, target, overlap) + + b_size, seq_len, _ = mels.size() + + h1 = torch.zeros(b_size, self.rnn_dims).cuda() + h2 = torch.zeros(b_size, self.rnn_dims).cuda() + x = torch.zeros(b_size, 1).cuda() + + if self.use_aux_net: + d = self.aux_dims + aux_split = [aux[:, :, d * i : d * (i + 1)] for i in range(4)] + + for i in range(seq_len): + + m_t = mels[:, i, :] + + if self.use_aux_net: + a1_t, a2_t, a3_t, a4_t = (a[:, i, :] for a in aux_split) + + x = ( + torch.cat([x, m_t, a1_t], dim=1) + if self.use_aux_net + else torch.cat([x, m_t], dim=1) + ) + x = self.I(x) + h1 = rnn1(x, h1) + + x = x + h1 + inp = torch.cat([x, a2_t], dim=1) if self.use_aux_net else x + h2 = rnn2(inp, h2) + + x = x + h2 + x = torch.cat([x, a3_t], dim=1) if self.use_aux_net else x + x = F.relu(self.fc1(x)) + + x = torch.cat([x, a4_t], dim=1) if self.use_aux_net else x + x = F.relu(self.fc2(x)) + + logits = self.fc3(x) + + if self.mode 
== "mold": + sample = sample_from_discretized_mix_logistic( + logits.unsqueeze(0).transpose(1, 2) + ) + output.append(sample.view(-1)) + x = sample.transpose(0, 1).cuda() + elif self.mode == "gauss": + sample = sample_from_gaussian(logits.unsqueeze(0).transpose(1, 2)) + output.append(sample.view(-1)) + x = sample.transpose(0, 1).cuda() + elif isinstance(self.mode, int): + posterior = F.softmax(logits, dim=1) + distrib = torch.distributions.Categorical(posterior) + + sample = 2 * distrib.sample().float() / (self.n_classes - 1.0) - 1.0 + output.append(sample) + x = sample.unsqueeze(-1) + else: + raise RuntimeError("Unknown model mode value - ", self.mode) + + if i % 100 == 0: + self.gen_display(i, seq_len, b_size, start) + + output = torch.stack(output).transpose(0, 1) + output = output.cpu().numpy() + output = output.astype(np.float64) + + if batched: + output = self.xfade_and_unfold(output, target, overlap) + else: + output = output[0] + + if self.mulaw and isinstance(self.mode, int): + output = ap.mulaw_decode(output, self.mode) + + # Fade-out at the end to avoid signal cutting out suddenly + fade_out = np.linspace(1, 0, 20 * self.hop_length) + output = output[:wave_len] + output[-20 * self.hop_length :] *= fade_out + + self.train() + return output + + def gen_display(self, i, seq_len, b_size, start): + gen_rate = (i + 1) / (time.time() - start) * b_size / 1000 + realtime_ratio = gen_rate * 1000 / self.sample_rate + stream( + "%i/%i -- batch_size: %i -- gen_rate: %.1f kHz -- x_realtime: %.1f ", + (i * b_size, seq_len * b_size, b_size, gen_rate, realtime_ratio), + ) + + def get_gru_cell(self, gru): + gru_cell = nn.GRUCell(gru.input_size, gru.hidden_size) + gru_cell.weight_hh.data = gru.weight_hh_l0.data + gru_cell.weight_ih.data = gru.weight_ih_l0.data + gru_cell.bias_hh.data = gru.bias_hh_l0.data + gru_cell.bias_ih.data = gru.bias_ih_l0.data + return gru_cell + + def pad_tensor(self, x, pad, side="both"): + # NB - this is just a quick method i need right now + # i.e., it won't generalise to other shapes/dims + b, t, c = x.size() + total = t + 2 * pad if side == "both" else t + pad + padded = torch.zeros(b, total, c).cuda() + if side == "before" or side == "both": + padded[:, pad : pad + t, :] = x + elif side == "after": + padded[:, :t, :] = x + return padded + + def fold_with_overlap(self, x, target, overlap): + + """Fold the tensor with overlap for quick batched inference. + Overlap will be used for crossfading in xfade_and_unfold() + Args: + x (tensor) : Upsampled conditioning features. + shape=(1, timesteps, features) + target (int) : Target timesteps for each index of batch + overlap (int) : Timesteps for both xfade and rnn warmup + Return: + (tensor) : shape=(num_folds, target + 2 * overlap, features) + Details: + x = [[h1, h2, ... 
hn]] + Where each h is a vector of conditioning features + Eg: target=2, overlap=1 with x.size(1)=10 + folded = [[h1, h2, h3, h4], + [h4, h5, h6, h7], + [h7, h8, h9, h10]] + """ + + _, total_len, features = x.size() + + # Calculate variables needed + num_folds = (total_len - overlap) // (target + overlap) + extended_len = num_folds * (overlap + target) + overlap + remaining = total_len - extended_len + + # Pad if some time steps poking out + if remaining != 0: + num_folds += 1 + padding = target + 2 * overlap - remaining + x = self.pad_tensor(x, padding, side="after") + + folded = torch.zeros(num_folds, target + 2 * overlap, features).cuda() + + # Get the values for the folded tensor + for i in range(num_folds): + start = i * (target + overlap) + end = start + target + 2 * overlap + folded[i] = x[:, start:end, :] + + return folded + + def xfade_and_unfold(self, y, target, overlap): + + """Applies a crossfade and unfolds into a 1d array. + Args: + y (ndarry) : Batched sequences of audio samples + shape=(num_folds, target + 2 * overlap) + dtype=np.float64 + overlap (int) : Timesteps for both xfade and rnn warmup + Return: + (ndarry) : audio samples in a 1d array + shape=(total_len) + dtype=np.float64 + Details: + y = [[seq1], + [seq2], + [seq3]] + Apply a gain envelope at both ends of the sequences + y = [[seq1_in, seq1_target, seq1_out], + [seq2_in, seq2_target, seq2_out], + [seq3_in, seq3_target, seq3_out]] + Stagger and add up the groups of samples: + [seq1_in, seq1_target, (seq1_out + seq2_in), seq2_target, ...] + """ + + num_folds, length = y.shape + target = length - 2 * overlap + total_len = num_folds * (target + overlap) + overlap + + # Need some silence for the rnn warmup + silence_len = overlap // 2 + fade_len = overlap - silence_len + silence = np.zeros((silence_len), dtype=np.float64) + + # Equal power crossfade + t = np.linspace(-1, 1, fade_len, dtype=np.float64) + fade_in = np.sqrt(0.5 * (1 + t)) + fade_out = np.sqrt(0.5 * (1 - t)) + + # Concat the silence to the fades + fade_in = np.concatenate([silence, fade_in]) + fade_out = np.concatenate([fade_out, silence]) + + # Apply the gain to the overlap samples + y[:, :overlap] *= fade_in + y[:, -overlap:] *= fade_out + + unfolded = np.zeros((total_len), dtype=np.float64) + + # Loop to add up all the samples + for i in range(num_folds): + start = i * (target + overlap) + end = start + target + 2 * overlap + unfolded[start:end] += y[i] + + return unfolded diff --git a/TTS/vocoder/utils/distribution.py b/TTS/vocoder/utils/distribution.py new file mode 100644 index 00000000..bfcbdd3f --- /dev/null +++ b/TTS/vocoder/utils/distribution.py @@ -0,0 +1,155 @@ +import numpy as np +import math +import torch +from torch.distributions.normal import Normal +import torch.nn.functional as F + + +def gaussian_loss(y_hat, y, log_std_min=-7.0): + assert y_hat.dim() == 3 + assert y_hat.size(2) == 2 + mean = y_hat[:, :, :1] + log_std = torch.clamp(y_hat[:, :, 1:], min=log_std_min) + # TODO: replace with pytorch dist + log_probs = -0.5 * (- math.log(2.0 * math.pi) - 2. 
* log_std - torch.pow(y - mean, 2) * torch.exp((-2.0 * log_std))) + return log_probs.squeeze().mean() + + +def sample_from_gaussian(y_hat, log_std_min=-7.0, scale_factor=1.0): + assert y_hat.size(2) == 2 + mean = y_hat[:, :, :1] + log_std = torch.clamp(y_hat[:, :, 1:], min=log_std_min) + dist = Normal(mean, torch.exp(log_std), ) + sample = dist.sample() + sample = torch.clamp(torch.clamp(sample, min=-scale_factor), max=scale_factor) + del dist + return sample + + +def log_sum_exp(x): + """ numerically stable log_sum_exp implementation that prevents overflow """ + # TF ordering + axis = len(x.size()) - 1 + m, _ = torch.max(x, dim=axis) + m2, _ = torch.max(x, dim=axis, keepdim=True) + return m + torch.log(torch.sum(torch.exp(x - m2), dim=axis)) + + +# It is adapted from https://github.com/r9y9/wavenet_vocoder/blob/master/wavenet_vocoder/mixture.py +def discretized_mix_logistic_loss(y_hat, y, num_classes=65536, + log_scale_min=None, reduce=True): + if log_scale_min is None: + log_scale_min = float(np.log(1e-14)) + y_hat = y_hat.permute(0,2,1) + assert y_hat.dim() == 3 + assert y_hat.size(1) % 3 == 0 + nr_mix = y_hat.size(1) // 3 + + # (B x T x C) + y_hat = y_hat.transpose(1, 2) + + # unpack parameters. (B, T, num_mixtures) x 3 + logit_probs = y_hat[:, :, :nr_mix] + means = y_hat[:, :, nr_mix:2 * nr_mix] + log_scales = torch.clamp(y_hat[:, :, 2 * nr_mix:3 * nr_mix], min=log_scale_min) + + # B x T x 1 -> B x T x num_mixtures + y = y.expand_as(means) + + centered_y = y - means + inv_stdv = torch.exp(-log_scales) + plus_in = inv_stdv * (centered_y + 1. / (num_classes - 1)) + cdf_plus = torch.sigmoid(plus_in) + min_in = inv_stdv * (centered_y - 1. / (num_classes - 1)) + cdf_min = torch.sigmoid(min_in) + + # log probability for edge case of 0 (before scaling) + # equivalent: torch.log(F.sigmoid(plus_in)) + log_cdf_plus = plus_in - F.softplus(plus_in) + + # log probability for edge case of 255 (before scaling) + # equivalent: (1 - F.sigmoid(min_in)).log() + log_one_minus_cdf_min = -F.softplus(min_in) + + # probability for all other cases + cdf_delta = cdf_plus - cdf_min + + mid_in = inv_stdv * centered_y + # log probability in the center of the bin, to be used in extreme cases + # (not actually used in our code) + log_pdf_mid = mid_in - log_scales - 2. * F.softplus(mid_in) + + # tf equivalent + """ + log_probs = tf.where(x < -0.999, log_cdf_plus, + tf.where(x > 0.999, log_one_minus_cdf_min, + tf.where(cdf_delta > 1e-5, + tf.log(tf.maximum(cdf_delta, 1e-12)), + log_pdf_mid - np.log(127.5)))) + """ + # TODO: cdf_delta <= 1e-5 actually can happen. How can we choose the value + # for num_classes=65536 case? 1e-7? not sure.. + inner_inner_cond = (cdf_delta > 1e-5).float() + + inner_inner_out = inner_inner_cond * \ + torch.log(torch.clamp(cdf_delta, min=1e-12)) + \ + (1. - inner_inner_cond) * (log_pdf_mid - np.log((num_classes - 1) / 2)) + inner_cond = (y > 0.999).float() + inner_out = inner_cond * log_one_minus_cdf_min + (1. - inner_cond) * inner_inner_out + cond = (y < -0.999).float() + log_probs = cond * log_cdf_plus + (1. - cond) * inner_out + + log_probs = log_probs + F.log_softmax(logit_probs, -1) + + if reduce: + return -torch.mean(log_sum_exp(log_probs)) + else: + return -log_sum_exp(log_probs).unsqueeze(-1) + + +def sample_from_discretized_mix_logistic(y, log_scale_min=None): + """ + Sample from discretized mixture of logistic distributions + Args: + y (Tensor): B x C x T + log_scale_min (float): Log scale minimum value + Returns: + Tensor: sample in range of [-1, 1]. 
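+    Note:
+        C is expected to be 3 * num_mixtures, holding the mixture logits, means
+        and log scales stacked along the channel axis.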
+ """ + if log_scale_min is None: + log_scale_min = float(np.log(1e-14)) + assert y.size(1) % 3 == 0 + nr_mix = y.size(1) // 3 + + # B x T x C + y = y.transpose(1, 2) + logit_probs = y[:, :, :nr_mix] + + # sample mixture indicator from softmax + temp = logit_probs.data.new(logit_probs.size()).uniform_(1e-5, 1.0 - 1e-5) + temp = logit_probs.data - torch.log(- torch.log(temp)) + _, argmax = temp.max(dim=-1) + + # (B, T) -> (B, T, nr_mix) + one_hot = to_one_hot(argmax, nr_mix) + # select logistic parameters + means = torch.sum(y[:, :, nr_mix:2 * nr_mix] * one_hot, dim=-1) + log_scales = torch.clamp(torch.sum( + y[:, :, 2 * nr_mix:3 * nr_mix] * one_hot, dim=-1), min=log_scale_min) + # sample from logistic & clip to interval + # we don't actually round to the nearest 8bit value when sampling + u = means.data.new(means.size()).uniform_(1e-5, 1.0 - 1e-5) + x = means + torch.exp(log_scales) * (torch.log(u) - torch.log(1. - u)) + + x = torch.clamp(torch.clamp(x, min=-1.), max=1.) + + return x + + +def to_one_hot(tensor, n, fill_with=1.): + # we perform one hot encore with respect to the last axis + one_hot = torch.FloatTensor(tensor.size() + (n,)).zero_() + if tensor.is_cuda: + one_hot = one_hot.cuda() + one_hot.scatter_(len(tensor.size()), tensor.unsqueeze(-1), fill_with) + return one_hot From e495e03ea11ba570dfef73e5581d84b8090a1672 Mon Sep 17 00:00:00 2001 From: sanjaesc Date: Fri, 16 Oct 2020 21:19:51 +0200 Subject: [PATCH 25/98] some minor changes to wavernn --- TTS/bin/train_wavernn_vocoder.py | 31 ++++--- TTS/vocoder/configs/wavernn_config.json | 8 +- TTS/vocoder/datasets/wavernn_dataset.py | 11 +-- TTS/vocoder/utils/generic_utils.py | 112 +++++++++++++----------- 4 files changed, 82 insertions(+), 80 deletions(-) diff --git a/TTS/bin/train_wavernn_vocoder.py b/TTS/bin/train_wavernn_vocoder.py index 2f77ab57..e2b8057e 100644 --- a/TTS/bin/train_wavernn_vocoder.py +++ b/TTS/bin/train_wavernn_vocoder.py @@ -13,17 +13,13 @@ import torch from torch.utils.data import DataLoader from torch.utils.data.distributed import DistributedSampler - -from TTS.utils.audio import AudioProcessor from TTS.tts.utils.visual import plot_spectrogram +from TTS.utils.audio import AudioProcessor +from TTS.utils.radam import RAdam from TTS.utils.io import copy_config_file, load_config -from TTS.vocoder.datasets.wavernn_dataset import WaveRNNDataset -from TTS.utils.tensorboard_logger import TensorboardLogger -from TTS.vocoder.datasets.preprocess import load_wav_data, load_wav_feat_data -from TTS.vocoder.utils.distribution import discretized_mix_logistic_loss, gaussian_loss -from TTS.vocoder.utils.generic_utils import setup_wavernn from TTS.utils.training import setup_torch_training_env from TTS.utils.console_logger import ConsoleLogger +from TTS.utils.tensorboard_logger import TensorboardLogger from TTS.utils.generic_utils import ( KeepAverage, count_parameters, @@ -32,6 +28,10 @@ from TTS.utils.generic_utils import ( remove_experiment_folder, set_init_dict, ) +from TTS.vocoder.datasets.wavernn_dataset import WaveRNNDataset +from TTS.vocoder.datasets.preprocess import load_wav_data, load_wav_feat_data +from TTS.vocoder.utils.distribution import discretized_mix_logistic_loss, gaussian_loss +from TTS.vocoder.utils.generic_utils import setup_wavernn from TTS.vocoder.utils.io import save_best_model, save_checkpoint @@ -105,9 +105,7 @@ def train(model, optimizer, criterion, scheduler, ap, global_step, epoch): # MODEL TRAINING # ################## y_hat = model(x, m) - y_hat_vis = y_hat # for visualization - # y_hat = 
y_hat.transpose(1, 2) if isinstance(model.mode, int): y_hat = y_hat.transpose(1, 2).unsqueeze(-1) else: @@ -200,8 +198,8 @@ def train(model, optimizer, criterion, scheduler, ap, global_step, epoch): ) # compute spectrograms figures = { - "prediction": plot_spectrogram(predict_mel.T, ap, output_fig=False), - "ground_truth": plot_spectrogram(ground_mel.T, ap, output_fig=False), + "prediction": plot_spectrogram(predict_mel.T), + "ground_truth": plot_spectrogram(ground_mel.T), } tb_logger.tb_train_figures(global_step, figures) end_time = time.time() @@ -237,6 +235,7 @@ def evaluate(model, criterion, ap, global_step, epoch): global_step += 1 y_hat = model(x, m) + y_hat_viz = y_hat # for vizualization if isinstance(model.mode, int): y_hat = y_hat.transpose(1, 2).unsqueeze(-1) else: @@ -266,7 +265,7 @@ def evaluate(model, criterion, ap, global_step, epoch): if epoch > CONFIG.test_delay_epochs: # synthesize a full voice - wav_path = eval_data[random.randrange(0, len(eval_data))][0] + wav_path = train_data[random.randrange(0, len(train_data))][0] wav = ap.load_wav(wav_path) ground_mel = ap.melspectrogram(wav) sample_wav = model.generate( @@ -283,8 +282,8 @@ def evaluate(model, criterion, ap, global_step, epoch): ) # compute spectrograms figures = { - "prediction": plot_spectrogram(predict_mel.T, ap, output_fig=False), - "ground_truth": plot_spectrogram(ground_mel.T, ap, output_fig=False), + "eval/prediction": plot_spectrogram(predict_mel.T), + "eval/ground_truth": plot_spectrogram(ground_mel.T), } tb_logger.tb_eval_figures(global_step, figures) @@ -303,7 +302,6 @@ def main(args): # pylint: disable=redefined-outer-name eval_data, train_data = load_wav_feat_data( CONFIG.data_path, CONFIG.feature_path, CONFIG.eval_split_size ) - eval_data, train_data = eval_data, train_data else: eval_data, train_data = load_wav_data(CONFIG.data_path, CONFIG.eval_split_size) @@ -326,7 +324,8 @@ def main(args): # pylint: disable=redefined-outer-name if isinstance(CONFIG.mode, int): criterion.cuda() - optimizer = optim.Adam(model_wavernn.parameters(), lr=CONFIG.lr, weight_decay=0) + optimizer = RAdam(model_wavernn.parameters(), lr=CONFIG.lr, weight_decay=0) + scheduler = None if "lr_scheduler" in CONFIG: scheduler = getattr(torch.optim.lr_scheduler, CONFIG.lr_scheduler) diff --git a/TTS/vocoder/configs/wavernn_config.json b/TTS/vocoder/configs/wavernn_config.json index f7e5d99f..67503aef 100644 --- a/TTS/vocoder/configs/wavernn_config.json +++ b/TTS/vocoder/configs/wavernn_config.json @@ -1,5 +1,4 @@ { - "model": "wavernn", "run_name": "wavernn_test", "run_description": "wavernn_test training", @@ -54,13 +53,14 @@ "mode": "mold", // mold [string], gauss [string], bits [int] "mulaw": false, // apply mulaw if mode is bits "padding": 2, // pad the input for resnet to see wider input length - + // DATASET + "use_gta": true, // use computed gta features from the tts model "data_path": "/media/alexander/LinuxFS/SpeechData/GothicSpeech/NPC_Speech/", // path containing training wav files - "feature_path": "/media/alexander/LinuxFS/SpeechData/GothicSpeech/NPC_Speech_Computed/mel/", // path containing extracted features .npy (mels / quant) + "feature_path": "/media/alexander/LinuxFS/SpeechData/GothicSpeech/NPC_Speech_Computed/mel/", // path containing computed features .npy (mels / quant) // TRAINING - "batch_size": 32, // Batch size for training. Lower values than 32 might cause hard to learn attention. + "batch_size": 64, // Batch size for training. Lower values than 32 might cause hard to learn attention. 
"epochs": 10000, // total number of epochs to train. "warmup_steps": 10, diff --git a/TTS/vocoder/datasets/wavernn_dataset.py b/TTS/vocoder/datasets/wavernn_dataset.py index b5a7fdad..8faf5f3c 100644 --- a/TTS/vocoder/datasets/wavernn_dataset.py +++ b/TTS/vocoder/datasets/wavernn_dataset.py @@ -7,8 +7,7 @@ from torch.utils.data import Dataset class WaveRNNDataset(Dataset): """ - WaveRNN Dataset searchs for all the wav files under root path - and converts them to acoustic features on the fly. + WaveRNN Dataset searchs for all the wav files under root path. """ def __init__( @@ -20,8 +19,6 @@ class WaveRNNDataset(Dataset): pad, mode, is_training=True, - return_segments=True, - use_cache=False, verbose=False, ): @@ -32,14 +29,8 @@ class WaveRNNDataset(Dataset): self.pad = pad self.mode = mode self.is_training = is_training - self.return_segments = return_segments - self.use_cache = use_cache self.verbose = verbose - # wav_files = [f"{self.path}wavs/{file}.wav" for file in self.metadata] - # with Pool(4) as pool: - # self.wav_cache = pool.map(self.ap.load_wav, wav_files) - def __len__(self): return len(self.item_list) diff --git a/TTS/vocoder/utils/generic_utils.py b/TTS/vocoder/utils/generic_utils.py index 365d0e11..c73c5248 100644 --- a/TTS/vocoder/utils/generic_utils.py +++ b/TTS/vocoder/utils/generic_utils.py @@ -39,11 +39,12 @@ def plot_results(y_hat, y, ap, global_step, name_prefix): def to_camel(text): text = text.capitalize() - return re.sub(r'(?!^)_([a-zA-Z])', lambda m: m.group(1).upper(), text) + return re.sub(r"(?!^)_([a-zA-Z])", lambda m: m.group(1).upper(), text) + def setup_wavernn(c): - print(" > Model: {}".format(c.model)) - MyModel = importlib.import_module('TTS.vocoder.models.wavernn') + print(" > Model: WaveRNN") + MyModel = importlib.import_module("TTS.vocoder.models.wavernn") MyModel = getattr(MyModel, "WaveRNN") model = MyModel( rnn_dims=512, @@ -58,98 +59,109 @@ def setup_wavernn(c): compute_dims=128, res_out_dims=128, res_blocks=10, - hop_length=c.audio['hop_length'], - sample_rate=c.audio['sample_rate']) + hop_length=c.audio["hop_length"], + sample_rate=c.audio["sample_rate"], + ) return model + def setup_generator(c): print(" > Generator Model: {}".format(c.generator_model)) - MyModel = importlib.import_module('TTS.vocoder.models.' + - c.generator_model.lower()) + MyModel = importlib.import_module("TTS.vocoder.models." 
+ c.generator_model.lower()) MyModel = getattr(MyModel, to_camel(c.generator_model)) - if c.generator_model in 'melgan_generator': + if c.generator_model in "melgan_generator": model = MyModel( - in_channels=c.audio['num_mels'], + in_channels=c.audio["num_mels"], out_channels=1, proj_kernel=7, base_channels=512, - upsample_factors=c.generator_model_params['upsample_factors'], + upsample_factors=c.generator_model_params["upsample_factors"], res_kernel=3, - num_res_blocks=c.generator_model_params['num_res_blocks']) - if c.generator_model in 'melgan_fb_generator': + num_res_blocks=c.generator_model_params["num_res_blocks"], + ) + if c.generator_model in "melgan_fb_generator": pass - if c.generator_model in 'multiband_melgan_generator': + if c.generator_model in "multiband_melgan_generator": model = MyModel( - in_channels=c.audio['num_mels'], + in_channels=c.audio["num_mels"], out_channels=4, proj_kernel=7, base_channels=384, - upsample_factors=c.generator_model_params['upsample_factors'], + upsample_factors=c.generator_model_params["upsample_factors"], res_kernel=3, - num_res_blocks=c.generator_model_params['num_res_blocks']) - if c.generator_model in 'fullband_melgan_generator': + num_res_blocks=c.generator_model_params["num_res_blocks"], + ) + if c.generator_model in "fullband_melgan_generator": model = MyModel( - in_channels=c.audio['num_mels'], + in_channels=c.audio["num_mels"], out_channels=1, proj_kernel=7, base_channels=512, - upsample_factors=c.generator_model_params['upsample_factors'], + upsample_factors=c.generator_model_params["upsample_factors"], res_kernel=3, - num_res_blocks=c.generator_model_params['num_res_blocks']) - if c.generator_model in 'parallel_wavegan_generator': + num_res_blocks=c.generator_model_params["num_res_blocks"], + ) + if c.generator_model in "parallel_wavegan_generator": model = MyModel( in_channels=1, out_channels=1, kernel_size=3, - num_res_blocks=c.generator_model_params['num_res_blocks'], - stacks=c.generator_model_params['stacks'], + num_res_blocks=c.generator_model_params["num_res_blocks"], + stacks=c.generator_model_params["stacks"], res_channels=64, gate_channels=128, skip_channels=64, - aux_channels=c.audio['num_mels'], + aux_channels=c.audio["num_mels"], dropout=0.0, bias=True, use_weight_norm=True, - upsample_factors=c.generator_model_params['upsample_factors']) + upsample_factors=c.generator_model_params["upsample_factors"], + ) return model def setup_discriminator(c): print(" > Discriminator Model: {}".format(c.discriminator_model)) - if 'parallel_wavegan' in c.discriminator_model: + if "parallel_wavegan" in c.discriminator_model: MyModel = importlib.import_module( - 'TTS.vocoder.models.parallel_wavegan_discriminator') + "TTS.vocoder.models.parallel_wavegan_discriminator" + ) else: - MyModel = importlib.import_module('TTS.vocoder.models.' + - c.discriminator_model.lower()) + MyModel = importlib.import_module( + "TTS.vocoder.models." + c.discriminator_model.lower() + ) MyModel = getattr(MyModel, to_camel(c.discriminator_model.lower())) - if c.discriminator_model in 'random_window_discriminator': + if c.discriminator_model in "random_window_discriminator": model = MyModel( - cond_channels=c.audio['num_mels'], - hop_length=c.audio['hop_length'], - uncond_disc_donwsample_factors=c. - discriminator_model_params['uncond_disc_donwsample_factors'], - cond_disc_downsample_factors=c. - discriminator_model_params['cond_disc_downsample_factors'], - cond_disc_out_channels=c. 
- discriminator_model_params['cond_disc_out_channels'], - window_sizes=c.discriminator_model_params['window_sizes']) - if c.discriminator_model in 'melgan_multiscale_discriminator': + cond_channels=c.audio["num_mels"], + hop_length=c.audio["hop_length"], + uncond_disc_donwsample_factors=c.discriminator_model_params[ + "uncond_disc_donwsample_factors" + ], + cond_disc_downsample_factors=c.discriminator_model_params[ + "cond_disc_downsample_factors" + ], + cond_disc_out_channels=c.discriminator_model_params[ + "cond_disc_out_channels" + ], + window_sizes=c.discriminator_model_params["window_sizes"], + ) + if c.discriminator_model in "melgan_multiscale_discriminator": model = MyModel( in_channels=1, out_channels=1, kernel_sizes=(5, 3), - base_channels=c.discriminator_model_params['base_channels'], - max_channels=c.discriminator_model_params['max_channels'], - downsample_factors=c. - discriminator_model_params['downsample_factors']) - if c.discriminator_model == 'residual_parallel_wavegan_discriminator': + base_channels=c.discriminator_model_params["base_channels"], + max_channels=c.discriminator_model_params["max_channels"], + downsample_factors=c.discriminator_model_params["downsample_factors"], + ) + if c.discriminator_model == "residual_parallel_wavegan_discriminator": model = MyModel( in_channels=1, out_channels=1, kernel_size=3, - num_layers=c.discriminator_model_params['num_layers'], - stacks=c.discriminator_model_params['stacks'], + num_layers=c.discriminator_model_params["num_layers"], + stacks=c.discriminator_model_params["stacks"], res_channels=64, gate_channels=128, skip_channels=64, @@ -158,17 +170,17 @@ def setup_discriminator(c): nonlinear_activation="LeakyReLU", nonlinear_activation_params={"negative_slope": 0.2}, ) - if c.discriminator_model == 'parallel_wavegan_discriminator': + if c.discriminator_model == "parallel_wavegan_discriminator": model = MyModel( in_channels=1, out_channels=1, kernel_size=3, - num_layers=c.discriminator_model_params['num_layers'], + num_layers=c.discriminator_model_params["num_layers"], conv_channels=64, dilation_factor=1, nonlinear_activation="LeakyReLU", nonlinear_activation_params={"negative_slope": 0.2}, - bias=True + bias=True, ) return model From 878b7c373ef5e7b590a116d71340b16c7a93fe39 Mon Sep 17 00:00:00 2001 From: sanjaesc Date: Mon, 19 Oct 2020 14:37:30 +0200 Subject: [PATCH 26/98] added feature preprocessing if not set in config --- TTS/bin/train_wavernn_vocoder.py | 64 ++++++++++++++++--------- TTS/vocoder/configs/wavernn_config.json | 11 ++--- TTS/vocoder/datasets/preprocess.py | 25 +++++++++- TTS/vocoder/datasets/wavernn_dataset.py | 1 + 4 files changed, 71 insertions(+), 30 deletions(-) diff --git a/TTS/bin/train_wavernn_vocoder.py b/TTS/bin/train_wavernn_vocoder.py index e2b8057e..533fe0ce 100644 --- a/TTS/bin/train_wavernn_vocoder.py +++ b/TTS/bin/train_wavernn_vocoder.py @@ -29,7 +29,12 @@ from TTS.utils.generic_utils import ( set_init_dict, ) from TTS.vocoder.datasets.wavernn_dataset import WaveRNNDataset -from TTS.vocoder.datasets.preprocess import load_wav_data, load_wav_feat_data +from TTS.vocoder.datasets.preprocess import ( + load_wav_data, + find_feat_files, + load_wav_feat_data, + preprocess_wav_files, +) from TTS.vocoder.utils.distribution import discretized_mix_logistic_loss, gaussian_loss from TTS.vocoder.utils.generic_utils import setup_wavernn from TTS.vocoder.utils.io import save_best_model, save_checkpoint @@ -192,15 +197,17 @@ def train(model, optimizer, criterion, scheduler, ap, global_step, epoch): ) predict_mel = 
ap.melspectrogram(sample_wav) - # Sample audio - tb_logger.tb_train_audios( - global_step, {"eval/audio": sample_wav}, CONFIG.audio["sample_rate"] - ) # compute spectrograms figures = { - "prediction": plot_spectrogram(predict_mel.T), - "ground_truth": plot_spectrogram(ground_mel.T), + "train/ground_truth": plot_spectrogram(ground_mel.T), + "train/prediction": plot_spectrogram(predict_mel.T), } + + # Sample audio + tb_logger.tb_train_audios( + global_step, {"train/audio": sample_wav}, CONFIG.audio["sample_rate"] + ) + tb_logger.tb_train_figures(global_step, figures) end_time = time.time() @@ -235,7 +242,6 @@ def evaluate(model, criterion, ap, global_step, epoch): global_step += 1 y_hat = model(x, m) - y_hat_viz = y_hat # for vizualization if isinstance(model.mode, int): y_hat = y_hat.transpose(1, 2).unsqueeze(-1) else: @@ -263,11 +269,11 @@ def evaluate(model, criterion, ap, global_step, epoch): if CONFIG.print_eval: c_logger.print_eval_step(num_iter, loss_dict, keep_avg.avg_values) - if epoch > CONFIG.test_delay_epochs: - # synthesize a full voice - wav_path = train_data[random.randrange(0, len(train_data))][0] + if epoch % CONFIG.test_every_epochs == 0: + # synthesize a part of data + wav_path = eval_data[random.randrange(0, len(eval_data))][0] wav = ap.load_wav(wav_path) - ground_mel = ap.melspectrogram(wav) + ground_mel = ap.melspectrogram(wav[:22000]) sample_wav = model.generate( ground_mel, CONFIG.batched, @@ -276,15 +282,17 @@ def evaluate(model, criterion, ap, global_step, epoch): ) predict_mel = ap.melspectrogram(sample_wav) + # compute spectrograms + figures = { + "eval/ground_truth": plot_spectrogram(ground_mel.T), + "eval/prediction": plot_spectrogram(predict_mel.T), + } + # Sample audio tb_logger.tb_eval_audios( global_step, {"eval/audio": sample_wav}, CONFIG.audio["sample_rate"] ) - # compute spectrograms - figures = { - "eval/prediction": plot_spectrogram(predict_mel.T), - "eval/ground_truth": plot_spectrogram(ground_mel.T), - } + tb_logger.tb_eval_figures(global_step, figures) tb_logger.tb_eval_stats(global_step, keep_avg.avg_values) @@ -296,6 +304,9 @@ def main(args): # pylint: disable=redefined-outer-name # pylint: disable=global-variable-undefined global train_data, eval_data + # setup audio processor + ap = AudioProcessor(**CONFIG.audio) + print(f" > Loading wavs from: {CONFIG.data_path}") if CONFIG.feature_path is not None: print(f" > Loading features from: {CONFIG.feature_path}") @@ -303,11 +314,20 @@ def main(args): # pylint: disable=redefined-outer-name CONFIG.data_path, CONFIG.feature_path, CONFIG.eval_split_size ) else: - eval_data, train_data = load_wav_data(CONFIG.data_path, CONFIG.eval_split_size) - - # setup audio processor - ap = AudioProcessor(**CONFIG.audio) - + mel_feat_path = os.path.join(OUT_PATH, "mel") + feat_data = find_feat_files(mel_feat_path) + if feat_data: + print(f" > Loading features from: {mel_feat_path}") + eval_data, train_data = load_wav_feat_data( + CONFIG.data_path, mel_feat_path, CONFIG.eval_split_size + ) + else: + print(f" > No feature data found. 
Preprocessing...") + # preprocessing feature data from given wav files + preprocess_wav_files(OUT_PATH, CONFIG, ap) + eval_data, train_data = load_wav_feat_data( + CONFIG.data_path, mel_feat_path, CONFIG.eval_split_size + ) # setup model model_wavernn = setup_wavernn(CONFIG) diff --git a/TTS/vocoder/configs/wavernn_config.json b/TTS/vocoder/configs/wavernn_config.json index 67503aef..8e6a8c32 100644 --- a/TTS/vocoder/configs/wavernn_config.json +++ b/TTS/vocoder/configs/wavernn_config.json @@ -55,18 +55,17 @@ "padding": 2, // pad the input for resnet to see wider input length // DATASET - "use_gta": true, // use computed gta features from the tts model - "data_path": "/media/alexander/LinuxFS/SpeechData/GothicSpeech/NPC_Speech/", // path containing training wav files - "feature_path": "/media/alexander/LinuxFS/SpeechData/GothicSpeech/NPC_Speech_Computed/mel/", // path containing computed features .npy (mels / quant) + //"use_gta": true, // use computed gta features from the tts model + "data_path": "path/to/wav/files", // path containing training wav files + "feature_path": null, // path containing computed features from wav files if null compute them // TRAINING "batch_size": 64, // Batch size for training. Lower values than 32 might cause hard to learn attention. "epochs": 10000, // total number of epochs to train. - "warmup_steps": 10, // VALIDATION "run_eval": true, - "test_delay_epochs": 10, // early testing only wastes computation time. + "test_every_epochs": 10, // Test after set number of epochs (Test every 20 epochs for example) // OPTIMIZER "grad_clip": 4, // apply gradient clipping if > 0 @@ -90,6 +89,6 @@ "eval_split_size": 50, // number of samples for testing // PATHS - "output_path": "/media/alexander/LinuxFS/Projects/wavernn/Trainings/" + "output_path": "output/training/path" } diff --git a/TTS/vocoder/datasets/preprocess.py b/TTS/vocoder/datasets/preprocess.py index a5365686..afea45fd 100644 --- a/TTS/vocoder/datasets/preprocess.py +++ b/TTS/vocoder/datasets/preprocess.py @@ -1,17 +1,38 @@ import glob import os from pathlib import Path +from tqdm import tqdm import numpy as np +def preprocess_wav_files(out_path, config, ap): + os.makedirs(os.path.join(out_path, "quant"), exist_ok=True) + os.makedirs(os.path.join(out_path, "mel"), exist_ok=True) + wav_files = find_wav_files(config.data_path) + for path in tqdm(wav_files): + wav_name = Path(path).stem + quant_path = os.path.join(out_path, "quant", wav_name + ".npy") + mel_path = os.path.join(out_path, "mel", wav_name + ".npy") + y = ap.load_wav(path) + mel = ap.melspectrogram(y) + np.save(mel_path, mel) + if isinstance(config.mode, int): + quant = ( + ap.mulaw_encode(y, qc=config.mode) + if config.mulaw + else ap.quantize(y, bits=config.mode) + ) + np.save(quant_path, quant) + + def find_wav_files(data_path): - wav_paths = glob.glob(os.path.join(data_path, '**', '*.wav'), recursive=True) + wav_paths = glob.glob(os.path.join(data_path, "**", "*.wav"), recursive=True) return wav_paths def find_feat_files(data_path): - feat_paths = glob.glob(os.path.join(data_path, '**', '*.npy'), recursive=True) + feat_paths = glob.glob(os.path.join(data_path, "**", "*.npy"), recursive=True) return feat_paths diff --git a/TTS/vocoder/datasets/wavernn_dataset.py b/TTS/vocoder/datasets/wavernn_dataset.py index 8faf5f3c..1b0a8077 100644 --- a/TTS/vocoder/datasets/wavernn_dataset.py +++ b/TTS/vocoder/datasets/wavernn_dataset.py @@ -48,6 +48,7 @@ class WaveRNNDataset(Dataset): feat_path = self.item_list[index] m = 
np.load(feat_path.replace("/quant/", "/mel/")) if self.mode in ["gauss", "mold"]: + # x = np.load(feat_path.replace("/mel/", "/quant/")) x = self.ap.load_wav(wavpath) elif isinstance(self.mode, int): x = np.load(feat_path.replace("/mel/", "/quant/")) From e8294cb9db2dc494270ddf8c06bf994987a6e65d Mon Sep 17 00:00:00 2001 From: sanjaesc Date: Mon, 19 Oct 2020 15:38:32 +0200 Subject: [PATCH 27/98] fixing pylint errors --- TTS/bin/train_wavernn_vocoder.py | 9 ++-- TTS/vocoder/datasets/wavernn_dataset.py | 4 +- TTS/vocoder/models/wavernn.py | 10 ++-- TTS/vocoder/utils/distribution.py | 72 ++++++++++++++----------- 4 files changed, 51 insertions(+), 44 deletions(-) diff --git a/TTS/bin/train_wavernn_vocoder.py b/TTS/bin/train_wavernn_vocoder.py index 533fe0ce..78984510 100644 --- a/TTS/bin/train_wavernn_vocoder.py +++ b/TTS/bin/train_wavernn_vocoder.py @@ -1,8 +1,5 @@ import argparse -import math import os -import pickle -import shutil import sys import traceback import time @@ -11,7 +8,8 @@ import random import torch from torch.utils.data import DataLoader -from torch.utils.data.distributed import DistributedSampler + +# from torch.utils.data.distributed import DistributedSampler from TTS.tts.utils.visual import plot_spectrogram from TTS.utils.audio import AudioProcessor @@ -30,7 +28,6 @@ from TTS.utils.generic_utils import ( ) from TTS.vocoder.datasets.wavernn_dataset import WaveRNNDataset from TTS.vocoder.datasets.preprocess import ( - load_wav_data, find_feat_files, load_wav_feat_data, preprocess_wav_files, @@ -322,7 +319,7 @@ def main(args): # pylint: disable=redefined-outer-name CONFIG.data_path, mel_feat_path, CONFIG.eval_split_size ) else: - print(f" > No feature data found. Preprocessing...") + print(" > No feature data found. Preprocessing...") # preprocessing feature data from given wav files preprocess_wav_files(OUT_PATH, CONFIG, ap) eval_data, train_data = load_wav_feat_data( diff --git a/TTS/vocoder/datasets/wavernn_dataset.py b/TTS/vocoder/datasets/wavernn_dataset.py index 1b0a8077..5d5b9f15 100644 --- a/TTS/vocoder/datasets/wavernn_dataset.py +++ b/TTS/vocoder/datasets/wavernn_dataset.py @@ -1,5 +1,3 @@ -import os -import glob import torch import numpy as np from torch.utils.data import Dataset @@ -42,7 +40,7 @@ class WaveRNNDataset(Dataset): wavpath, feat_path = self.item_list[index] m = np.load(feat_path.replace("/quant/", "/mel/")) # x = self.wav_cache[index] - if 5 > m.shape[-1]: + if m.shape[-1] < 5: print(" [!] Instance is too short! 
: {}".format(wavpath)) self.item_list[index] = self.item_list[index + 1] feat_path = self.item_list[index] diff --git a/TTS/vocoder/models/wavernn.py b/TTS/vocoder/models/wavernn.py index e1c4365f..9b637a6a 100644 --- a/TTS/vocoder/models/wavernn.py +++ b/TTS/vocoder/models/wavernn.py @@ -42,7 +42,7 @@ class MelResNet(nn.Module): self.conv_in = nn.Conv1d(in_dims, compute_dims, kernel_size=k_size, bias=False) self.batch_norm = nn.BatchNorm1d(compute_dims) self.layers = nn.ModuleList() - for i in range(res_blocks): + for _ in range(res_blocks): self.layers.append(ResBlock(compute_dims)) self.conv_out = nn.Conv1d(compute_dims, res_out_dims, kernel_size=1) @@ -365,7 +365,8 @@ class WaveRNN(nn.Module): (i * b_size, seq_len * b_size, b_size, gen_rate, realtime_ratio), ) - def get_gru_cell(self, gru): + @staticmethod + def get_gru_cell(gru): gru_cell = nn.GRUCell(gru.input_size, gru.hidden_size) gru_cell.weight_hh.data = gru.weight_hh_l0.data gru_cell.weight_ih.data = gru.weight_ih_l0.data @@ -373,13 +374,14 @@ class WaveRNN(nn.Module): gru_cell.bias_ih.data = gru.bias_ih_l0.data return gru_cell - def pad_tensor(self, x, pad, side="both"): + @staticmethod + def pad_tensor(x, pad, side="both"): # NB - this is just a quick method i need right now # i.e., it won't generalise to other shapes/dims b, t, c = x.size() total = t + 2 * pad if side == "both" else t + pad padded = torch.zeros(b, total, c).cuda() - if side == "before" or side == "both": + if side in ("before", "both"): padded[:, pad : pad + t, :] = x elif side == "after": padded[:, :t, :] = x diff --git a/TTS/vocoder/utils/distribution.py b/TTS/vocoder/utils/distribution.py index bfcbdd3f..705c14dc 100644 --- a/TTS/vocoder/utils/distribution.py +++ b/TTS/vocoder/utils/distribution.py @@ -11,7 +11,11 @@ def gaussian_loss(y_hat, y, log_std_min=-7.0): mean = y_hat[:, :, :1] log_std = torch.clamp(y_hat[:, :, 1:], min=log_std_min) # TODO: replace with pytorch dist - log_probs = -0.5 * (- math.log(2.0 * math.pi) - 2. * log_std - torch.pow(y - mean, 2) * torch.exp((-2.0 * log_std))) + log_probs = -0.5 * ( + -math.log(2.0 * math.pi) + - 2.0 * log_std + - torch.pow(y - mean, 2) * torch.exp((-2.0 * log_std)) + ) return log_probs.squeeze().mean() @@ -19,7 +23,10 @@ def sample_from_gaussian(y_hat, log_std_min=-7.0, scale_factor=1.0): assert y_hat.size(2) == 2 mean = y_hat[:, :, :1] log_std = torch.clamp(y_hat[:, :, 1:], min=log_std_min) - dist = Normal(mean, torch.exp(log_std), ) + dist = Normal( + mean, + torch.exp(log_std), + ) sample = dist.sample() sample = torch.clamp(torch.clamp(sample, min=-scale_factor), max=scale_factor) del dist @@ -36,11 +43,12 @@ def log_sum_exp(x): # It is adapted from https://github.com/r9y9/wavenet_vocoder/blob/master/wavenet_vocoder/mixture.py -def discretized_mix_logistic_loss(y_hat, y, num_classes=65536, - log_scale_min=None, reduce=True): +def discretized_mix_logistic_loss( + y_hat, y, num_classes=65536, log_scale_min=None, reduce=True +): if log_scale_min is None: log_scale_min = float(np.log(1e-14)) - y_hat = y_hat.permute(0,2,1) + y_hat = y_hat.permute(0, 2, 1) assert y_hat.dim() == 3 assert y_hat.size(1) % 3 == 0 nr_mix = y_hat.size(1) // 3 @@ -50,17 +58,17 @@ def discretized_mix_logistic_loss(y_hat, y, num_classes=65536, # unpack parameters. 
(B, T, num_mixtures) x 3 logit_probs = y_hat[:, :, :nr_mix] - means = y_hat[:, :, nr_mix:2 * nr_mix] - log_scales = torch.clamp(y_hat[:, :, 2 * nr_mix:3 * nr_mix], min=log_scale_min) + means = y_hat[:, :, nr_mix : 2 * nr_mix] + log_scales = torch.clamp(y_hat[:, :, 2 * nr_mix : 3 * nr_mix], min=log_scale_min) # B x T x 1 -> B x T x num_mixtures y = y.expand_as(means) centered_y = y - means inv_stdv = torch.exp(-log_scales) - plus_in = inv_stdv * (centered_y + 1. / (num_classes - 1)) + plus_in = inv_stdv * (centered_y + 1.0 / (num_classes - 1)) cdf_plus = torch.sigmoid(plus_in) - min_in = inv_stdv * (centered_y - 1. / (num_classes - 1)) + min_in = inv_stdv * (centered_y - 1.0 / (num_classes - 1)) cdf_min = torch.sigmoid(min_in) # log probability for edge case of 0 (before scaling) @@ -77,34 +85,35 @@ def discretized_mix_logistic_loss(y_hat, y, num_classes=65536, mid_in = inv_stdv * centered_y # log probability in the center of the bin, to be used in extreme cases # (not actually used in our code) - log_pdf_mid = mid_in - log_scales - 2. * F.softplus(mid_in) + log_pdf_mid = mid_in - log_scales - 2.0 * F.softplus(mid_in) # tf equivalent - """ - log_probs = tf.where(x < -0.999, log_cdf_plus, - tf.where(x > 0.999, log_one_minus_cdf_min, - tf.where(cdf_delta > 1e-5, - tf.log(tf.maximum(cdf_delta, 1e-12)), - log_pdf_mid - np.log(127.5)))) - """ + + # log_probs = tf.where(x < -0.999, log_cdf_plus, + # tf.where(x > 0.999, log_one_minus_cdf_min, + # tf.where(cdf_delta > 1e-5, + # tf.log(tf.maximum(cdf_delta, 1e-12)), + # log_pdf_mid - np.log(127.5)))) + # TODO: cdf_delta <= 1e-5 actually can happen. How can we choose the value # for num_classes=65536 case? 1e-7? not sure.. inner_inner_cond = (cdf_delta > 1e-5).float() - inner_inner_out = inner_inner_cond * \ - torch.log(torch.clamp(cdf_delta, min=1e-12)) + \ - (1. - inner_inner_cond) * (log_pdf_mid - np.log((num_classes - 1) / 2)) + inner_inner_out = inner_inner_cond * torch.log( + torch.clamp(cdf_delta, min=1e-12) + ) + (1.0 - inner_inner_cond) * (log_pdf_mid - np.log((num_classes - 1) / 2)) inner_cond = (y > 0.999).float() - inner_out = inner_cond * log_one_minus_cdf_min + (1. - inner_cond) * inner_inner_out + inner_out = ( + inner_cond * log_one_minus_cdf_min + (1.0 - inner_cond) * inner_inner_out + ) cond = (y < -0.999).float() - log_probs = cond * log_cdf_plus + (1. 
- cond) * inner_out + log_probs = cond * log_cdf_plus + (1.0 - cond) * inner_out log_probs = log_probs + F.log_softmax(logit_probs, -1) if reduce: return -torch.mean(log_sum_exp(log_probs)) - else: - return -log_sum_exp(log_probs).unsqueeze(-1) + return -log_sum_exp(log_probs).unsqueeze(-1) def sample_from_discretized_mix_logistic(y, log_scale_min=None): @@ -127,26 +136,27 @@ def sample_from_discretized_mix_logistic(y, log_scale_min=None): # sample mixture indicator from softmax temp = logit_probs.data.new(logit_probs.size()).uniform_(1e-5, 1.0 - 1e-5) - temp = logit_probs.data - torch.log(- torch.log(temp)) + temp = logit_probs.data - torch.log(-torch.log(temp)) _, argmax = temp.max(dim=-1) # (B, T) -> (B, T, nr_mix) one_hot = to_one_hot(argmax, nr_mix) # select logistic parameters - means = torch.sum(y[:, :, nr_mix:2 * nr_mix] * one_hot, dim=-1) - log_scales = torch.clamp(torch.sum( - y[:, :, 2 * nr_mix:3 * nr_mix] * one_hot, dim=-1), min=log_scale_min) + means = torch.sum(y[:, :, nr_mix : 2 * nr_mix] * one_hot, dim=-1) + log_scales = torch.clamp( + torch.sum(y[:, :, 2 * nr_mix : 3 * nr_mix] * one_hot, dim=-1), min=log_scale_min + ) # sample from logistic & clip to interval # we don't actually round to the nearest 8bit value when sampling u = means.data.new(means.size()).uniform_(1e-5, 1.0 - 1e-5) - x = means + torch.exp(log_scales) * (torch.log(u) - torch.log(1. - u)) + x = means + torch.exp(log_scales) * (torch.log(u) - torch.log(1.0 - u)) - x = torch.clamp(torch.clamp(x, min=-1.), max=1.) + x = torch.clamp(torch.clamp(x, min=-1.0), max=1.0) return x -def to_one_hot(tensor, n, fill_with=1.): +def to_one_hot(tensor, n, fill_with=1.0): # we perform one hot encore with respect to the last axis one_hot = torch.FloatTensor(tensor.size() + (n,)).zero_() if tensor.is_cuda: From 8de7c13708c03faedc22fb9940c1a430c45e22f3 Mon Sep 17 00:00:00 2001 From: erogol Date: Mon, 19 Oct 2020 15:47:12 +0200 Subject: [PATCH 28/98] fix no loss masking loss computation --- TTS/tts/layers/losses.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/TTS/tts/layers/losses.py b/TTS/tts/layers/losses.py index f07851ac..8256c0f7 100644 --- a/TTS/tts/layers/losses.py +++ b/TTS/tts/layers/losses.py @@ -2,6 +2,7 @@ import math import numpy as np import torch from torch import nn +from inspect import signature from torch.nn import functional from TTS.tts.utils.generic_utils import sequence_mask @@ -142,7 +143,11 @@ class DifferentailSpectralLoss(nn.Module): def forward(self, x, target, length): x_diff = x[:, 1:] - x[:, :-1] target_diff = target[:, 1:] - target[:, :-1] - return self.loss_func(x_diff, target_diff, length-1) + if len(signature(self.loss_func).parameters) > 2: + return self.loss_func(x_diff, target_diff, length-1) + else: + # if loss masking is not enabled + return self.loss_func(x_diff, target_diff) class GuidedAttentionLoss(torch.nn.Module): @@ -262,8 +267,11 @@ class TacotronLoss(torch.nn.Module): # double decoder consistency loss (if enabled) if self.config.double_decoder_consistency: - decoder_b_loss = self.criterion(decoder_b_output, mel_input, - output_lens) + if self.config.loss_masking: + decoder_b_loss = self.criterion(decoder_b_output, mel_input, + output_lens) + else: + decoder_b_loss = self.criterion(decoder_b_output, mel_input) # decoder_c_loss = torch.nn.functional.l1_loss(decoder_b_output, decoder_output) attention_c_loss = torch.nn.functional.l1_loss(alignments, alignments_backwards) loss += self.decoder_alpha * (decoder_b_loss + attention_c_loss) From 
016a77fcf22c973817589760c7e8f461460140b1 Mon Sep 17 00:00:00 2001 From: sanjaesc Date: Mon, 19 Oct 2020 16:20:15 +0200 Subject: [PATCH 29/98] fix formatting + pylint --- TTS/bin/compute_statistics.py | 52 ++--- TTS/bin/train_gan_vocoder.py | 344 +++++++++++++---------------- TTS/vocoder/models/wavernn.py | 47 ++-- TTS/vocoder/utils/distribution.py | 15 +- TTS/vocoder/utils/generic_utils.py | 101 ++++----- 5 files changed, 252 insertions(+), 307 deletions(-) diff --git a/TTS/bin/compute_statistics.py b/TTS/bin/compute_statistics.py index 9177c75b..ca089d3e 100755 --- a/TTS/bin/compute_statistics.py +++ b/TTS/bin/compute_statistics.py @@ -15,23 +15,17 @@ from TTS.utils.audio import AudioProcessor def main(): """Run preprocessing process.""" parser = argparse.ArgumentParser( - description="Compute mean and variance of spectrogtram features." - ) - parser.add_argument( - "--config_path", - type=str, - required=True, - help="TTS config file path to define audio processin parameters.", - ) - parser.add_argument( - "--out_path", default=None, type=str, help="directory to save the output file." - ) + description="Compute mean and variance of spectrogtram features.") + parser.add_argument("--config_path", type=str, required=True, + help="TTS config file path to define audio processin parameters.") + parser.add_argument("--out_path", default=None, type=str, + help="directory to save the output file.") args = parser.parse_args() # load config CONFIG = load_config(args.config_path) - CONFIG.audio["signal_norm"] = False # do not apply earlier normalization - CONFIG.audio["stats_path"] = None # discard pre-defined stats + CONFIG.audio['signal_norm'] = False # do not apply earlier normalization + CONFIG.audio['stats_path'] = None # discard pre-defined stats # load audio processor ap = AudioProcessor(**CONFIG.audio) @@ -65,27 +59,27 @@ def main(): output_file_path = os.path.join(args.out_path, "scale_stats.npy") stats = {} - stats["mel_mean"] = mel_mean - stats["mel_std"] = mel_scale - stats["linear_mean"] = linear_mean - stats["linear_std"] = linear_scale + stats['mel_mean'] = mel_mean + stats['mel_std'] = mel_scale + stats['linear_mean'] = linear_mean + stats['linear_std'] = linear_scale - print(f" > Avg mel spec mean: {mel_mean.mean()}") - print(f" > Avg mel spec scale: {mel_scale.mean()}") - print(f" > Avg linear spec mean: {linear_mean.mean()}") - print(f" > Avg lienar spec scale: {linear_scale.mean()}") + print(f' > Avg mel spec mean: {mel_mean.mean()}') + print(f' > Avg mel spec scale: {mel_scale.mean()}') + print(f' > Avg linear spec mean: {linear_mean.mean()}') + print(f' > Avg lienar spec scale: {linear_scale.mean()}') # set default config values for mean-var scaling - CONFIG.audio["stats_path"] = output_file_path - CONFIG.audio["signal_norm"] = True + CONFIG.audio['stats_path'] = output_file_path + CONFIG.audio['signal_norm'] = True # remove redundant values - del CONFIG.audio["max_norm"] - del CONFIG.audio["min_level_db"] - del CONFIG.audio["symmetric_norm"] - del CONFIG.audio["clip_norm"] - stats["audio_config"] = CONFIG.audio + del CONFIG.audio['max_norm'] + del CONFIG.audio['min_level_db'] + del CONFIG.audio['symmetric_norm'] + del CONFIG.audio['clip_norm'] + stats['audio_config'] = CONFIG.audio np.save(output_file_path, stats, allow_pickle=True) - print(f" > scale_stats.npy is saved to {output_file_path}") + print(f' > scale_stats.npy is saved to {output_file_path}') if __name__ == "__main__": diff --git a/TTS/bin/train_gan_vocoder.py b/TTS/bin/train_gan_vocoder.py index 7689c930..12edf048 
100644 --- a/TTS/bin/train_gan_vocoder.py +++ b/TTS/bin/train_gan_vocoder.py @@ -10,29 +10,20 @@ import torch from torch.utils.data import DataLoader from TTS.utils.audio import AudioProcessor from TTS.utils.console_logger import ConsoleLogger -from TTS.utils.generic_utils import ( - KeepAverage, - count_parameters, - create_experiment_folder, - get_git_branch, - remove_experiment_folder, - set_init_dict, -) +from TTS.utils.generic_utils import (KeepAverage, count_parameters, + create_experiment_folder, get_git_branch, + remove_experiment_folder, set_init_dict) from TTS.utils.io import copy_config_file, load_config from TTS.utils.radam import RAdam from TTS.utils.tensorboard_logger import TensorboardLogger from TTS.utils.training import setup_torch_training_env from TTS.vocoder.datasets.gan_dataset import GANDataset from TTS.vocoder.datasets.preprocess import load_wav_data, load_wav_feat_data - # from distribute import (DistributedSampler, apply_gradient_allreduce, # init_distributed, reduce_tensor) from TTS.vocoder.layers.losses import DiscriminatorLoss, GeneratorLoss -from TTS.vocoder.utils.generic_utils import ( - plot_results, - setup_discriminator, - setup_generator, -) +from TTS.vocoder.utils.generic_utils import (plot_results, setup_discriminator, + setup_generator) from TTS.vocoder.utils.io import save_best_model, save_checkpoint use_cuda, num_gpus = setup_torch_training_env(True, True) @@ -42,30 +33,27 @@ def setup_loader(ap, is_val=False, verbose=False): if is_val and not c.run_eval: loader = None else: - dataset = GANDataset( - ap=ap, - items=eval_data if is_val else train_data, - seq_len=c.seq_len, - hop_len=ap.hop_length, - pad_short=c.pad_short, - conv_pad=c.conv_pad, - is_training=not is_val, - return_segments=not is_val, - use_noise_augment=c.use_noise_augment, - use_cache=c.use_cache, - verbose=verbose, - ) + dataset = GANDataset(ap=ap, + items=eval_data if is_val else train_data, + seq_len=c.seq_len, + hop_len=ap.hop_length, + pad_short=c.pad_short, + conv_pad=c.conv_pad, + is_training=not is_val, + return_segments=not is_val, + use_noise_augment=c.use_noise_augment, + use_cache=c.use_cache, + verbose=verbose) dataset.shuffle_mapping() # sampler = DistributedSampler(dataset) if num_gpus > 1 else None - loader = DataLoader( - dataset, - batch_size=1 if is_val else c.batch_size, - shuffle=True, - drop_last=False, - sampler=None, - num_workers=c.num_val_loader_workers if is_val else c.num_loader_workers, - pin_memory=False, - ) + loader = DataLoader(dataset, + batch_size=1 if is_val else c.batch_size, + shuffle=True, + drop_last=False, + sampler=None, + num_workers=c.num_val_loader_workers + if is_val else c.num_loader_workers, + pin_memory=False) return loader @@ -92,26 +80,16 @@ def format_data(data): return co, x, None, None -def train( - model_G, - criterion_G, - optimizer_G, - model_D, - criterion_D, - optimizer_D, - scheduler_G, - scheduler_D, - ap, - global_step, - epoch, -): +def train(model_G, criterion_G, optimizer_G, model_D, criterion_D, optimizer_D, + scheduler_G, scheduler_D, ap, global_step, epoch): data_loader = setup_loader(ap, is_val=False, verbose=(epoch == 0)) model_G.train() model_D.train() epoch_time = 0 keep_avg = KeepAverage() if use_cuda: - batch_n_iter = int(len(data_loader.dataset) / (c.batch_size * num_gpus)) + batch_n_iter = int( + len(data_loader.dataset) / (c.batch_size * num_gpus)) else: batch_n_iter = int(len(data_loader.dataset) / c.batch_size) end_time = time.time() @@ -167,16 +145,16 @@ def train( scores_fake = D_out_fake # compute losses 
- loss_G_dict = criterion_G( - y_hat, y_G, scores_fake, feats_fake, feats_real, y_hat_sub, y_G_sub - ) - loss_G = loss_G_dict["G_loss"] + loss_G_dict = criterion_G(y_hat, y_G, scores_fake, feats_fake, + feats_real, y_hat_sub, y_G_sub) + loss_G = loss_G_dict['G_loss'] # optimizer generator optimizer_G.zero_grad() loss_G.backward() if c.gen_clip_grad > 0: - torch.nn.utils.clip_grad_norm_(model_G.parameters(), c.gen_clip_grad) + torch.nn.utils.clip_grad_norm_(model_G.parameters(), + c.gen_clip_grad) optimizer_G.step() if scheduler_G is not None: scheduler_G.step() @@ -221,13 +199,14 @@ def train( # compute losses loss_D_dict = criterion_D(scores_fake, scores_real) - loss_D = loss_D_dict["D_loss"] + loss_D = loss_D_dict['D_loss'] # optimizer discriminator optimizer_D.zero_grad() loss_D.backward() if c.disc_clip_grad > 0: - torch.nn.utils.clip_grad_norm_(model_D.parameters(), c.disc_clip_grad) + torch.nn.utils.clip_grad_norm_(model_D.parameters(), + c.disc_clip_grad) optimizer_D.step() if scheduler_D is not None: scheduler_D.step() @@ -242,40 +221,34 @@ def train( epoch_time += step_time # get current learning rates - current_lr_G = list(optimizer_G.param_groups)[0]["lr"] - current_lr_D = list(optimizer_D.param_groups)[0]["lr"] + current_lr_G = list(optimizer_G.param_groups)[0]['lr'] + current_lr_D = list(optimizer_D.param_groups)[0]['lr'] # update avg stats update_train_values = dict() for key, value in loss_dict.items(): - update_train_values["avg_" + key] = value - update_train_values["avg_loader_time"] = loader_time - update_train_values["avg_step_time"] = step_time + update_train_values['avg_' + key] = value + update_train_values['avg_loader_time'] = loader_time + update_train_values['avg_step_time'] = step_time keep_avg.update_values(update_train_values) # print training stats if global_step % c.print_step == 0: log_dict = { - "step_time": [step_time, 2], - "loader_time": [loader_time, 4], + 'step_time': [step_time, 2], + 'loader_time': [loader_time, 4], "current_lr_G": current_lr_G, - "current_lr_D": current_lr_D, + "current_lr_D": current_lr_D } - c_logger.print_train_step( - batch_n_iter, - num_iter, - global_step, - log_dict, - loss_dict, - keep_avg.avg_values, - ) + c_logger.print_train_step(batch_n_iter, num_iter, global_step, + log_dict, loss_dict, keep_avg.avg_values) # plot step stats if global_step % 10 == 0: iter_stats = { "lr_G": current_lr_G, "lr_D": current_lr_D, - "step_time": step_time, + "step_time": step_time } iter_stats.update(loss_dict) tb_logger.tb_train_iter_stats(global_step, iter_stats) @@ -284,28 +257,27 @@ def train( if global_step % c.save_step == 0: if c.checkpoint: # save model - save_checkpoint( - model_G, - optimizer_G, - scheduler_G, - model_D, - optimizer_D, - scheduler_D, - global_step, - epoch, - OUT_PATH, - model_losses=loss_dict, - ) + save_checkpoint(model_G, + optimizer_G, + scheduler_G, + model_D, + optimizer_D, + scheduler_D, + global_step, + epoch, + OUT_PATH, + model_losses=loss_dict) # compute spectrograms - figures = plot_results(y_hat_vis, y_G, ap, global_step, "train") + figures = plot_results(y_hat_vis, y_G, ap, global_step, + 'train') tb_logger.tb_train_figures(global_step, figures) # Sample audio sample_voice = y_hat_vis[0].squeeze(0).detach().cpu().numpy() - tb_logger.tb_train_audios( - global_step, {"train/audio": sample_voice}, c.audio["sample_rate"] - ) + tb_logger.tb_train_audios(global_step, + {'train/audio': sample_voice}, + c.audio["sample_rate"]) end_time = time.time() # print epoch stats @@ -379,9 +351,8 @@ def evaluate(model_G, 
criterion_G, model_D, criterion_D, ap, global_step, epoch) feats_fake, feats_real = None, None # compute losses - loss_G_dict = criterion_G( - y_hat, y_G, scores_fake, feats_fake, feats_real, y_hat_sub, y_G_sub - ) + loss_G_dict = criterion_G(y_hat, y_G, scores_fake, feats_fake, + feats_real, y_hat_sub, y_G_sub) loss_dict = dict() for key, value in loss_G_dict.items(): @@ -437,9 +408,9 @@ def evaluate(model_G, criterion_G, model_D, criterion_D, ap, global_step, epoch) # update avg stats update_eval_values = dict() for key, value in loss_dict.items(): - update_eval_values["avg_" + key] = value - update_eval_values["avg_loader_time"] = loader_time - update_eval_values["avg_step_time"] = step_time + update_eval_values['avg_' + key] = value + update_eval_values['avg_loader_time'] = loader_time + update_eval_values['avg_step_time'] = step_time keep_avg.update_values(update_eval_values) # print eval stats @@ -447,14 +418,13 @@ def evaluate(model_G, criterion_G, model_D, criterion_D, ap, global_step, epoch) c_logger.print_eval_step(num_iter, loss_dict, keep_avg.avg_values) # compute spectrograms - figures = plot_results(y_hat, y_G, ap, global_step, "eval") + figures = plot_results(y_hat, y_G, ap, global_step, 'eval') tb_logger.tb_eval_figures(global_step, figures) # Sample audio sample_voice = y_hat[0].squeeze(0).detach().cpu().numpy() - tb_logger.tb_eval_audios( - global_step, {"eval/audio": sample_voice}, c.audio["sample_rate"] - ) + tb_logger.tb_eval_audios(global_step, {'eval/audio': sample_voice}, + c.audio["sample_rate"]) # synthesize a full voice data_loader.return_segments = False @@ -472,8 +442,7 @@ def main(args): # pylint: disable=redefined-outer-name if c.feature_path is not None: print(f" > Loading features from: {c.feature_path}") eval_data, train_data = load_wav_feat_data( - c.data_path, c.feature_path, c.eval_split_size - ) + c.data_path, c.feature_path, c.eval_split_size) else: eval_data, train_data = load_wav_data(c.data_path, c.eval_split_size) @@ -491,63 +460,68 @@ def main(args): # pylint: disable=redefined-outer-name # setup optimizers optimizer_gen = RAdam(model_gen.parameters(), lr=c.lr_gen, weight_decay=0) - optimizer_disc = RAdam(model_disc.parameters(), lr=c.lr_disc, weight_decay=0) + optimizer_disc = RAdam(model_disc.parameters(), + lr=c.lr_disc, + weight_decay=0) # schedulers scheduler_gen = None scheduler_disc = None - if "lr_scheduler_gen" in c: + if 'lr_scheduler_gen' in c: scheduler_gen = getattr(torch.optim.lr_scheduler, c.lr_scheduler_gen) - scheduler_gen = scheduler_gen(optimizer_gen, **c.lr_scheduler_gen_params) - if "lr_scheduler_disc" in c: + scheduler_gen = scheduler_gen( + optimizer_gen, **c.lr_scheduler_gen_params) + if 'lr_scheduler_disc' in c: scheduler_disc = getattr(torch.optim.lr_scheduler, c.lr_scheduler_disc) - scheduler_disc = scheduler_disc(optimizer_disc, **c.lr_scheduler_disc_params) + scheduler_disc = scheduler_disc( + optimizer_disc, **c.lr_scheduler_disc_params) # setup criterion criterion_gen = GeneratorLoss(c) criterion_disc = DiscriminatorLoss(c) if args.restore_path: - checkpoint = torch.load(args.restore_path, map_location="cpu") + checkpoint = torch.load(args.restore_path, map_location='cpu') try: print(" > Restoring Generator Model...") - model_gen.load_state_dict(checkpoint["model"]) + model_gen.load_state_dict(checkpoint['model']) print(" > Restoring Generator Optimizer...") - optimizer_gen.load_state_dict(checkpoint["optimizer"]) + optimizer_gen.load_state_dict(checkpoint['optimizer']) print(" > Restoring Discriminator Model...") - 
model_disc.load_state_dict(checkpoint["model_disc"]) + model_disc.load_state_dict(checkpoint['model_disc']) print(" > Restoring Discriminator Optimizer...") - optimizer_disc.load_state_dict(checkpoint["optimizer_disc"]) - if "scheduler" in checkpoint: + optimizer_disc.load_state_dict(checkpoint['optimizer_disc']) + if 'scheduler' in checkpoint: print(" > Restoring Generator LR Scheduler...") - scheduler_gen.load_state_dict(checkpoint["scheduler"]) + scheduler_gen.load_state_dict(checkpoint['scheduler']) # NOTE: Not sure if necessary scheduler_gen.optimizer = optimizer_gen - if "scheduler_disc" in checkpoint: + if 'scheduler_disc' in checkpoint: print(" > Restoring Discriminator LR Scheduler...") - scheduler_disc.load_state_dict(checkpoint["scheduler_disc"]) + scheduler_disc.load_state_dict(checkpoint['scheduler_disc']) scheduler_disc.optimizer = optimizer_disc except RuntimeError: # retore only matching layers. print(" > Partial model initialization...") model_dict = model_gen.state_dict() - model_dict = set_init_dict(model_dict, checkpoint["model"], c) + model_dict = set_init_dict(model_dict, checkpoint['model'], c) model_gen.load_state_dict(model_dict) model_dict = model_disc.state_dict() - model_dict = set_init_dict(model_dict, checkpoint["model_disc"], c) + model_dict = set_init_dict(model_dict, checkpoint['model_disc'], c) model_disc.load_state_dict(model_dict) del model_dict # reset lr if not countinuining training. for group in optimizer_gen.param_groups: - group["lr"] = c.lr_gen + group['lr'] = c.lr_gen for group in optimizer_disc.param_groups: - group["lr"] = c.lr_disc + group['lr'] = c.lr_disc - print(" > Model restored from step %d" % checkpoint["step"], flush=True) - args.restore_step = checkpoint["step"] + print(" > Model restored from step %d" % checkpoint['step'], + flush=True) + args.restore_step = checkpoint['step'] else: args.restore_step = 0 @@ -566,92 +540,74 @@ def main(args): # pylint: disable=redefined-outer-name num_params = count_parameters(model_disc) print(" > Discriminator has {} parameters".format(num_params), flush=True) - if "best_loss" not in locals(): - best_loss = float("inf") + if 'best_loss' not in locals(): + best_loss = float('inf') global_step = args.restore_step for epoch in range(0, c.epochs): c_logger.print_epoch_start(epoch, c.epochs) - _, global_step = train( - model_gen, - criterion_gen, - optimizer_gen, - model_disc, - criterion_disc, - optimizer_disc, - scheduler_gen, - scheduler_disc, - ap, - global_step, - epoch, - ) - eval_avg_loss_dict = evaluate( - model_gen, criterion_gen, model_disc, criterion_disc, ap, global_step, epoch - ) + _, global_step = train(model_gen, criterion_gen, optimizer_gen, + model_disc, criterion_disc, optimizer_disc, + scheduler_gen, scheduler_disc, ap, global_step, + epoch) + eval_avg_loss_dict = evaluate(model_gen, criterion_gen, model_disc, criterion_disc, ap, + global_step, epoch) c_logger.print_epoch_end(epoch, eval_avg_loss_dict) target_loss = eval_avg_loss_dict[c.target_loss] - best_loss = save_best_model( - target_loss, - best_loss, - model_gen, - optimizer_gen, - scheduler_gen, - model_disc, - optimizer_disc, - scheduler_disc, - global_step, - epoch, - OUT_PATH, - model_losses=eval_avg_loss_dict, - ) + best_loss = save_best_model(target_loss, + best_loss, + model_gen, + optimizer_gen, + scheduler_gen, + model_disc, + optimizer_disc, + scheduler_disc, + global_step, + epoch, + OUT_PATH, + model_losses=eval_avg_loss_dict) -if __name__ == "__main__": +if __name__ == '__main__': parser = argparse.ArgumentParser() 
parser.add_argument( - "--continue_path", + '--continue_path', type=str, help='Training output folder to continue training. Use to continue a training. If it is used, "config_path" is ignored.', - default="", - required="--config_path" not in sys.argv, - ) + default='', + required='--config_path' not in sys.argv) parser.add_argument( - "--restore_path", + '--restore_path', type=str, - help="Model file to be restored. Use to finetune a model.", - default="", - ) - parser.add_argument( - "--config_path", - type=str, - help="Path to config file for training.", - required="--continue_path" not in sys.argv, - ) - parser.add_argument( - "--debug", - type=bool, - default=False, - help="Do not verify commit integrity to run training.", - ) + help='Model file to be restored. Use to finetune a model.', + default='') + parser.add_argument('--config_path', + type=str, + help='Path to config file for training.', + required='--continue_path' not in sys.argv) + parser.add_argument('--debug', + type=bool, + default=False, + help='Do not verify commit integrity to run training.') # DISTRUBUTED parser.add_argument( - "--rank", + '--rank', type=int, default=0, - help="DISTRIBUTED: process rank for distributed training.", - ) - parser.add_argument( - "--group_id", type=str, default="", help="DISTRIBUTED: process group id." - ) + help='DISTRIBUTED: process rank for distributed training.') + parser.add_argument('--group_id', + type=str, + default="", + help='DISTRIBUTED: process group id.') args = parser.parse_args() - if args.continue_path != "": + if args.continue_path != '': args.output_path = args.continue_path - args.config_path = os.path.join(args.continue_path, "config.json") + args.config_path = os.path.join(args.continue_path, 'config.json') list_of_files = glob.glob( - args.continue_path + "/*.pth.tar" - ) # * means all if need specific format then *.csv + args.continue_path + + "/*.pth.tar") # * means all if need specific format then *.csv latest_model_file = max(list_of_files, key=os.path.getctime) args.restore_path = latest_model_file print(f" > Training continues for {args.restore_path}") @@ -662,10 +618,11 @@ if __name__ == "__main__": _ = os.path.dirname(os.path.realpath(__file__)) OUT_PATH = args.continue_path - if args.continue_path == "": - OUT_PATH = create_experiment_folder(c.output_path, c.run_name, args.debug) + if args.continue_path == '': + OUT_PATH = create_experiment_folder(c.output_path, c.run_name, + args.debug) - AUDIO_PATH = os.path.join(OUT_PATH, "test_audios") + AUDIO_PATH = os.path.join(OUT_PATH, 'test_audios') c_logger = ConsoleLogger() @@ -675,17 +632,16 @@ if __name__ == "__main__": if args.restore_path: new_fields["restore_path"] = args.restore_path new_fields["github_branch"] = get_git_branch() - copy_config_file( - args.config_path, os.path.join(OUT_PATH, "config.json"), new_fields - ) + copy_config_file(args.config_path, + os.path.join(OUT_PATH, 'config.json'), new_fields) os.chmod(AUDIO_PATH, 0o775) os.chmod(OUT_PATH, 0o775) LOG_DIR = OUT_PATH - tb_logger = TensorboardLogger(LOG_DIR, model_name="VOCODER") + tb_logger = TensorboardLogger(LOG_DIR, model_name='VOCODER') # write model desc to tensorboard - tb_logger.tb_add_text("model-description", c["run_description"], 0) + tb_logger.tb_add_text('model-description', c['run_description'], 0) try: main(args) @@ -698,4 +654,4 @@ if __name__ == "__main__": except Exception: # pylint: disable=broad-except remove_experiment_folder(OUT_PATH) traceback.print_exc() - sys.exit(1) \ No newline at end of file + sys.exit(1) diff --git 
a/TTS/vocoder/models/wavernn.py b/TTS/vocoder/models/wavernn.py index 9b637a6a..4d1a633c 100644 --- a/TTS/vocoder/models/wavernn.py +++ b/TTS/vocoder/models/wavernn.py @@ -365,28 +365,6 @@ class WaveRNN(nn.Module): (i * b_size, seq_len * b_size, b_size, gen_rate, realtime_ratio), ) - @staticmethod - def get_gru_cell(gru): - gru_cell = nn.GRUCell(gru.input_size, gru.hidden_size) - gru_cell.weight_hh.data = gru.weight_hh_l0.data - gru_cell.weight_ih.data = gru.weight_ih_l0.data - gru_cell.bias_hh.data = gru.bias_hh_l0.data - gru_cell.bias_ih.data = gru.bias_ih_l0.data - return gru_cell - - @staticmethod - def pad_tensor(x, pad, side="both"): - # NB - this is just a quick method i need right now - # i.e., it won't generalise to other shapes/dims - b, t, c = x.size() - total = t + 2 * pad if side == "both" else t + pad - padded = torch.zeros(b, total, c).cuda() - if side in ("before", "both"): - padded[:, pad : pad + t, :] = x - elif side == "after": - padded[:, :t, :] = x - return padded - def fold_with_overlap(self, x, target, overlap): """Fold the tensor with overlap for quick batched inference. @@ -430,7 +408,30 @@ class WaveRNN(nn.Module): return folded - def xfade_and_unfold(self, y, target, overlap): + @staticmethod + def get_gru_cell(gru): + gru_cell = nn.GRUCell(gru.input_size, gru.hidden_size) + gru_cell.weight_hh.data = gru.weight_hh_l0.data + gru_cell.weight_ih.data = gru.weight_ih_l0.data + gru_cell.bias_hh.data = gru.bias_hh_l0.data + gru_cell.bias_ih.data = gru.bias_ih_l0.data + return gru_cell + + @staticmethod + def pad_tensor(x, pad, side="both"): + # NB - this is just a quick method i need right now + # i.e., it won't generalise to other shapes/dims + b, t, c = x.size() + total = t + 2 * pad if side == "both" else t + pad + padded = torch.zeros(b, total, c).cuda() + if side in ("before", "both"): + padded[:, pad : pad + t, :] = x + elif side == "after": + padded[:, :t, :] = x + return padded + + @staticmethod + def xfade_and_unfold(y, target, overlap): """Applies a crossfade and unfolds into a 1d array. Args: diff --git a/TTS/vocoder/utils/distribution.py b/TTS/vocoder/utils/distribution.py index 705c14dc..6aba5e34 100644 --- a/TTS/vocoder/utils/distribution.py +++ b/TTS/vocoder/utils/distribution.py @@ -28,7 +28,8 @@ def sample_from_gaussian(y_hat, log_std_min=-7.0, scale_factor=1.0): torch.exp(log_std), ) sample = dist.sample() - sample = torch.clamp(torch.clamp(sample, min=-scale_factor), max=scale_factor) + sample = torch.clamp(torch.clamp( + sample, min=-scale_factor), max=scale_factor) del dist return sample @@ -58,8 +59,9 @@ def discretized_mix_logistic_loss( # unpack parameters. 
(B, T, num_mixtures) x 3 logit_probs = y_hat[:, :, :nr_mix] - means = y_hat[:, :, nr_mix : 2 * nr_mix] - log_scales = torch.clamp(y_hat[:, :, 2 * nr_mix : 3 * nr_mix], min=log_scale_min) + means = y_hat[:, :, nr_mix: 2 * nr_mix] + log_scales = torch.clamp( + y_hat[:, :, 2 * nr_mix: 3 * nr_mix], min=log_scale_min) # B x T x 1 -> B x T x num_mixtures y = y.expand_as(means) @@ -104,7 +106,8 @@ def discretized_mix_logistic_loss( ) + (1.0 - inner_inner_cond) * (log_pdf_mid - np.log((num_classes - 1) / 2)) inner_cond = (y > 0.999).float() inner_out = ( - inner_cond * log_one_minus_cdf_min + (1.0 - inner_cond) * inner_inner_out + inner_cond * log_one_minus_cdf_min + + (1.0 - inner_cond) * inner_inner_out ) cond = (y < -0.999).float() log_probs = cond * log_cdf_plus + (1.0 - cond) * inner_out @@ -142,9 +145,9 @@ def sample_from_discretized_mix_logistic(y, log_scale_min=None): # (B, T) -> (B, T, nr_mix) one_hot = to_one_hot(argmax, nr_mix) # select logistic parameters - means = torch.sum(y[:, :, nr_mix : 2 * nr_mix] * one_hot, dim=-1) + means = torch.sum(y[:, :, nr_mix: 2 * nr_mix] * one_hot, dim=-1) log_scales = torch.clamp( - torch.sum(y[:, :, 2 * nr_mix : 3 * nr_mix] * one_hot, dim=-1), min=log_scale_min + torch.sum(y[:, :, 2 * nr_mix: 3 * nr_mix] * one_hot, dim=-1), min=log_scale_min ) # sample from logistic & clip to interval # we don't actually round to the nearest 8bit value when sampling diff --git a/TTS/vocoder/utils/generic_utils.py b/TTS/vocoder/utils/generic_utils.py index c73c5248..c16fa1ae 100644 --- a/TTS/vocoder/utils/generic_utils.py +++ b/TTS/vocoder/utils/generic_utils.py @@ -39,7 +39,7 @@ def plot_results(y_hat, y, ap, global_step, name_prefix): def to_camel(text): text = text.capitalize() - return re.sub(r"(?!^)_([a-zA-Z])", lambda m: m.group(1).upper(), text) + return re.sub(r'(?!^)_([a-zA-Z])', lambda m: m.group(1).upper(), text) def setup_wavernn(c): @@ -67,101 +67,92 @@ def setup_wavernn(c): def setup_generator(c): print(" > Generator Model: {}".format(c.generator_model)) - MyModel = importlib.import_module("TTS.vocoder.models." + c.generator_model.lower()) + MyModel = importlib.import_module('TTS.vocoder.models.' 
+ + c.generator_model.lower()) MyModel = getattr(MyModel, to_camel(c.generator_model)) - if c.generator_model in "melgan_generator": + if c.generator_model in 'melgan_generator': model = MyModel( - in_channels=c.audio["num_mels"], + in_channels=c.audio['num_mels'], out_channels=1, proj_kernel=7, base_channels=512, - upsample_factors=c.generator_model_params["upsample_factors"], + upsample_factors=c.generator_model_params['upsample_factors'], res_kernel=3, - num_res_blocks=c.generator_model_params["num_res_blocks"], - ) - if c.generator_model in "melgan_fb_generator": + num_res_blocks=c.generator_model_params['num_res_blocks']) + if c.generator_model in 'melgan_fb_generator': pass - if c.generator_model in "multiband_melgan_generator": + if c.generator_model in 'multiband_melgan_generator': model = MyModel( - in_channels=c.audio["num_mels"], + in_channels=c.audio['num_mels'], out_channels=4, proj_kernel=7, base_channels=384, - upsample_factors=c.generator_model_params["upsample_factors"], + upsample_factors=c.generator_model_params['upsample_factors'], res_kernel=3, - num_res_blocks=c.generator_model_params["num_res_blocks"], - ) - if c.generator_model in "fullband_melgan_generator": + num_res_blocks=c.generator_model_params['num_res_blocks']) + if c.generator_model in 'fullband_melgan_generator': model = MyModel( - in_channels=c.audio["num_mels"], + in_channels=c.audio['num_mels'], out_channels=1, proj_kernel=7, base_channels=512, - upsample_factors=c.generator_model_params["upsample_factors"], + upsample_factors=c.generator_model_params['upsample_factors'], res_kernel=3, - num_res_blocks=c.generator_model_params["num_res_blocks"], - ) - if c.generator_model in "parallel_wavegan_generator": + num_res_blocks=c.generator_model_params['num_res_blocks']) + if c.generator_model in 'parallel_wavegan_generator': model = MyModel( in_channels=1, out_channels=1, kernel_size=3, - num_res_blocks=c.generator_model_params["num_res_blocks"], - stacks=c.generator_model_params["stacks"], + num_res_blocks=c.generator_model_params['num_res_blocks'], + stacks=c.generator_model_params['stacks'], res_channels=64, gate_channels=128, skip_channels=64, - aux_channels=c.audio["num_mels"], + aux_channels=c.audio['num_mels'], dropout=0.0, bias=True, use_weight_norm=True, - upsample_factors=c.generator_model_params["upsample_factors"], - ) + upsample_factors=c.generator_model_params['upsample_factors']) return model def setup_discriminator(c): print(" > Discriminator Model: {}".format(c.discriminator_model)) - if "parallel_wavegan" in c.discriminator_model: + if 'parallel_wavegan' in c.discriminator_model: MyModel = importlib.import_module( - "TTS.vocoder.models.parallel_wavegan_discriminator" - ) + 'TTS.vocoder.models.parallel_wavegan_discriminator') else: - MyModel = importlib.import_module( - "TTS.vocoder.models." + c.discriminator_model.lower() - ) + MyModel = importlib.import_module('TTS.vocoder.models.' 
+ + c.discriminator_model.lower()) MyModel = getattr(MyModel, to_camel(c.discriminator_model.lower())) - if c.discriminator_model in "random_window_discriminator": + if c.discriminator_model in 'random_window_discriminator': model = MyModel( - cond_channels=c.audio["num_mels"], - hop_length=c.audio["hop_length"], - uncond_disc_donwsample_factors=c.discriminator_model_params[ - "uncond_disc_donwsample_factors" - ], - cond_disc_downsample_factors=c.discriminator_model_params[ - "cond_disc_downsample_factors" - ], - cond_disc_out_channels=c.discriminator_model_params[ - "cond_disc_out_channels" - ], - window_sizes=c.discriminator_model_params["window_sizes"], - ) - if c.discriminator_model in "melgan_multiscale_discriminator": + cond_channels=c.audio['num_mels'], + hop_length=c.audio['hop_length'], + uncond_disc_donwsample_factors=c. + discriminator_model_params['uncond_disc_donwsample_factors'], + cond_disc_downsample_factors=c. + discriminator_model_params['cond_disc_downsample_factors'], + cond_disc_out_channels=c. + discriminator_model_params['cond_disc_out_channels'], + window_sizes=c.discriminator_model_params['window_sizes']) + if c.discriminator_model in 'melgan_multiscale_discriminator': model = MyModel( in_channels=1, out_channels=1, kernel_sizes=(5, 3), - base_channels=c.discriminator_model_params["base_channels"], - max_channels=c.discriminator_model_params["max_channels"], - downsample_factors=c.discriminator_model_params["downsample_factors"], - ) - if c.discriminator_model == "residual_parallel_wavegan_discriminator": + base_channels=c.discriminator_model_params['base_channels'], + max_channels=c.discriminator_model_params['max_channels'], + downsample_factors=c. + discriminator_model_params['downsample_factors']) + if c.discriminator_model == 'residual_parallel_wavegan_discriminator': model = MyModel( in_channels=1, out_channels=1, kernel_size=3, - num_layers=c.discriminator_model_params["num_layers"], - stacks=c.discriminator_model_params["stacks"], + num_layers=c.discriminator_model_params['num_layers'], + stacks=c.discriminator_model_params['stacks'], res_channels=64, gate_channels=128, skip_channels=64, @@ -170,17 +161,17 @@ def setup_discriminator(c): nonlinear_activation="LeakyReLU", nonlinear_activation_params={"negative_slope": 0.2}, ) - if c.discriminator_model == "parallel_wavegan_discriminator": + if c.discriminator_model == 'parallel_wavegan_discriminator': model = MyModel( in_channels=1, out_channels=1, kernel_size=3, - num_layers=c.discriminator_model_params["num_layers"], + num_layers=c.discriminator_model_params['num_layers'], conv_channels=64, dilation_factor=1, nonlinear_activation="LeakyReLU", nonlinear_activation_params={"negative_slope": 0.2}, - bias=True, + bias=True ) return model From ea9d8755defd9c1b86b4e3925b1133d8340bd238 Mon Sep 17 00:00:00 2001 From: sanjaesc Date: Thu, 22 Oct 2020 10:39:20 +0200 Subject: [PATCH 30/98] add wavernn tests + name refactoring --- tests/inputs/test_vocoder_wavernn_config.json | 94 +++++++++++++++++++ ...tasets.py => test_vocoder_gan_datasets.py} | 0 ...der_train.sh => test_vocoder_gan_train.sh} | 4 +- tests/test_vocoder_wavernn.py | 31 ++++++ tests/test_vocoder_wavernn_datasets.py | 91 ++++++++++++++++++ tests/test_vocoder_wavernn_train.sh | 15 +++ 6 files changed, 233 insertions(+), 2 deletions(-) create mode 100644 tests/inputs/test_vocoder_wavernn_config.json rename tests/{test_vocoder_datasets.py => test_vocoder_gan_datasets.py} (100%) rename tests/{test_vocoder_train.sh => test_vocoder_gan_train.sh} (57%) create 
mode 100644 tests/test_vocoder_wavernn.py create mode 100644 tests/test_vocoder_wavernn_datasets.py create mode 100755 tests/test_vocoder_wavernn_train.sh diff --git a/tests/inputs/test_vocoder_wavernn_config.json b/tests/inputs/test_vocoder_wavernn_config.json new file mode 100644 index 00000000..28c0f059 --- /dev/null +++ b/tests/inputs/test_vocoder_wavernn_config.json @@ -0,0 +1,94 @@ +{ + "run_name": "wavernn_test", + "run_description": "wavernn_test training", + + // AUDIO PARAMETERS + "audio":{ + "fft_size": 1024, // number of stft frequency levels. Size of the linear spectogram frame. + "win_length": 1024, // stft window length in ms. + "hop_length": 256, // stft window hop-lengh in ms. + "frame_length_ms": null, // stft window length in ms.If null, 'win_length' is used. + "frame_shift_ms": null, // stft window hop-lengh in ms. If null, 'hop_length' is used. + + // Audio processing parameters + "sample_rate": 22050, // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled. + "preemphasis": 0.0, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis. + "ref_level_db": 0, // reference level db, theoretically 20db is the sound of air. + + // Silence trimming + "do_trim_silence": true,// enable trimming of slience of audio as you load it. LJspeech (false), TWEB (false), Nancy (true) + "trim_db": 60, // threshold for timming silence. Set this according to your dataset. + + // MelSpectrogram parameters + "num_mels": 80, // size of the mel spec frame. + "mel_fmin": 0.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!! + "mel_fmax": 8000.0, // maximum freq level for mel-spec. Tune for dataset!! + "spec_gain": 20.0, // scaler value appplied after log transform of spectrogram. + + // Normalization parameters + "signal_norm": true, // normalize spec values. Mean-Var normalization if 'stats_path' is defined otherwise range normalization defined by the other params. + "min_level_db": -100, // lower bound for normalization + "symmetric_norm": true, // move normalization to range [-1, 1] + "max_norm": 4.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm] + "clip_norm": true, // clip normalized values into the range. + "stats_path": null // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. 
If it is defined, mean-std based notmalization is used and other normalization params are ignored + }, + + // Generating / Synthesizing + "batched": true, + "target_samples": 11000, // target number of samples to be generated in each batch entry + "overlap_samples": 550, // number of samples for crossfading between batches + + // DISTRIBUTED TRAINING + // "distributed":{ + // "backend": "nccl", + // "url": "tcp:\/\/localhost:54321" + // }, + + // MODEL PARAMETERS + "use_aux_net": true, + "use_upsample_net": true, + "upsample_factors": [4, 8, 8], // this needs to correctly factorise hop_length + "seq_len": 1280, // has to be devideable by hop_length + "mode": "mold", // mold [string], gauss [string], bits [int] + "mulaw": false, // apply mulaw if mode is bits + "padding": 2, // pad the input for resnet to see wider input length + + // DATASET + //"use_gta": true, // use computed gta features from the tts model + "data_path": "tests/data/ljspeech/wavs/", // path containing training wav files + "feature_path": null, // path containing computed features from wav files if null compute them + + // TRAINING + "batch_size": 4, // Batch size for training. Lower values than 32 might cause hard to learn attention. + "epochs": 1, // total number of epochs to train. + + // VALIDATION + "run_eval": true, + "test_every_epochs": 10, // Test after set number of epochs (Test every 20 epochs for example) + + // OPTIMIZER + "grad_clip": 4, // apply gradient clipping if > 0 + "lr_scheduler": "MultiStepLR", // one of the schedulers from https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate + "lr_scheduler_params": { + "gamma": 0.5, + "milestones": [200000, 400000, 600000] + }, + "lr": 1e-4, // initial learning rate + + // TENSORBOARD and LOGGING + "print_step": 25, // Number of steps to log traning on console. + "print_eval": false, // If True, it prints loss values for each step in eval run. + "save_step": 25000, // Number of training steps expected to plot training stats on TB and save model checkpoints. + "checkpoint": true, // If true, it saves checkpoints per "save_step" + "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging. + + // DATA LOADING + "num_loader_workers": 4, // number of training data loader processes. Don't set it too big. 4-8 are good values. + "num_val_loader_workers": 4, // number of evaluation data loader processes. 
+ "eval_split_size": 10, // number of samples for testing + + // PATHS + "output_path": "tests/train_outputs/" +} + diff --git a/tests/test_vocoder_datasets.py b/tests/test_vocoder_gan_datasets.py similarity index 100% rename from tests/test_vocoder_datasets.py rename to tests/test_vocoder_gan_datasets.py diff --git a/tests/test_vocoder_train.sh b/tests/test_vocoder_gan_train.sh similarity index 57% rename from tests/test_vocoder_train.sh rename to tests/test_vocoder_gan_train.sh index fa99b4bd..75773cc3 100755 --- a/tests/test_vocoder_train.sh +++ b/tests/test_vocoder_gan_train.sh @@ -5,11 +5,11 @@ echo "$BASEDIR" # create run dir mkdir $BASEDIR/train_outputs # run training -CUDA_VISIBLE_DEVICES="" python TTS/bin/train_vocoder.py --config_path $BASEDIR/inputs/test_vocoder_multiband_melgan_config.json +CUDA_VISIBLE_DEVICES="" python TTS/bin/train_gan_vocoder.py --config_path $BASEDIR/inputs/test_vocoder_multiband_melgan_config.json # find the training folder LATEST_FOLDER=$(ls $BASEDIR/train_outputs/| sort | tail -1) echo $LATEST_FOLDER # continue the previous training -CUDA_VISIBLE_DEVICES="" python TTS/bin/train_vocoder.py --continue_path $BASEDIR/train_outputs/$LATEST_FOLDER +CUDA_VISIBLE_DEVICES="" python TTS/bin/train_gan_vocoder.py --continue_path $BASEDIR/train_outputs/$LATEST_FOLDER # remove all the outputs rm -rf $BASEDIR/train_outputs/$LATEST_FOLDER diff --git a/tests/test_vocoder_wavernn.py b/tests/test_vocoder_wavernn.py new file mode 100644 index 00000000..fdb338f9 --- /dev/null +++ b/tests/test_vocoder_wavernn.py @@ -0,0 +1,31 @@ +import numpy as np +import torch +import random +from TTS.vocoder.models.wavernn import WaveRNN + + +def test_wavernn(): + model = WaveRNN( + rnn_dims=512, + fc_dims=512, + mode=10, + mulaw=False, + pad=2, + use_aux_net=True, + use_upsample_net=True, + upsample_factors=[4, 8, 8], + feat_dims=80, + compute_dims=128, + res_out_dims=128, + res_blocks=10, + hop_length=256, + sample_rate=22050, + ) + dummy_x = torch.rand((2, 1280)) + dummy_m = torch.rand((2, 80, 9)) + y_size = random.randrange(20, 60) + dummy_y = torch.rand((80, y_size)) + output = model(dummy_x, dummy_m) + assert np.all(output.shape == (2, 1280, 4 * 256)), output.shape + output = model.generate(dummy_y, True, 5500, 550, False) + assert np.all(output.shape == (256 * (y_size - 1),)) diff --git a/tests/test_vocoder_wavernn_datasets.py b/tests/test_vocoder_wavernn_datasets.py new file mode 100644 index 00000000..0f4e939a --- /dev/null +++ b/tests/test_vocoder_wavernn_datasets.py @@ -0,0 +1,91 @@ +import os +import shutil + +import numpy as np +from tests import get_tests_path, get_tests_input_path, get_tests_output_path +from torch.utils.data import DataLoader + +from TTS.utils.audio import AudioProcessor +from TTS.utils.io import load_config +from TTS.vocoder.datasets.wavernn_dataset import WaveRNNDataset +from TTS.vocoder.datasets.preprocess import load_wav_feat_data, preprocess_wav_files + +file_path = os.path.dirname(os.path.realpath(__file__)) +OUTPATH = os.path.join(get_tests_output_path(), "loader_tests/") +os.makedirs(OUTPATH, exist_ok=True) + +C = load_config(os.path.join(get_tests_input_path(), + "test_vocoder_wavernn_config.json")) + +test_data_path = os.path.join(get_tests_path(), "data/ljspeech/") +test_mel_feat_path = os.path.join(test_data_path, "mel") +test_quant_feat_path = os.path.join(test_data_path, "quant") +ok_ljspeech = os.path.exists(test_data_path) + + +def wavernn_dataset_case(batch_size, seq_len, hop_len, pad, mode, num_workers): + """ run dataloader with given 
parameters and check conditions """ + ap = AudioProcessor(**C.audio) + + C.batch_size = batch_size + C.mode = mode + C.seq_len = seq_len + C.data_path = test_data_path + + preprocess_wav_files(test_data_path, C, ap) + _, train_items = load_wav_feat_data( + test_data_path, test_mel_feat_path, 5) + + dataset = WaveRNNDataset(ap=ap, + items=train_items, + seq_len=seq_len, + hop_len=hop_len, + pad=pad, + mode=mode, + ) + # sampler = DistributedSampler(dataset) if num_gpus > 1 else None + loader = DataLoader(dataset, + shuffle=True, + collate_fn=dataset.collate, + batch_size=batch_size, + num_workers=num_workers, + pin_memory=True, + ) + + max_iter = 10 + count_iter = 0 + + try: + for data in loader: + x_input, mels, _ = data + expected_feat_shape = (ap.num_mels, + (x_input.shape[-1] // hop_len) + (pad * 2)) + assert np.all( + mels.shape[1:] == expected_feat_shape), f" [!] {mels.shape} vs {expected_feat_shape}" + + assert (mels.shape[2] - pad * 2) * hop_len == x_input.shape[1] + count_iter += 1 + if count_iter == max_iter: + break + # except AssertionError: + # shutil.rmtree(test_mel_feat_path) + # shutil.rmtree(test_quant_feat_path) + finally: + shutil.rmtree(test_mel_feat_path) + shutil.rmtree(test_quant_feat_path) + + +def test_parametrized_wavernn_dataset(): + ''' test dataloader with different parameters ''' + params = [ + [16, C.audio['hop_length'] * 10, C.audio['hop_length'], 2, 10, 0], + [16, C.audio['hop_length'] * 10, C.audio['hop_length'], 2, "mold", 4], + [1, C.audio['hop_length'] * 10, C.audio['hop_length'], 2, 9, 0], + [1, C.audio['hop_length'], C.audio['hop_length'], 2, 10, 0], + [1, C.audio['hop_length'], C.audio['hop_length'], 2, "mold", 0], + [1, C.audio['hop_length'] * 5, C.audio['hop_length'], 4, 10, 2], + [1, C.audio['hop_length'] * 5, C.audio['hop_length'], 2, "mold", 0], + ] + for param in params: + print(param) + wavernn_dataset_case(*param) diff --git a/tests/test_vocoder_wavernn_train.sh b/tests/test_vocoder_wavernn_train.sh new file mode 100755 index 00000000..f2e32116 --- /dev/null +++ b/tests/test_vocoder_wavernn_train.sh @@ -0,0 +1,15 @@ +#!/usr/bin/env bash + +BASEDIR=$(dirname "$0") +echo "$BASEDIR" +# create run dir +mkdir $BASEDIR/train_outputs +# run training +CUDA_VISIBLE_DEVICES="" python TTS/bin/train_wavernn_vocoder.py --config_path $BASEDIR/inputs/test_vocoder_wavernn_config.json +# find the training folder +LATEST_FOLDER=$(ls $BASEDIR/train_outputs/| sort | tail -1) +echo $LATEST_FOLDER +# continue the previous training +CUDA_VISIBLE_DEVICES="" python TTS/bin/train_wavernn_vocoder.py --continue_path $BASEDIR/train_outputs/$LATEST_FOLDER +# remove all the outputs +rm -rf $BASEDIR/train_outputs/$LATEST_FOLDER \ No newline at end of file From 91e5f8b63dc9f1ba3eed123ce462b1000569e8b1 Mon Sep 17 00:00:00 2001 From: sanjaesc Date: Thu, 22 Oct 2020 10:44:00 +0200 Subject: [PATCH 31/98] added to device cpu/gpu + formatting --- TTS/bin/train_wavernn_vocoder.py | 182 ++++++++++++------------ TTS/vocoder/datasets/wavernn_dataset.py | 34 ++--- TTS/vocoder/models/wavernn.py | 66 +++++---- 3 files changed, 145 insertions(+), 137 deletions(-) diff --git a/TTS/bin/train_wavernn_vocoder.py b/TTS/bin/train_wavernn_vocoder.py index 78984510..66a7c913 100644 --- a/TTS/bin/train_wavernn_vocoder.py +++ b/TTS/bin/train_wavernn_vocoder.py @@ -44,43 +44,41 @@ def setup_loader(ap, is_val=False, verbose=False): if is_val and not CONFIG.run_eval: loader = None else: - dataset = WaveRNNDataset( - ap=ap, - items=eval_data if is_val else train_data, - seq_len=CONFIG.seq_len, - 
hop_len=ap.hop_length, - pad=CONFIG.padding, - mode=CONFIG.mode, - is_training=not is_val, - verbose=verbose, - ) + dataset = WaveRNNDataset(ap=ap, + items=eval_data if is_val else train_data, + seq_len=CONFIG.seq_len, + hop_len=ap.hop_length, + pad=CONFIG.padding, + mode=CONFIG.mode, + is_training=not is_val, + verbose=verbose, + ) # sampler = DistributedSampler(dataset) if num_gpus > 1 else None - loader = DataLoader( - dataset, - shuffle=True, - collate_fn=dataset.collate, - batch_size=CONFIG.batch_size, - num_workers=CONFIG.num_val_loader_workers - if is_val - else CONFIG.num_loader_workers, - pin_memory=True, - ) + loader = DataLoader(dataset, + shuffle=True, + collate_fn=dataset.collate, + batch_size=CONFIG.batch_size, + num_workers=CONFIG.num_val_loader_workers + if is_val + else CONFIG.num_loader_workers, + pin_memory=True, + ) return loader def format_data(data): # setup input data - x = data[0] - m = data[1] - y = data[2] + x_input = data[0] + mels = data[1] + y_coarse = data[2] # dispatch data to GPU if use_cuda: - x = x.cuda(non_blocking=True) - m = m.cuda(non_blocking=True) - y = y.cuda(non_blocking=True) + x_input = x_input.cuda(non_blocking=True) + mels = mels.cuda(non_blocking=True) + y_coarse = y_coarse.cuda(non_blocking=True) - return x, m, y + return x_input, mels, y_coarse def train(model, optimizer, criterion, scheduler, ap, global_step, epoch): @@ -90,7 +88,8 @@ def train(model, optimizer, criterion, scheduler, ap, global_step, epoch): epoch_time = 0 keep_avg = KeepAverage() if use_cuda: - batch_n_iter = int(len(data_loader.dataset) / (CONFIG.batch_size * num_gpus)) + batch_n_iter = int(len(data_loader.dataset) / + (CONFIG.batch_size * num_gpus)) else: batch_n_iter = int(len(data_loader.dataset) / CONFIG.batch_size) end_time = time.time() @@ -99,30 +98,31 @@ def train(model, optimizer, criterion, scheduler, ap, global_step, epoch): print(" > Training", flush=True) for num_iter, data in enumerate(data_loader): start_time = time.time() - x, m, y = format_data(data) + x_input, mels, y_coarse = format_data(data) loader_time = time.time() - end_time global_step += 1 ################## # MODEL TRAINING # ################## - y_hat = model(x, m) + y_hat = model(x_input, mels) if isinstance(model.mode, int): y_hat = y_hat.transpose(1, 2).unsqueeze(-1) else: - y = y.float() - y = y.unsqueeze(-1) + y_coarse = y_coarse.float() + y_coarse = y_coarse.unsqueeze(-1) # m_scaled, _ = model.upsample(m) # compute losses - loss = criterion(y_hat, y) + loss = criterion(y_hat, y_coarse) if loss.item() is None: raise RuntimeError(" [!] None loss. 
Exiting ...") optimizer.zero_grad() loss.backward() if CONFIG.grad_clip > 0: - torch.nn.utils.clip_grad_norm_(model.parameters(), CONFIG.grad_clip) + torch.nn.utils.clip_grad_norm_( + model.parameters(), CONFIG.grad_clip) optimizer.step() if scheduler is not None: @@ -145,19 +145,17 @@ def train(model, optimizer, criterion, scheduler, ap, global_step, epoch): # print training stats if global_step % CONFIG.print_step == 0: - log_dict = { - "step_time": [step_time, 2], - "loader_time": [loader_time, 4], - "current_lr": cur_lr, - } - c_logger.print_train_step( - batch_n_iter, - num_iter, - global_step, - log_dict, - loss_dict, - keep_avg.avg_values, - ) + log_dict = {"step_time": [step_time, 2], + "loader_time": [loader_time, 4], + "current_lr": cur_lr, + } + c_logger.print_train_step(batch_n_iter, + num_iter, + global_step, + log_dict, + loss_dict, + keep_avg.avg_values, + ) # plot step stats if global_step % 10 == 0: @@ -169,40 +167,38 @@ def train(model, optimizer, criterion, scheduler, ap, global_step, epoch): if global_step % CONFIG.save_step == 0: if CONFIG.checkpoint: # save model - save_checkpoint( - model, - optimizer, - scheduler, - None, - None, - None, - global_step, - epoch, - OUT_PATH, - model_losses=loss_dict, - ) + save_checkpoint(model, + optimizer, + scheduler, + None, + None, + None, + global_step, + epoch, + OUT_PATH, + model_losses=loss_dict, + ) # synthesize a full voice wav_path = train_data[random.randrange(0, len(train_data))][0] wav = ap.load_wav(wav_path) ground_mel = ap.melspectrogram(wav) - sample_wav = model.generate( - ground_mel, - CONFIG.batched, - CONFIG.target_samples, - CONFIG.overlap_samples, - ) + sample_wav = model.generate(ground_mel, + CONFIG.batched, + CONFIG.target_samples, + CONFIG.overlap_samples, + ) predict_mel = ap.melspectrogram(sample_wav) # compute spectrograms - figures = { - "train/ground_truth": plot_spectrogram(ground_mel.T), - "train/prediction": plot_spectrogram(predict_mel.T), - } + figures = {"train/ground_truth": plot_spectrogram(ground_mel.T), + "train/prediction": plot_spectrogram(predict_mel.T), + } # Sample audio tb_logger.tb_train_audios( - global_step, {"train/audio": sample_wav}, CONFIG.audio["sample_rate"] + global_step, { + "train/audio": sample_wav}, CONFIG.audio["sample_rate"] ) tb_logger.tb_train_figures(global_step, figures) @@ -234,17 +230,17 @@ def evaluate(model, criterion, ap, global_step, epoch): for num_iter, data in enumerate(data_loader): start_time = time.time() # format data - x, m, y = format_data(data) + x_input, mels, y_coarse = format_data(data) loader_time = time.time() - end_time global_step += 1 - y_hat = model(x, m) + y_hat = model(x_input, mels) if isinstance(model.mode, int): y_hat = y_hat.transpose(1, 2).unsqueeze(-1) else: - y = y.float() - y = y.unsqueeze(-1) - loss = criterion(y_hat, y) + y_coarse = y_coarse.float() + y_coarse = y_coarse.unsqueeze(-1) + loss = criterion(y_hat, y_coarse) # Compute avg loss # if num_gpus > 1: # loss = reduce_tensor(loss.data, num_gpus) @@ -264,30 +260,31 @@ def evaluate(model, criterion, ap, global_step, epoch): # print eval stats if CONFIG.print_eval: - c_logger.print_eval_step(num_iter, loss_dict, keep_avg.avg_values) + c_logger.print_eval_step( + num_iter, loss_dict, keep_avg.avg_values) - if epoch % CONFIG.test_every_epochs == 0: + if epoch % CONFIG.test_every_epochs == 0 and epoch != 0: # synthesize a part of data wav_path = eval_data[random.randrange(0, len(eval_data))][0] wav = ap.load_wav(wav_path) ground_mel = ap.melspectrogram(wav[:22000]) - sample_wav = 
model.generate( - ground_mel, - CONFIG.batched, - CONFIG.target_samples, - CONFIG.overlap_samples, - ) + sample_wav = model.generate(ground_mel, + CONFIG.batched, + CONFIG.target_samples, + CONFIG.overlap_samples, + use_cuda + ) predict_mel = ap.melspectrogram(sample_wav) # compute spectrograms - figures = { - "eval/ground_truth": plot_spectrogram(ground_mel.T), - "eval/prediction": plot_spectrogram(predict_mel.T), - } + figures = {"eval/ground_truth": plot_spectrogram(ground_mel.T), + "eval/prediction": plot_spectrogram(predict_mel.T), + } # Sample audio tb_logger.tb_eval_audios( - global_step, {"eval/audio": sample_wav}, CONFIG.audio["sample_rate"] + global_step, { + "eval/audio": sample_wav}, CONFIG.audio["sample_rate"] ) tb_logger.tb_eval_figures(global_step, figures) @@ -372,7 +369,8 @@ def main(args): # pylint: disable=redefined-outer-name model_dict = set_init_dict(model_dict, checkpoint["model"], CONFIG) model_wavernn.load_state_dict(model_dict) - print(" > Model restored from step %d" % checkpoint["step"], flush=True) + print(" > Model restored from step %d" % + checkpoint["step"], flush=True) args.restore_step = checkpoint["step"] else: args.restore_step = 0 @@ -393,7 +391,8 @@ def main(args): # pylint: disable=redefined-outer-name _, global_step = train( model_wavernn, optimizer, criterion, scheduler, ap, global_step, epoch ) - eval_avg_loss_dict = evaluate(model_wavernn, criterion, ap, global_step, epoch) + eval_avg_loss_dict = evaluate( + model_wavernn, criterion, ap, global_step, epoch) c_logger.print_epoch_end(epoch, eval_avg_loss_dict) target_loss = eval_avg_loss_dict["avg_model_loss"] best_loss = save_best_model( @@ -493,7 +492,8 @@ if __name__ == "__main__": tb_logger = TensorboardLogger(LOG_DIR, model_name="VOCODER") # write model desc to tensorboard - tb_logger.tb_add_text("model-description", CONFIG["run_description"], 0) + tb_logger.tb_add_text("model-description", + CONFIG["run_description"], 0) try: main(args) diff --git a/TTS/vocoder/datasets/wavernn_dataset.py b/TTS/vocoder/datasets/wavernn_dataset.py index 5d5b9f15..194344a9 100644 --- a/TTS/vocoder/datasets/wavernn_dataset.py +++ b/TTS/vocoder/datasets/wavernn_dataset.py @@ -8,17 +8,16 @@ class WaveRNNDataset(Dataset): WaveRNN Dataset searchs for all the wav files under root path. 
""" - def __init__( - self, - ap, - items, - seq_len, - hop_len, - pad, - mode, - is_training=True, - verbose=False, - ): + def __init__(self, + ap, + items, + seq_len, + hop_len, + pad, + mode, + is_training=True, + verbose=False, + ): self.ap = ap self.item_list = items @@ -56,17 +55,19 @@ class WaveRNNDataset(Dataset): def collate(self, batch): mel_win = self.seq_len // self.hop_len + 2 * self.pad - max_offsets = [x[0].shape[-1] - (mel_win + 2 * self.pad) for x in batch] + max_offsets = [x[0].shape[-1] - + (mel_win + 2 * self.pad) for x in batch] mel_offsets = [np.random.randint(0, offset) for offset in max_offsets] - sig_offsets = [(offset + self.pad) * self.hop_len for offset in mel_offsets] + sig_offsets = [(offset + self.pad) * + self.hop_len for offset in mel_offsets] mels = [ - x[0][:, mel_offsets[i] : mel_offsets[i] + mel_win] + x[0][:, mel_offsets[i]: mel_offsets[i] + mel_win] for i, x in enumerate(batch) ] coarse = [ - x[1][sig_offsets[i] : sig_offsets[i] + self.seq_len + 1] + x[1][sig_offsets[i]: sig_offsets[i] + self.seq_len + 1] for i, x in enumerate(batch) ] @@ -79,7 +80,8 @@ class WaveRNNDataset(Dataset): coarse = np.stack(coarse).astype(np.int64) coarse = torch.LongTensor(coarse) x_input = ( - 2 * coarse[:, : self.seq_len].float() / (2 ** self.mode - 1.0) - 1.0 + 2 * coarse[:, : self.seq_len].float() / + (2 ** self.mode - 1.0) - 1.0 ) y_coarse = coarse[:, 1:] mels = torch.FloatTensor(mels) diff --git a/TTS/vocoder/models/wavernn.py b/TTS/vocoder/models/wavernn.py index 4d1a633c..9b151cac 100644 --- a/TTS/vocoder/models/wavernn.py +++ b/TTS/vocoder/models/wavernn.py @@ -39,7 +39,8 @@ class MelResNet(nn.Module): def __init__(self, res_blocks, in_dims, compute_dims, res_out_dims, pad): super().__init__() k_size = pad * 2 + 1 - self.conv_in = nn.Conv1d(in_dims, compute_dims, kernel_size=k_size, bias=False) + self.conv_in = nn.Conv1d( + in_dims, compute_dims, kernel_size=k_size, bias=False) self.batch_norm = nn.BatchNorm1d(compute_dims) self.layers = nn.ModuleList() for _ in range(res_blocks): @@ -94,7 +95,8 @@ class UpsampleNetwork(nn.Module): k_size = (1, scale * 2 + 1) padding = (0, scale) stretch = Stretch2d(scale, 1) - conv = nn.Conv2d(1, 1, kernel_size=k_size, padding=padding, bias=False) + conv = nn.Conv2d(1, 1, kernel_size=k_size, + padding=padding, bias=False) conv.weight.data.fill_(1.0 / k_size[1]) self.up_layers.append(stretch) self.up_layers.append(conv) @@ -110,7 +112,7 @@ class UpsampleNetwork(nn.Module): m = m.unsqueeze(1) for f in self.up_layers: m = f(m) - m = m.squeeze(1)[:, :, self.indent : -self.indent] + m = m.squeeze(1)[:, :, self.indent: -self.indent] return m.transpose(1, 2), aux @@ -123,7 +125,8 @@ class Upsample(nn.Module): self.pad = pad self.indent = pad * scale self.use_aux_net = use_aux_net - self.resnet = MelResNet(res_blocks, feat_dims, compute_dims, res_out_dims, pad) + self.resnet = MelResNet(res_blocks, feat_dims, + compute_dims, res_out_dims, pad) def forward(self, m): if self.use_aux_net: @@ -137,7 +140,7 @@ class Upsample(nn.Module): m = torch.nn.functional.interpolate( m, scale_factor=self.scale, mode="linear", align_corners=True ) - m = m[:, :, self.indent : -self.indent] + m = m[:, :, self.indent: -self.indent] m = m * 0.045 # empirically found return m.transpose(1, 2), aux @@ -207,7 +210,8 @@ class WaveRNN(nn.Module): if self.use_aux_net: self.I = nn.Linear(feat_dims + self.aux_dims + 1, rnn_dims) self.rnn1 = nn.GRU(rnn_dims, rnn_dims, batch_first=True) - self.rnn2 = nn.GRU(rnn_dims + self.aux_dims, rnn_dims, batch_first=True) + 
self.rnn2 = nn.GRU(rnn_dims + self.aux_dims, + rnn_dims, batch_first=True) self.fc1 = nn.Linear(rnn_dims + self.aux_dims, fc_dims) self.fc2 = nn.Linear(fc_dims + self.aux_dims, fc_dims) self.fc3 = nn.Linear(fc_dims, self.n_classes) @@ -221,16 +225,16 @@ class WaveRNN(nn.Module): def forward(self, x, mels): bsize = x.size(0) - h1 = torch.zeros(1, bsize, self.rnn_dims).cuda() - h2 = torch.zeros(1, bsize, self.rnn_dims).cuda() + h1 = torch.zeros(1, bsize, self.rnn_dims).to(x.device) + h2 = torch.zeros(1, bsize, self.rnn_dims).to(x.device) mels, aux = self.upsample(mels) if self.use_aux_net: aux_idx = [self.aux_dims * i for i in range(5)] - a1 = aux[:, :, aux_idx[0] : aux_idx[1]] - a2 = aux[:, :, aux_idx[1] : aux_idx[2]] - a3 = aux[:, :, aux_idx[2] : aux_idx[3]] - a4 = aux[:, :, aux_idx[3] : aux_idx[4]] + a1 = aux[:, :, aux_idx[0]: aux_idx[1]] + a2 = aux[:, :, aux_idx[1]: aux_idx[2]] + a3 = aux[:, :, aux_idx[2]: aux_idx[3]] + a4 = aux[:, :, aux_idx[3]: aux_idx[4]] x = ( torch.cat([x.unsqueeze(-1), mels, a1], dim=2) @@ -256,19 +260,21 @@ class WaveRNN(nn.Module): x = F.relu(self.fc2(x)) return self.fc3(x) - def generate(self, mels, batched, target, overlap): + def generate(self, mels, batched, target, overlap, use_cuda): self.eval() + device = 'cuda' if use_cuda else 'cpu' output = [] start = time.time() rnn1 = self.get_gru_cell(self.rnn1) rnn2 = self.get_gru_cell(self.rnn2) with torch.no_grad(): - - mels = torch.FloatTensor(mels).cuda().unsqueeze(0) + mels = torch.FloatTensor(mels).unsqueeze(0).to(device) + #mels = torch.FloatTensor(mels).cuda().unsqueeze(0) wave_len = (mels.size(-1) - 1) * self.hop_length - mels = self.pad_tensor(mels.transpose(1, 2), pad=self.pad, side="both") + mels = self.pad_tensor(mels.transpose( + 1, 2), pad=self.pad, side="both") mels, aux = self.upsample(mels.transpose(1, 2)) if batched: @@ -278,13 +284,13 @@ class WaveRNN(nn.Module): b_size, seq_len, _ = mels.size() - h1 = torch.zeros(b_size, self.rnn_dims).cuda() - h2 = torch.zeros(b_size, self.rnn_dims).cuda() - x = torch.zeros(b_size, 1).cuda() + h1 = torch.zeros(b_size, self.rnn_dims).to(device) + h2 = torch.zeros(b_size, self.rnn_dims).to(device) + x = torch.zeros(b_size, 1).to(device) if self.use_aux_net: d = self.aux_dims - aux_split = [aux[:, :, d * i : d * (i + 1)] for i in range(4)] + aux_split = [aux[:, :, d * i: d * (i + 1)] for i in range(4)] for i in range(seq_len): @@ -319,11 +325,12 @@ class WaveRNN(nn.Module): logits.unsqueeze(0).transpose(1, 2) ) output.append(sample.view(-1)) - x = sample.transpose(0, 1).cuda() + x = sample.transpose(0, 1).to(device) elif self.mode == "gauss": - sample = sample_from_gaussian(logits.unsqueeze(0).transpose(1, 2)) + sample = sample_from_gaussian( + logits.unsqueeze(0).transpose(1, 2)) output.append(sample.view(-1)) - x = sample.transpose(0, 1).cuda() + x = sample.transpose(0, 1).to(device) elif isinstance(self.mode, int): posterior = F.softmax(logits, dim=1) distrib = torch.distributions.Categorical(posterior) @@ -332,7 +339,8 @@ class WaveRNN(nn.Module): output.append(sample) x = sample.unsqueeze(-1) else: - raise RuntimeError("Unknown model mode value - ", self.mode) + raise RuntimeError( + "Unknown model mode value - ", self.mode) if i % 100 == 0: self.gen_display(i, seq_len, b_size, start) @@ -352,7 +360,7 @@ class WaveRNN(nn.Module): # Fade-out at the end to avoid signal cutting out suddenly fade_out = np.linspace(1, 0, 20 * self.hop_length) output = output[:wave_len] - output[-20 * self.hop_length :] *= fade_out + output[-20 * self.hop_length:] *= fade_out 
self.train() return output @@ -366,7 +374,6 @@ class WaveRNN(nn.Module): ) def fold_with_overlap(self, x, target, overlap): - """Fold the tensor with overlap for quick batched inference. Overlap will be used for crossfading in xfade_and_unfold() Args: @@ -398,7 +405,7 @@ class WaveRNN(nn.Module): padding = target + 2 * overlap - remaining x = self.pad_tensor(x, padding, side="after") - folded = torch.zeros(num_folds, target + 2 * overlap, features).cuda() + folded = torch.zeros(num_folds, target + 2 * overlap, features).to(x.device) # Get the values for the folded tensor for i in range(num_folds): @@ -423,16 +430,15 @@ class WaveRNN(nn.Module): # i.e., it won't generalise to other shapes/dims b, t, c = x.size() total = t + 2 * pad if side == "both" else t + pad - padded = torch.zeros(b, total, c).cuda() + padded = torch.zeros(b, total, c).to(x.device) if side in ("before", "both"): - padded[:, pad : pad + t, :] = x + padded[:, pad: pad + t, :] = x elif side == "after": padded[:, :t, :] = x return padded @staticmethod def xfade_and_unfold(y, target, overlap): - """Applies a crossfade and unfolds into a 1d array. Args: y (ndarry) : Batched sequences of audio samples From 7c72562fe779261fec4de4161d02a05c7babaa12 Mon Sep 17 00:00:00 2001 From: sanjaesc Date: Thu, 22 Oct 2020 13:22:50 +0200 Subject: [PATCH 32/98] fix travis + pylint tests --- .travis/script | 3 ++- TTS/vocoder/models/wavernn.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/.travis/script b/.travis/script index 0c24a221..0860f9cf 100755 --- a/.travis/script +++ b/.travis/script @@ -17,5 +17,6 @@ fi if [[ "$TEST_SUITE" == "testscripts" ]]; then # test model training scripts ./tests/test_tts_train.sh - ./tests/test_vocoder_train.sh + ./tests/test_vocoder_gan_train.sh + ./tests/test_vocoder_wavernn_train.sh fi diff --git a/TTS/vocoder/models/wavernn.py b/TTS/vocoder/models/wavernn.py index 9b151cac..8a45d9e3 100644 --- a/TTS/vocoder/models/wavernn.py +++ b/TTS/vocoder/models/wavernn.py @@ -225,7 +225,7 @@ class WaveRNN(nn.Module): def forward(self, x, mels): bsize = x.size(0) - h1 = torch.zeros(1, bsize, self.rnn_dims).to(x.device) + h1 = torch.zeros(1, bsize, self.rnn_dims).to(x.device) h2 = torch.zeros(1, bsize, self.rnn_dims).to(x.device) mels, aux = self.upsample(mels) From bef3f2020bfb3c4ab24bc09030792792295824dc Mon Sep 17 00:00:00 2001 From: sanjaesc Date: Sun, 25 Oct 2020 09:45:37 +0100 Subject: [PATCH 33/98] compute audio feat on dataload --- TTS/bin/train_wavernn_vocoder.py | 175 ++++++++++++------------ TTS/vocoder/configs/wavernn_config.json | 143 +++++++++---------- TTS/vocoder/datasets/wavernn_dataset.py | 68 ++++++--- TTS/vocoder/models/wavernn.py | 60 ++++---- 4 files changed, 243 insertions(+), 203 deletions(-) diff --git a/TTS/bin/train_wavernn_vocoder.py b/TTS/bin/train_wavernn_vocoder.py index 66a7c913..91a62cbe 100644 --- a/TTS/bin/train_wavernn_vocoder.py +++ b/TTS/bin/train_wavernn_vocoder.py @@ -29,8 +29,8 @@ from TTS.utils.generic_utils import ( from TTS.vocoder.datasets.wavernn_dataset import WaveRNNDataset from TTS.vocoder.datasets.preprocess import ( find_feat_files, - load_wav_feat_data, - preprocess_wav_files, + load_wav_data, + load_wav_feat_data ) from TTS.vocoder.utils.distribution import discretized_mix_logistic_loss, gaussian_loss from TTS.vocoder.utils.generic_utils import setup_wavernn @@ -41,15 +41,16 @@ use_cuda, num_gpus = setup_torch_training_env(True, True) def setup_loader(ap, is_val=False, verbose=False): - if is_val and not CONFIG.run_eval: + if is_val and not 
c.run_eval: loader = None else: dataset = WaveRNNDataset(ap=ap, items=eval_data if is_val else train_data, - seq_len=CONFIG.seq_len, + seq_len=c.seq_len, hop_len=ap.hop_length, - pad=CONFIG.padding, - mode=CONFIG.mode, + pad=c.padding, + mode=c.mode, + mulaw=c.mulaw, is_training=not is_val, verbose=verbose, ) @@ -57,10 +58,10 @@ def setup_loader(ap, is_val=False, verbose=False): loader = DataLoader(dataset, shuffle=True, collate_fn=dataset.collate, - batch_size=CONFIG.batch_size, - num_workers=CONFIG.num_val_loader_workers + batch_size=c.batch_size, + num_workers=c.num_val_loader_workers if is_val - else CONFIG.num_loader_workers, + else c.num_loader_workers, pin_memory=True, ) return loader @@ -89,9 +90,9 @@ def train(model, optimizer, criterion, scheduler, ap, global_step, epoch): keep_avg = KeepAverage() if use_cuda: batch_n_iter = int(len(data_loader.dataset) / - (CONFIG.batch_size * num_gpus)) + (c.batch_size * num_gpus)) else: - batch_n_iter = int(len(data_loader.dataset) / CONFIG.batch_size) + batch_n_iter = int(len(data_loader.dataset) / c.batch_size) end_time = time.time() c_logger.print_train_start() # train loop @@ -102,9 +103,6 @@ def train(model, optimizer, criterion, scheduler, ap, global_step, epoch): loader_time = time.time() - end_time global_step += 1 - ################## - # MODEL TRAINING # - ################## y_hat = model(x_input, mels) if isinstance(model.mode, int): @@ -112,7 +110,6 @@ def train(model, optimizer, criterion, scheduler, ap, global_step, epoch): else: y_coarse = y_coarse.float() y_coarse = y_coarse.unsqueeze(-1) - # m_scaled, _ = model.upsample(m) # compute losses loss = criterion(y_hat, y_coarse) @@ -120,11 +117,11 @@ def train(model, optimizer, criterion, scheduler, ap, global_step, epoch): raise RuntimeError(" [!] None loss. 
Exiting ...") optimizer.zero_grad() loss.backward() - if CONFIG.grad_clip > 0: + if c.grad_clip > 0: torch.nn.utils.clip_grad_norm_( - model.parameters(), CONFIG.grad_clip) - + model.parameters(), c.grad_clip) optimizer.step() + if scheduler is not None: scheduler.step() @@ -144,7 +141,7 @@ def train(model, optimizer, criterion, scheduler, ap, global_step, epoch): keep_avg.update_values(update_train_values) # print training stats - if global_step % CONFIG.print_step == 0: + if global_step % c.print_step == 0: log_dict = {"step_time": [step_time, 2], "loader_time": [loader_time, 4], "current_lr": cur_lr, @@ -164,8 +161,8 @@ def train(model, optimizer, criterion, scheduler, ap, global_step, epoch): tb_logger.tb_train_iter_stats(global_step, iter_stats) # save checkpoint - if global_step % CONFIG.save_step == 0: - if CONFIG.checkpoint: + if global_step % c.save_step == 0: + if c.checkpoint: # save model save_checkpoint(model, optimizer, @@ -180,28 +177,30 @@ def train(model, optimizer, criterion, scheduler, ap, global_step, epoch): ) # synthesize a full voice - wav_path = train_data[random.randrange(0, len(train_data))][0] + rand_idx = random.randrange(0, len(train_data)) + wav_path = train_data[rand_idx] if not isinstance( + train_data[rand_idx], (tuple, list)) else train_data[rand_idx][0] wav = ap.load_wav(wav_path) ground_mel = ap.melspectrogram(wav) sample_wav = model.generate(ground_mel, - CONFIG.batched, - CONFIG.target_samples, - CONFIG.overlap_samples, + c.batched, + c.target_samples, + c.overlap_samples, + use_cuda ) predict_mel = ap.melspectrogram(sample_wav) # compute spectrograms figures = {"train/ground_truth": plot_spectrogram(ground_mel.T), - "train/prediction": plot_spectrogram(predict_mel.T), + "train/prediction": plot_spectrogram(predict_mel.T) } + tb_logger.tb_train_figures(global_step, figures) # Sample audio tb_logger.tb_train_audios( global_step, { - "train/audio": sample_wav}, CONFIG.audio["sample_rate"] + "train/audio": sample_wav}, c.audio["sample_rate"] ) - - tb_logger.tb_train_figures(global_step, figures) end_time = time.time() # print epoch stats @@ -259,34 +258,35 @@ def evaluate(model, criterion, ap, global_step, epoch): keep_avg.update_values(update_eval_values) # print eval stats - if CONFIG.print_eval: + if c.print_eval: c_logger.print_eval_step( num_iter, loss_dict, keep_avg.avg_values) - if epoch % CONFIG.test_every_epochs == 0 and epoch != 0: - # synthesize a part of data - wav_path = eval_data[random.randrange(0, len(eval_data))][0] + if epoch % c.test_every_epochs == 0 and epoch != 0: + # synthesize a full voice + rand_idx = random.randrange(0, len(eval_data)) + wav_path = eval_data[rand_idx] if not isinstance( + eval_data[rand_idx], (tuple, list)) else eval_data[rand_idx][0] wav = ap.load_wav(wav_path) - ground_mel = ap.melspectrogram(wav[:22000]) + ground_mel = ap.melspectrogram(wav) sample_wav = model.generate(ground_mel, - CONFIG.batched, - CONFIG.target_samples, - CONFIG.overlap_samples, + c.batched, + c.target_samples, + c.overlap_samples, use_cuda ) predict_mel = ap.melspectrogram(sample_wav) - # compute spectrograms - figures = {"eval/ground_truth": plot_spectrogram(ground_mel.T), - "eval/prediction": plot_spectrogram(predict_mel.T), - } - # Sample audio tb_logger.tb_eval_audios( global_step, { - "eval/audio": sample_wav}, CONFIG.audio["sample_rate"] + "eval/audio": sample_wav}, c.audio["sample_rate"] ) + # compute spectrograms + figures = {"eval/ground_truth": plot_spectrogram(ground_mel.T), + "eval/prediction": plot_spectrogram(predict_mel.T) + 
} tb_logger.tb_eval_figures(global_step, figures) tb_logger.tb_eval_stats(global_step, keep_avg.avg_values) @@ -299,53 +299,62 @@ def main(args): # pylint: disable=redefined-outer-name global train_data, eval_data # setup audio processor - ap = AudioProcessor(**CONFIG.audio) + ap = AudioProcessor(**c.audio) - print(f" > Loading wavs from: {CONFIG.data_path}") - if CONFIG.feature_path is not None: - print(f" > Loading features from: {CONFIG.feature_path}") + # print(f" > Loading wavs from: {c.data_path}") + # if c.feature_path is not None: + # print(f" > Loading features from: {c.feature_path}") + # eval_data, train_data = load_wav_feat_data( + # c.data_path, c.feature_path, c.eval_split_size + # ) + # else: + # mel_feat_path = os.path.join(OUT_PATH, "mel") + # feat_data = find_feat_files(mel_feat_path) + # if feat_data: + # print(f" > Loading features from: {mel_feat_path}") + # eval_data, train_data = load_wav_feat_data( + # c.data_path, mel_feat_path, c.eval_split_size + # ) + # else: + # print(" > No feature data found. Preprocessing...") + # # preprocessing feature data from given wav files + # preprocess_wav_files(OUT_PATH, CONFIG, ap) + # eval_data, train_data = load_wav_feat_data( + # c.data_path, mel_feat_path, c.eval_split_size + # ) + + print(f" > Loading wavs from: {c.data_path}") + if c.feature_path is not None: + print(f" > Loading features from: {c.feature_path}") eval_data, train_data = load_wav_feat_data( - CONFIG.data_path, CONFIG.feature_path, CONFIG.eval_split_size - ) + c.data_path, c.feature_path, c.eval_split_size) else: - mel_feat_path = os.path.join(OUT_PATH, "mel") - feat_data = find_feat_files(mel_feat_path) - if feat_data: - print(f" > Loading features from: {mel_feat_path}") - eval_data, train_data = load_wav_feat_data( - CONFIG.data_path, mel_feat_path, CONFIG.eval_split_size - ) - else: - print(" > No feature data found. Preprocessing...") - # preprocessing feature data from given wav files - preprocess_wav_files(OUT_PATH, CONFIG, ap) - eval_data, train_data = load_wav_feat_data( - CONFIG.data_path, mel_feat_path, CONFIG.eval_split_size - ) + eval_data, train_data = load_wav_data( + c.data_path, c.eval_split_size) # setup model - model_wavernn = setup_wavernn(CONFIG) + model_wavernn = setup_wavernn(c) # define train functions - if CONFIG.mode == "mold": + if c.mode == "mold": criterion = discretized_mix_logistic_loss - elif CONFIG.mode == "gauss": + elif c.mode == "gauss": criterion = gaussian_loss - elif isinstance(CONFIG.mode, int): + elif isinstance(c.mode, int): criterion = torch.nn.CrossEntropyLoss() if use_cuda: model_wavernn.cuda() - if isinstance(CONFIG.mode, int): + if isinstance(c.mode, int): criterion.cuda() - optimizer = RAdam(model_wavernn.parameters(), lr=CONFIG.lr, weight_decay=0) + optimizer = RAdam(model_wavernn.parameters(), lr=c.lr, weight_decay=0) scheduler = None - if "lr_scheduler" in CONFIG: - scheduler = getattr(torch.optim.lr_scheduler, CONFIG.lr_scheduler) - scheduler = scheduler(optimizer, **CONFIG.lr_scheduler_params) + if "lr_scheduler" in c: + scheduler = getattr(torch.optim.lr_scheduler, c.lr_scheduler) + scheduler = scheduler(optimizer, **c.lr_scheduler_params) # slow start for the first 5 epochs - # lr_lambda = lambda epoch: min(epoch / CONFIG.warmup_steps, 1) + # lr_lambda = lambda epoch: min(epoch / c.warmup_steps, 1) # scheduler = optim.lr_scheduler.LambdaLR(optimizer, lr_lambda) # restore any checkpoint @@ -366,7 +375,7 @@ def main(args): # pylint: disable=redefined-outer-name # retore only matching layers. 
print(" > Partial model initialization...") model_dict = model_wavernn.state_dict() - model_dict = set_init_dict(model_dict, checkpoint["model"], CONFIG) + model_dict = set_init_dict(model_dict, checkpoint["model"], c) model_wavernn.load_state_dict(model_dict) print(" > Model restored from step %d" % @@ -386,11 +395,10 @@ def main(args): # pylint: disable=redefined-outer-name best_loss = float("inf") global_step = args.restore_step - for epoch in range(0, CONFIG.epochs): - c_logger.print_epoch_start(epoch, CONFIG.epochs) - _, global_step = train( - model_wavernn, optimizer, criterion, scheduler, ap, global_step, epoch - ) + for epoch in range(0, c.epochs): + c_logger.print_epoch_start(epoch, c.epochs) + _, global_step = train(model_wavernn, optimizer, + criterion, scheduler, ap, global_step, epoch) eval_avg_loss_dict = evaluate( model_wavernn, criterion, ap, global_step, epoch) c_logger.print_epoch_end(epoch, eval_avg_loss_dict) @@ -462,14 +470,14 @@ if __name__ == "__main__": print(f" > Training continues for {args.restore_path}") # setup output paths and read configs - CONFIG = load_config(args.config_path) + c = load_config(args.config_path) # check_config(c) _ = os.path.dirname(os.path.realpath(__file__)) OUT_PATH = args.continue_path if args.continue_path == "": OUT_PATH = create_experiment_folder( - CONFIG.output_path, CONFIG.run_name, args.debug + c.output_path, c.run_name, args.debug ) AUDIO_PATH = os.path.join(OUT_PATH, "test_audios") @@ -483,7 +491,7 @@ if __name__ == "__main__": new_fields["restore_path"] = args.restore_path new_fields["github_branch"] = get_git_branch() copy_config_file( - args.config_path, os.path.join(OUT_PATH, "config.json"), new_fields + args.config_path, os.path.join(OUT_PATH, "c.json"), new_fields ) os.chmod(AUDIO_PATH, 0o775) os.chmod(OUT_PATH, 0o775) @@ -492,8 +500,7 @@ if __name__ == "__main__": tb_logger = TensorboardLogger(LOG_DIR, model_name="VOCODER") # write model desc to tensorboard - tb_logger.tb_add_text("model-description", - CONFIG["run_description"], 0) + tb_logger.tb_add_text("model-description", c["run_description"], 0) try: main(args) diff --git a/TTS/vocoder/configs/wavernn_config.json b/TTS/vocoder/configs/wavernn_config.json index 8e6a8c32..9a9fbdae 100644 --- a/TTS/vocoder/configs/wavernn_config.json +++ b/TTS/vocoder/configs/wavernn_config.json @@ -1,94 +1,97 @@ { "run_name": "wavernn_test", "run_description": "wavernn_test training", - - // AUDIO PARAMETERS - "audio":{ - "fft_size": 1024, // number of stft frequency levels. Size of the linear spectogram frame. - "win_length": 1024, // stft window length in ms. - "hop_length": 256, // stft window hop-lengh in ms. + +// AUDIO PARAMETERS + "audio": { + "fft_size": 1024, // number of stft frequency levels. Size of the linear spectogram frame. + "win_length": 1024, // stft window length in ms. + "hop_length": 256, // stft window hop-lengh in ms. "frame_length_ms": null, // stft window length in ms.If null, 'win_length' is used. - "frame_shift_ms": null, // stft window hop-lengh in ms. If null, 'hop_length' is used. - + "frame_shift_ms": null, // stft window hop-lengh in ms. If null, 'hop_length' is used. // Audio processing parameters - "sample_rate": 22050, // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled. - "preemphasis": 0.98, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis. - "ref_level_db": 20, // reference level db, theoretically 20db is the sound of air. 
- + "sample_rate": 22050, // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled. + "preemphasis": 0.98, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis. + "ref_level_db": 20, // reference level db, theoretically 20db is the sound of air. // Silence trimming - "do_trim_silence": false,// enable trimming of slience of audio as you load it. LJspeech (false), TWEB (false), Nancy (true) - "trim_db": 60, // threshold for timming silence. Set this according to your dataset. - + "do_trim_silence": false, // enable trimming of slience of audio as you load it. LJspeech (false), TWEB (false), Nancy (true) + "trim_db": 60, // threshold for timming silence. Set this according to your dataset. // MelSpectrogram parameters - "num_mels": 80, // size of the mel spec frame. - "mel_fmin": 40.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!! - "mel_fmax": 8000.0, // maximum freq level for mel-spec. Tune for dataset!! - "spec_gain": 20.0, // scaler value appplied after log transform of spectrogram. - + "num_mels": 80, // size of the mel spec frame. + "mel_fmin": 40.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!! + "mel_fmax": 8000.0, // maximum freq level for mel-spec. Tune for dataset!! + "spec_gain": 20.0, // scaler value appplied after log transform of spectrogram. // Normalization parameters - "signal_norm": true, // normalize spec values. Mean-Var normalization if 'stats_path' is defined otherwise range normalization defined by the other params. - "min_level_db": -100, // lower bound for normalization + "signal_norm": true, // normalize spec values. Mean-Var normalization if 'stats_path' is defined otherwise range normalization defined by the other params. + "min_level_db": -100, // lower bound for normalization "symmetric_norm": true, // move normalization to range [-1, 1] - "max_norm": 4.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm] - "clip_norm": true, // clip normalized values into the range. - "stats_path": null // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based notmalization is used and other normalization params are ignored + "max_norm": 4.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm] + "clip_norm": true, // clip normalized values into the range. + "stats_path": null // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. 
If it is defined, mean-std based notmalization is used and other normalization params are ignored }, - - // Generating / Synthesizing - "batched": true, - "target_samples": 11000, // target number of samples to be generated in each batch entry - "overlap_samples": 550, // number of samples for crossfading between batches - + +// Generating / Synthesizing + "batched": true, + "target_samples": 11000, // target number of samples to be generated in each batch entry + "overlap_samples": 550, // number of samples for crossfading between batches // DISTRIBUTED TRAINING // "distributed":{ // "backend": "nccl", // "url": "tcp:\/\/localhost:54321" // }, - - // MODEL PARAMETERS - "use_aux_net": true, - "use_upsample_net": true, - "upsample_factors": [4, 8, 8], // this needs to correctly factorise hop_length - "seq_len": 1280, // has to be devideable by hop_length - "mode": "mold", // mold [string], gauss [string], bits [int] - "mulaw": false, // apply mulaw if mode is bits - "padding": 2, // pad the input for resnet to see wider input length - // DATASET - //"use_gta": true, // use computed gta features from the tts model - "data_path": "path/to/wav/files", // path containing training wav files - "feature_path": null, // path containing computed features from wav files if null compute them +// MODEL MODE + "mode": 10, // mold [string], gauss [string], bits [int] + "mulaw": true, // apply mulaw if mode is bits + +// MODEL PARAMETERS + "wavernn_model_params": { + "rnn_dims": 512, + "fc_dims": 512, + "compute_dims": 128, + "res_out_dims": 128, + "num_res_blocks": 10, + "use_aux_net": true, + "use_upsample_net": true, + "upsample_factors": [4, 8, 8] // this needs to correctly factorise hop_length + }, + +// DATASET + //"use_gta": true, // use computed gta features from the tts model + "data_path": "/media/alexander/LinuxFS/SpeechData/GothicSpeech/NPC_Speech", // path containing training wav files + "feature_path": null, // path containing computed features from wav files if null compute them + "seq_len": 1280, // has to be devideable by hop_length + "padding": 2, // pad the input for resnet to see wider input length + +// TRAINING + "batch_size": 64, // Batch size for training. + "epochs": 10000, // total number of epochs to train. - // TRAINING - "batch_size": 64, // Batch size for training. Lower values than 32 might cause hard to learn attention. - "epochs": 10000, // total number of epochs to train. - - // VALIDATION +// VALIDATION "run_eval": true, - "test_every_epochs": 10, // Test after set number of epochs (Test every 20 epochs for example) - - // OPTIMIZER - "grad_clip": 4, // apply gradient clipping if > 0 - "lr_scheduler": "MultiStepLR", // one of the schedulers from https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate + "test_every_epochs": 10, // Test after set number of epochs (Test every 10 epochs for example) + +// OPTIMIZER + "grad_clip": 4, // apply gradient clipping if > 0 + "lr_scheduler": "MultiStepLR", // one of the schedulers from https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate "lr_scheduler_params": { "gamma": 0.5, "milestones": [200000, 400000, 600000] }, - "lr": 1e-4, // initial learning rate - - // TENSORBOARD and LOGGING - "print_step": 25, // Number of steps to log traning on console. - "print_eval": false, // If True, it prints loss values for each step in eval run. - "save_step": 25000, // Number of training steps expected to plot training stats on TB and save model checkpoints. 
- "checkpoint": true, // If true, it saves checkpoints per "save_step" - "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging. - - // DATA LOADING - "num_loader_workers": 4, // number of training data loader processes. Don't set it too big. 4-8 are good values. - "num_val_loader_workers": 4, // number of evaluation data loader processes. - "eval_split_size": 50, // number of samples for testing - - // PATHS + "lr": 1e-4, // initial learning rate + +// TENSORBOARD and LOGGING + "print_step": 25, // Number of steps to log traning on console. + "print_eval": false, // If True, it prints loss values for each step in eval run. + "save_step": 25000, // Number of training steps expected to plot training stats on TB and save model checkpoints. + "checkpoint": true, // If true, it saves checkpoints per "save_step" + "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging. + +// DATA LOADING + "num_loader_workers": 4, // number of training data loader processes. Don't set it too big. 4-8 are good values. + "num_val_loader_workers": 4, // number of evaluation data loader processes. + "eval_split_size": 50, // number of samples for testing + +// PATHS "output_path": "output/training/path" } - diff --git a/TTS/vocoder/datasets/wavernn_dataset.py b/TTS/vocoder/datasets/wavernn_dataset.py index 194344a9..3dbb2194 100644 --- a/TTS/vocoder/datasets/wavernn_dataset.py +++ b/TTS/vocoder/datasets/wavernn_dataset.py @@ -1,11 +1,13 @@ import torch import numpy as np from torch.utils.data import Dataset +from multiprocessing import Manager class WaveRNNDataset(Dataset): """ - WaveRNN Dataset searchs for all the wav files under root path. + WaveRNN Dataset searchs for all the wav files under root path + and converts them to acoustic features on the fly. """ def __init__(self, @@ -15,16 +17,19 @@ class WaveRNNDataset(Dataset): hop_len, pad, mode, + mulaw, is_training=True, verbose=False, ): self.ap = ap + self.compute_feat = not isinstance(items[0], (tuple, list)) self.item_list = items self.seq_len = seq_len self.hop_len = hop_len self.pad = pad self.mode = mode + self.mulaw = mulaw self.is_training = is_training self.verbose = verbose @@ -36,22 +41,47 @@ class WaveRNNDataset(Dataset): return item def load_item(self, index): - wavpath, feat_path = self.item_list[index] - m = np.load(feat_path.replace("/quant/", "/mel/")) - # x = self.wav_cache[index] - if m.shape[-1] < 5: - print(" [!] Instance is too short! : {}".format(wavpath)) - self.item_list[index] = self.item_list[index + 1] - feat_path = self.item_list[index] - m = np.load(feat_path.replace("/quant/", "/mel/")) - if self.mode in ["gauss", "mold"]: - # x = np.load(feat_path.replace("/mel/", "/quant/")) - x = self.ap.load_wav(wavpath) - elif isinstance(self.mode, int): - x = np.load(feat_path.replace("/mel/", "/quant/")) + """ + load (audio, feat) couple if feature_path is set + else compute it on the fly + """ + if self.compute_feat: + + wavpath = self.item_list[index] + audio = self.ap.load_wav(wavpath) + mel = self.ap.melspectrogram(audio) + + if mel.shape[-1] < 5: + print(" [!] Instance is too short! 
: {}".format(wavpath)) + self.item_list[index] = self.item_list[index + 1] + audio = self.ap.load_wav(wavpath) + mel = self.ap.melspectrogram(audio) + if self.mode in ["gauss", "mold"]: + x_input = audio + elif isinstance(self.mode, int): + x_input = (self.ap.mulaw_encode(audio, qc=self.mode) + if self.mulaw else self.ap.quantize(audio, bits=self.mode)) + else: + raise RuntimeError("Unknown dataset mode - ", self.mode) + else: - raise RuntimeError("Unknown dataset mode - ", self.mode) - return m, x + + wavpath, feat_path = self.item_list[index] + mel = np.load(feat_path.replace("/quant/", "/mel/")) + + if mel.shape[-1] < 5: + print(" [!] Instance is too short! : {}".format(wavpath)) + self.item_list[index] = self.item_list[index + 1] + feat_path = self.item_list[index] + mel = np.load(feat_path.replace("/quant/", "/mel/")) + if self.mode in ["gauss", "mold"]: + x_input = self.ap.load_wav(wavpath) + elif isinstance(self.mode, int): + x_input = np.load(feat_path.replace("/mel/", "/quant/")) + else: + raise RuntimeError("Unknown dataset mode - ", self.mode) + + return mel, x_input def collate(self, batch): mel_win = self.seq_len // self.hop_len + 2 * self.pad @@ -79,10 +109,8 @@ class WaveRNNDataset(Dataset): elif isinstance(self.mode, int): coarse = np.stack(coarse).astype(np.int64) coarse = torch.LongTensor(coarse) - x_input = ( - 2 * coarse[:, : self.seq_len].float() / - (2 ** self.mode - 1.0) - 1.0 - ) + x_input = (2 * coarse[:, : self.seq_len].float() / + (2 ** self.mode - 1.0) - 1.0) y_coarse = coarse[:, 1:] mels = torch.FloatTensor(mels) return x_input, mels, y_coarse diff --git a/TTS/vocoder/models/wavernn.py b/TTS/vocoder/models/wavernn.py index 8a45d9e3..f771175c 100644 --- a/TTS/vocoder/models/wavernn.py +++ b/TTS/vocoder/models/wavernn.py @@ -36,14 +36,14 @@ class ResBlock(nn.Module): class MelResNet(nn.Module): - def __init__(self, res_blocks, in_dims, compute_dims, res_out_dims, pad): + def __init__(self, num_res_blocks, in_dims, compute_dims, res_out_dims, pad): super().__init__() k_size = pad * 2 + 1 self.conv_in = nn.Conv1d( in_dims, compute_dims, kernel_size=k_size, bias=False) self.batch_norm = nn.BatchNorm1d(compute_dims) self.layers = nn.ModuleList() - for _ in range(res_blocks): + for _ in range(num_res_blocks): self.layers.append(ResBlock(compute_dims)) self.conv_out = nn.Conv1d(compute_dims, res_out_dims, kernel_size=1) @@ -76,7 +76,7 @@ class UpsampleNetwork(nn.Module): feat_dims, upsample_scales, compute_dims, - res_blocks, + num_res_blocks, res_out_dims, pad, use_aux_net, @@ -87,7 +87,7 @@ class UpsampleNetwork(nn.Module): self.use_aux_net = use_aux_net if use_aux_net: self.resnet = MelResNet( - res_blocks, feat_dims, compute_dims, res_out_dims, pad + num_res_blocks, feat_dims, compute_dims, res_out_dims, pad ) self.resnet_stretch = Stretch2d(self.total_scale, 1) self.up_layers = nn.ModuleList() @@ -118,14 +118,14 @@ class UpsampleNetwork(nn.Module): class Upsample(nn.Module): def __init__( - self, scale, pad, res_blocks, feat_dims, compute_dims, res_out_dims, use_aux_net + self, scale, pad, num_res_blocks, feat_dims, compute_dims, res_out_dims, use_aux_net ): super().__init__() self.scale = scale self.pad = pad self.indent = pad * scale self.use_aux_net = use_aux_net - self.resnet = MelResNet(res_blocks, feat_dims, + self.resnet = MelResNet(num_res_blocks, feat_dims, compute_dims, res_out_dims, pad) def forward(self, m): @@ -147,23 +147,22 @@ class Upsample(nn.Module): class WaveRNN(nn.Module): - def __init__( - self, - rnn_dims, - fc_dims, - mode, - mulaw, - pad, 
- use_aux_net, - use_upsample_net, - upsample_factors, - feat_dims, - compute_dims, - res_out_dims, - res_blocks, - hop_length, - sample_rate, - ): + def __init__(self, + rnn_dims, + fc_dims, + mode, + mulaw, + pad, + use_aux_net, + use_upsample_net, + upsample_factors, + feat_dims, + compute_dims, + res_out_dims, + num_res_blocks, + hop_length, + sample_rate, + ): super().__init__() self.mode = mode self.mulaw = mulaw @@ -177,7 +176,7 @@ class WaveRNN(nn.Module): elif self.mode == "gauss": self.n_classes = 2 else: - raise RuntimeError(" > Unknown training mode") + raise RuntimeError("Unknown model mode value - ", self.mode) self.rnn_dims = rnn_dims self.aux_dims = res_out_dims // 4 @@ -192,7 +191,7 @@ class WaveRNN(nn.Module): feat_dims, upsample_factors, compute_dims, - res_blocks, + num_res_blocks, res_out_dims, pad, use_aux_net, @@ -201,7 +200,7 @@ class WaveRNN(nn.Module): self.upsample = Upsample( hop_length, pad, - res_blocks, + num_res_blocks, feat_dims, compute_dims, res_out_dims, @@ -260,7 +259,7 @@ class WaveRNN(nn.Module): x = F.relu(self.fc2(x)) return self.fc3(x) - def generate(self, mels, batched, target, overlap, use_cuda): + def generate(self, mels, batched, target, overlap, use_cuda=False): self.eval() device = 'cuda' if use_cuda else 'cpu' @@ -360,7 +359,9 @@ class WaveRNN(nn.Module): # Fade-out at the end to avoid signal cutting out suddenly fade_out = np.linspace(1, 0, 20 * self.hop_length) output = output[:wave_len] - output[-20 * self.hop_length:] *= fade_out + + if wave_len > len(fade_out): + output[-20 * self.hop_length:] *= fade_out self.train() return output @@ -405,7 +406,8 @@ class WaveRNN(nn.Module): padding = target + 2 * overlap - remaining x = self.pad_tensor(x, padding, side="after") - folded = torch.zeros(num_folds, target + 2 * overlap, features).to(x.device) + folded = torch.zeros(num_folds, target + 2 * + overlap, features).to(x.device) # Get the values for the folded tensor for i in range(num_folds): From 1e646135ca6cbd09a8efbe4266d5fe2c8927e992 Mon Sep 17 00:00:00 2001 From: sanjaesc Date: Sun, 25 Oct 2020 09:47:04 +0100 Subject: [PATCH 34/98] add model params to config --- TTS/vocoder/utils/generic_utils.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/TTS/vocoder/utils/generic_utils.py b/TTS/vocoder/utils/generic_utils.py index c16fa1ae..f9fbba52 100644 --- a/TTS/vocoder/utils/generic_utils.py +++ b/TTS/vocoder/utils/generic_utils.py @@ -47,18 +47,18 @@ def setup_wavernn(c): MyModel = importlib.import_module("TTS.vocoder.models.wavernn") MyModel = getattr(MyModel, "WaveRNN") model = MyModel( - rnn_dims=512, - fc_dims=512, + rnn_dims=c.wavernn_model_params['rnn_dims'], + fc_dims=c.wavernn_model_params['fc_dims'], mode=c.mode, mulaw=c.mulaw, pad=c.padding, - use_aux_net=c.use_aux_net, - use_upsample_net=c.use_upsample_net, - upsample_factors=c.upsample_factors, - feat_dims=80, - compute_dims=128, - res_out_dims=128, - res_blocks=10, + use_aux_net=c.wavernn_model_params['use_aux_net'], + use_upsample_net=c.wavernn_model_params['use_upsample_net'], + upsample_factors=c.wavernn_model_params['upsample_factors'], + feat_dims=c.audio['num_mels'], + compute_dims=c.wavernn_model_params['compute_dims'], + res_out_dims=c.wavernn_model_params['res_out_dims'], + num_res_blocks=c.wavernn_model_params['num_res_blocks'], hop_length=c.audio["hop_length"], sample_rate=c.audio["sample_rate"], ) From 2ee47e9568cc042491f28c7379e4ad468a7b1ba5 Mon Sep 17 00:00:00 2001 From: sanjaesc Date: Sun, 25 Oct 2020 10:04:24 +0100 Subject: 
[PATCH 35/98] fix pylint once again --- TTS/bin/train_wavernn_vocoder.py | 1 - TTS/vocoder/datasets/wavernn_dataset.py | 1 - tests/test_vocoder_wavernn.py | 2 +- tests/test_vocoder_wavernn_datasets.py | 17 +++++++++-------- 4 files changed, 10 insertions(+), 11 deletions(-) diff --git a/TTS/bin/train_wavernn_vocoder.py b/TTS/bin/train_wavernn_vocoder.py index 91a62cbe..61664a65 100644 --- a/TTS/bin/train_wavernn_vocoder.py +++ b/TTS/bin/train_wavernn_vocoder.py @@ -28,7 +28,6 @@ from TTS.utils.generic_utils import ( ) from TTS.vocoder.datasets.wavernn_dataset import WaveRNNDataset from TTS.vocoder.datasets.preprocess import ( - find_feat_files, load_wav_data, load_wav_feat_data ) diff --git a/TTS/vocoder/datasets/wavernn_dataset.py b/TTS/vocoder/datasets/wavernn_dataset.py index 3dbb2194..9c1ded96 100644 --- a/TTS/vocoder/datasets/wavernn_dataset.py +++ b/TTS/vocoder/datasets/wavernn_dataset.py @@ -1,7 +1,6 @@ import torch import numpy as np from torch.utils.data import Dataset -from multiprocessing import Manager class WaveRNNDataset(Dataset): diff --git a/tests/test_vocoder_wavernn.py b/tests/test_vocoder_wavernn.py index fdb338f9..ccd71c56 100644 --- a/tests/test_vocoder_wavernn.py +++ b/tests/test_vocoder_wavernn.py @@ -17,7 +17,7 @@ def test_wavernn(): feat_dims=80, compute_dims=128, res_out_dims=128, - res_blocks=10, + num_res_blocks=10, hop_length=256, sample_rate=22050, ) diff --git a/tests/test_vocoder_wavernn_datasets.py b/tests/test_vocoder_wavernn_datasets.py index 0f4e939a..a95e247a 100644 --- a/tests/test_vocoder_wavernn_datasets.py +++ b/tests/test_vocoder_wavernn_datasets.py @@ -23,7 +23,7 @@ test_quant_feat_path = os.path.join(test_data_path, "quant") ok_ljspeech = os.path.exists(test_data_path) -def wavernn_dataset_case(batch_size, seq_len, hop_len, pad, mode, num_workers): +def wavernn_dataset_case(batch_size, seq_len, hop_len, pad, mode, mulaw, num_workers): """ run dataloader with given parameters and check conditions """ ap = AudioProcessor(**C.audio) @@ -42,6 +42,7 @@ def wavernn_dataset_case(batch_size, seq_len, hop_len, pad, mode, num_workers): hop_len=hop_len, pad=pad, mode=mode, + mulaw=mulaw ) # sampler = DistributedSampler(dataset) if num_gpus > 1 else None loader = DataLoader(dataset, @@ -78,13 +79,13 @@ def wavernn_dataset_case(batch_size, seq_len, hop_len, pad, mode, num_workers): def test_parametrized_wavernn_dataset(): ''' test dataloader with different parameters ''' params = [ - [16, C.audio['hop_length'] * 10, C.audio['hop_length'], 2, 10, 0], - [16, C.audio['hop_length'] * 10, C.audio['hop_length'], 2, "mold", 4], - [1, C.audio['hop_length'] * 10, C.audio['hop_length'], 2, 9, 0], - [1, C.audio['hop_length'], C.audio['hop_length'], 2, 10, 0], - [1, C.audio['hop_length'], C.audio['hop_length'], 2, "mold", 0], - [1, C.audio['hop_length'] * 5, C.audio['hop_length'], 4, 10, 2], - [1, C.audio['hop_length'] * 5, C.audio['hop_length'], 2, "mold", 0], + [16, C.audio['hop_length'] * 10, C.audio['hop_length'], 2, 10, True, 0], + [16, C.audio['hop_length'] * 10, C.audio['hop_length'], 2, "mold", False, 4], + [1, C.audio['hop_length'] * 10, C.audio['hop_length'], 2, 9, False, 0], + [1, C.audio['hop_length'], C.audio['hop_length'], 2, 10, True, 0], + [1, C.audio['hop_length'], C.audio['hop_length'], 2, "mold", False, 0], + [1, C.audio['hop_length'] * 5, C.audio['hop_length'], 4, 10, False, 2], + [1, C.audio['hop_length'] * 5, C.audio['hop_length'], 2, "mold", False, 0], ] for param in params: print(param) From 0becef4b58d34440737e65e37562b0b40d1d9054 Mon Sep 17 
00:00:00 2001 From: erogol Date: Mon, 26 Oct 2020 17:16:16 +0100 Subject: [PATCH 36/98] small updates --- TTS/bin/train_wavernn_vocoder.py | 1 - TTS/vocoder/configs/wavernn_config.json | 30 ++++++++++++------------- 2 files changed, 15 insertions(+), 16 deletions(-) diff --git a/TTS/bin/train_wavernn_vocoder.py b/TTS/bin/train_wavernn_vocoder.py index 61664a65..90e30256 100644 --- a/TTS/bin/train_wavernn_vocoder.py +++ b/TTS/bin/train_wavernn_vocoder.py @@ -95,7 +95,6 @@ def train(model, optimizer, criterion, scheduler, ap, global_step, epoch): end_time = time.time() c_logger.print_train_start() # train loop - print(" > Training", flush=True) for num_iter, data in enumerate(data_loader): start_time = time.time() x_input, mels, y_coarse = format_data(data) diff --git a/TTS/vocoder/configs/wavernn_config.json b/TTS/vocoder/configs/wavernn_config.json index 9a9fbdae..8f290b80 100644 --- a/TTS/vocoder/configs/wavernn_config.json +++ b/TTS/vocoder/configs/wavernn_config.json @@ -1,7 +1,7 @@ { "run_name": "wavernn_test", "run_description": "wavernn_test training", - + // AUDIO PARAMETERS "audio": { "fft_size": 1024, // number of stft frequency levels. Size of the linear spectogram frame. @@ -29,7 +29,7 @@ "clip_norm": true, // clip normalized values into the range. "stats_path": null // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based notmalization is used and other normalization params are ignored }, - + // Generating / Synthesizing "batched": true, "target_samples": 11000, // target number of samples to be generated in each batch entry @@ -39,11 +39,11 @@ // "backend": "nccl", // "url": "tcp:\/\/localhost:54321" // }, - -// MODEL MODE - "mode": 10, // mold [string], gauss [string], bits [int] + +// MODEL MODE + "mode": "mold", // mold [string], gauss [string], bits [int] "mulaw": true, // apply mulaw if mode is bits - + // MODEL PARAMETERS "wavernn_model_params": { "rnn_dims": 512, @@ -55,14 +55,14 @@ "use_upsample_net": true, "upsample_factors": [4, 8, 8] // this needs to correctly factorise hop_length }, - + // DATASET //"use_gta": true, // use computed gta features from the tts model - "data_path": "/media/alexander/LinuxFS/SpeechData/GothicSpeech/NPC_Speech", // path containing training wav files + "data_path": "/home/erogol/Data/LJSpeech-1.1/wavs/", // path containing training wav files "feature_path": null, // path containing computed features from wav files if null compute them "seq_len": 1280, // has to be devideable by hop_length "padding": 2, // pad the input for resnet to see wider input length - + // TRAINING "batch_size": 64, // Batch size for training. "epochs": 10000, // total number of epochs to train. @@ -70,7 +70,7 @@ // VALIDATION "run_eval": true, "test_every_epochs": 10, // Test after set number of epochs (Test every 10 epochs for example) - + // OPTIMIZER "grad_clip": 4, // apply gradient clipping if > 0 "lr_scheduler": "MultiStepLR", // one of the schedulers from https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate @@ -79,19 +79,19 @@ "milestones": [200000, 400000, 600000] }, "lr": 1e-4, // initial learning rate - + // TENSORBOARD and LOGGING "print_step": 25, // Number of steps to log traning on console. "print_eval": false, // If True, it prints loss values for each step in eval run. "save_step": 25000, // Number of training steps expected to plot training stats on TB and save model checkpoints. 
"checkpoint": true, // If true, it saves checkpoints per "save_step" "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging. - + // DATA LOADING "num_loader_workers": 4, // number of training data loader processes. Don't set it too big. 4-8 are good values. "num_val_loader_workers": 4, // number of evaluation data loader processes. - "eval_split_size": 50, // number of samples for testing - + "eval_split_size": 50, // number of samples for testing + // PATHS - "output_path": "output/training/path" + "output_path": "/home/erogol/Models/LJSpeech/" } From 5903b3d4361b1aff5aadc6e1b0579a66ff515922 Mon Sep 17 00:00:00 2001 From: erogol Date: Tue, 27 Oct 2020 12:14:27 +0100 Subject: [PATCH 37/98] update version --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 0127e84d..2893036a 100644 --- a/setup.py +++ b/setup.py @@ -33,7 +33,7 @@ args, unknown_args = parser.parse_known_args() # Remove our arguments from argv so that setuptools doesn't see them sys.argv = [sys.argv[0]] + unknown_args -version = '0.0.5' +version = '0.0.6' # Adapted from https://github.com/pytorch/pytorch cwd = os.path.dirname(os.path.abspath(__file__)) From ff549cddbd515f7801fc2fcaf40f4767d58c3d55 Mon Sep 17 00:00:00 2001 From: erogol Date: Tue, 27 Oct 2020 12:25:26 +0100 Subject: [PATCH 38/98] readme update --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index d2bfbeee..609349a5 100644 --- a/README.md +++ b/README.md @@ -26,7 +26,7 @@ TTS paper collection: https://github.com/erogol/TTS-papers ## TTS Performance

-"Mozilla*" and "Judy*" are our models.
+"Mozilla*" and "Judy*" are our models. [Details...](https://github.com/mozilla/TTS/wiki/Mean-Opinion-Score-Results)
 ## Provided Models and Methods
@@ -48,6 +48,7 @@ Vocoders:
 - MelGAN: [paper](https://arxiv.org/abs/1910.06711)
 - MultiBandMelGAN: [paper](https://arxiv.org/abs/2005.05106)
 - GAN-TTS discriminators: [paper](https://arxiv.org/abs/1909.11646)
+- WaveRNN: [origin](https://github.com/fatchord/WaveRNN/)
 You can also help us implement more models. Some TTS related work can be found [here](https://github.com/erogol/TTS-papers).

From 3aeef5e83ce19ed83b2d54ed7273191fa55a7370 Mon Sep 17 00:00:00 2001
From: erogol
Date: Tue, 27 Oct 2020 12:28:03 +0100
Subject: [PATCH 39/98] update readme

---
 README.md | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 609349a5..5b048c42 100644
--- a/README.md
+++ b/README.md
@@ -71,8 +71,8 @@ You can also help us implement more models. Some TTS related work can be found [
 ## Main Requirements and Installation
 Highly recommended to use [miniconda](https://conda.io/miniconda.html) for easier installation.
 * python>=3.6
- * pytorch>=1.5.0
- * tensorflow>=2.2
+ * pytorch>=1.5.0
+ * tensorflow>=2.3
 * librosa
 * tensorboard
 * tensorboardX
@@ -164,7 +164,7 @@ To continue an old training run, use ```--continue_path```.
 For multi-GPU training use ```distribute.py```. It enables process based multi-GPU training where each process uses a single GPU.
-```CUDA_VISIBLE_DEVICES="0,1,4" TTS/bin/distribute.py --config_path TTS/tts/configs/config.json```
+```CUDA_VISIBLE_DEVICES="0,1,4" TTS/bin/distribute.py --script train_tts.py --config_path TTS/tts/configs/config.json```
 Each run creates a new output folder and ```config.json``` is copied under this folder.
@@ -219,3 +219,4 @@ If you like to use TTS to try a new idea and like to share your experiments with - https://github.com/r9y9/tacotron_pytorch (Initial Tacotron architecture) - https://github.com/kan-bayashi/ParallelWaveGAN (vocoder library) - https://github.com/jaywalnut310/glow-tts (Original Glow-TTS implementation) +- https://github.com/fatchord/WaveRNN/ (WaveRNN implementation) From a6f564c8c8ded2f277264ae30e8a6aaa4c2ab4bc Mon Sep 17 00:00:00 2001 From: erogol Date: Tue, 27 Oct 2020 12:35:10 +0100 Subject: [PATCH 40/98] pylint fixes --- TTS/tts/layers/losses.py | 7 ++++--- TTS/vocoder/models/wavernn.py | 3 ++- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/TTS/tts/layers/losses.py b/TTS/tts/layers/losses.py index 8256c0f7..67503a76 100644 --- a/TTS/tts/layers/losses.py +++ b/TTS/tts/layers/losses.py @@ -7,6 +7,8 @@ from torch.nn import functional from TTS.tts.utils.generic_utils import sequence_mask +# pylint: disable=abstract-method Method +# relates https://github.com/pytorch/pytorch/issues/42305 class L1LossMasked(nn.Module): def __init__(self, seq_len_norm): super().__init__() @@ -145,9 +147,8 @@ class DifferentailSpectralLoss(nn.Module): target_diff = target[:, 1:] - target[:, :-1] if len(signature(self.loss_func).parameters) > 2: return self.loss_func(x_diff, target_diff, length-1) - else: - # if loss masking is not enabled - return self.loss_func(x_diff, target_diff) + # if loss masking is not enabled + return self.loss_func(x_diff, target_diff) class GuidedAttentionLoss(torch.nn.Module): diff --git a/TTS/vocoder/models/wavernn.py b/TTS/vocoder/models/wavernn.py index f771175c..638cbebc 100644 --- a/TTS/vocoder/models/wavernn.py +++ b/TTS/vocoder/models/wavernn.py @@ -16,7 +16,8 @@ from TTS.vocoder.utils.distribution import ( def stream(string, variables): sys.stdout.write(f"\r{string}" % variables) - +# pylint: disable=abstract-method +# relates https://github.com/pytorch/pytorch/issues/42305 class ResBlock(nn.Module): def __init__(self, dims): super().__init__() From f01502a9dbe65eaa5d6b9f796e9673bd08ff929d Mon Sep 17 00:00:00 2001 From: Edresson Date: Tue, 27 Oct 2020 16:30:16 -0300 Subject: [PATCH 41/98] bug fix in glowTTS sythesize --- TTS/bin/synthesize.py | 10 +++++++--- TTS/tts/utils/generic_utils.py | 9 ++++----- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/TTS/bin/synthesize.py b/TTS/bin/synthesize.py index bb257548..64993754 100644 --- a/TTS/bin/synthesize.py +++ b/TTS/bin/synthesize.py @@ -10,7 +10,7 @@ import time import torch -from TTS.tts.utils.generic_utils import setup_model +from TTS.tts.utils.generic_utils import setup_model, is_tacotron from TTS.tts.utils.synthesis import synthesis from TTS.tts.utils.text.symbols import make_symbols, phonemes, symbols from TTS.utils.audio import AudioProcessor @@ -125,7 +125,8 @@ if __name__ == "__main__": model.eval() if args.use_cuda: model.cuda() - model.decoder.set_r(cp['r']) + if is_tacotron(C): + model.decoder.set_r(cp['r']) # load vocoder model if args.vocoder_path != "": @@ -153,7 +154,10 @@ if __name__ == "__main__": args.speaker_fileid = None if args.gst_style is None: - gst_style = C.gst['gst_style_input'] + if is_tacotron(C): + gst_style = C.gst['gst_style_input'] + else: + gst_style = None else: # check if gst_style string is a dict, if is dict convert else use string try: diff --git a/TTS/tts/utils/generic_utils.py b/TTS/tts/utils/generic_utils.py index 2361fa85..6f7949b2 100644 --- a/TTS/tts/utils/generic_utils.py +++ b/TTS/tts/utils/generic_utils.py @@ -28,7 +28,6 @@ def 
split_dataset(items): return items_eval, items return items[:eval_split_size], items[eval_split_size:] - # from https://gist.github.com/jihunchoi/f1434a77df9db1bb337417854b398df1 def sequence_mask(sequence_length, max_len=None): if max_len is None: @@ -50,7 +49,7 @@ def setup_model(num_chars, num_speakers, c, speaker_embedding_dim=None): MyModel = importlib.import_module('TTS.tts.models.' + c.model.lower()) MyModel = getattr(MyModel, to_camel(c.model)) if c.model.lower() in "tacotron": - model = MyModel(num_chars=num_chars, + model = MyModel(num_chars=num_chars + getattr(c, "add_blank", False), num_speakers=num_speakers, r=c.r, postnet_output_dim=int(c.audio['fft_size'] / 2 + 1), @@ -77,7 +76,7 @@ def setup_model(num_chars, num_speakers, c, speaker_embedding_dim=None): ddc_r=c.ddc_r, speaker_embedding_dim=speaker_embedding_dim) elif c.model.lower() == "tacotron2": - model = MyModel(num_chars=num_chars, + model = MyModel(num_chars=num_chars + getattr(c, "add_blank", False), num_speakers=num_speakers, r=c.r, postnet_output_dim=c.audio['num_mels'], @@ -103,7 +102,7 @@ def setup_model(num_chars, num_speakers, c, speaker_embedding_dim=None): ddc_r=c.ddc_r, speaker_embedding_dim=speaker_embedding_dim) elif c.model.lower() == "glow_tts": - model = MyModel(num_chars=num_chars, + model = MyModel(num_chars=num_chars + getattr(c, "add_blank", False), hidden_channels=192, filter_channels=768, filter_channels_dp=256, @@ -131,7 +130,7 @@ def setup_model(num_chars, num_speakers, c, speaker_embedding_dim=None): return model def is_tacotron(c): - return False if c['model'] == 'glow_tts' else True + return False if 'glow_tts' in c['model'] else True def check_config_tts(c): check_argument('model', c, enum_list=['tacotron', 'tacotron2', 'glow_tts'], restricted=True, val_type=str) From 9d0ae2bfb48aa7d95c6f236d8b9f841fb64a525f Mon Sep 17 00:00:00 2001 From: erogol Date: Wed, 28 Oct 2020 12:31:01 +0100 Subject: [PATCH 42/98] wavernn dataloader handling for short samples and mixed precision training --- TTS/bin/train_wavernn_vocoder.py | 54 +++++++++++++++++-------- TTS/vocoder/configs/wavernn_config.json | 11 ++--- TTS/vocoder/datasets/wavernn_dataset.py | 17 ++++---- 3 files changed, 53 insertions(+), 29 deletions(-) diff --git a/TTS/bin/train_wavernn_vocoder.py b/TTS/bin/train_wavernn_vocoder.py index 90e30256..acc4b703 100644 --- a/TTS/bin/train_wavernn_vocoder.py +++ b/TTS/bin/train_wavernn_vocoder.py @@ -94,6 +94,7 @@ def train(model, optimizer, criterion, scheduler, ap, global_step, epoch): batch_n_iter = int(len(data_loader.dataset) / c.batch_size) end_time = time.time() c_logger.print_train_start() + scaler = torch.cuda.amp.GradScaler() # train loop for num_iter, data in enumerate(data_loader): start_time = time.time() @@ -101,24 +102,43 @@ def train(model, optimizer, criterion, scheduler, ap, global_step, epoch): loader_time = time.time() - end_time global_step += 1 - y_hat = model(x_input, mels) - - if isinstance(model.mode, int): - y_hat = y_hat.transpose(1, 2).unsqueeze(-1) - else: - y_coarse = y_coarse.float() - y_coarse = y_coarse.unsqueeze(-1) - - # compute losses - loss = criterion(y_hat, y_coarse) - if loss.item() is None: - raise RuntimeError(" [!] None loss. 
Exiting ...") optimizer.zero_grad() - loss.backward() - if c.grad_clip > 0: - torch.nn.utils.clip_grad_norm_( - model.parameters(), c.grad_clip) - optimizer.step() + + if c.mixed_precision: + # mixed precision training + with torch.cuda.amp.autocast(): + y_hat = model(x_input, mels) + if isinstance(model.mode, int): + y_hat = y_hat.transpose(1, 2).unsqueeze(-1) + else: + y_coarse = y_coarse.float() + y_coarse = y_coarse.unsqueeze(-1) + # compute losses + loss = criterion(y_hat, y_coarse) + scaler.scale(loss).backward() + scaler.unscale_(optimizer) + if c.grad_clip > 0: + torch.nn.utils.clip_grad_norm_( + model.parameters(), c.grad_clip) + scaler.step(optimizer) + scaler.update() + else: + # full precision training + y_hat = model(x_input, mels) + if isinstance(model.mode, int): + y_hat = y_hat.transpose(1, 2).unsqueeze(-1) + else: + y_coarse = y_coarse.float() + y_coarse = y_coarse.unsqueeze(-1) + # compute losses + loss = criterion(y_hat, y_coarse) + if loss.item() is None: + raise RuntimeError(" [!] None loss. Exiting ...") + loss.backward() + if c.grad_clip > 0: + torch.nn.utils.clip_grad_norm_( + model.parameters(), c.grad_clip) + optimizer.step() if scheduler is not None: scheduler.step() diff --git a/TTS/vocoder/configs/wavernn_config.json b/TTS/vocoder/configs/wavernn_config.json index 8f290b80..58667b69 100644 --- a/TTS/vocoder/configs/wavernn_config.json +++ b/TTS/vocoder/configs/wavernn_config.json @@ -1,6 +1,6 @@ { - "run_name": "wavernn_test", - "run_description": "wavernn_test training", + "run_name": "wavernn_librittts", + "run_description": "wavernn libritts training from LJSpeech model", // AUDIO PARAMETERS "audio": { @@ -10,7 +10,7 @@ "frame_length_ms": null, // stft window length in ms.If null, 'win_length' is used. "frame_shift_ms": null, // stft window hop-lengh in ms. If null, 'hop_length' is used. // Audio processing parameters - "sample_rate": 22050, // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled. + "sample_rate": 24000, // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled. "preemphasis": 0.98, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis. "ref_level_db": 20, // reference level db, theoretically 20db is the sound of air. // Silence trimming @@ -58,14 +58,15 @@ // DATASET //"use_gta": true, // use computed gta features from the tts model - "data_path": "/home/erogol/Data/LJSpeech-1.1/wavs/", // path containing training wav files + "data_path": "/home/erogol/Data/libritts/LibriTTS/train-clean-360/", // path containing training wav files "feature_path": null, // path containing computed features from wav files if null compute them "seq_len": 1280, // has to be devideable by hop_length "padding": 2, // pad the input for resnet to see wider input length // TRAINING - "batch_size": 64, // Batch size for training. + "batch_size": 256, // Batch size for training. "epochs": 10000, // total number of epochs to train. 
+ "mixed_precision": true, // enable/ disable mixed precision training // VALIDATION "run_eval": true, diff --git a/TTS/vocoder/datasets/wavernn_dataset.py b/TTS/vocoder/datasets/wavernn_dataset.py index 9c1ded96..257800b0 100644 --- a/TTS/vocoder/datasets/wavernn_dataset.py +++ b/TTS/vocoder/datasets/wavernn_dataset.py @@ -26,12 +26,15 @@ class WaveRNNDataset(Dataset): self.item_list = items self.seq_len = seq_len self.hop_len = hop_len + self.mel_len = seq_len // hop_len self.pad = pad self.mode = mode self.mulaw = mulaw self.is_training = is_training self.verbose = verbose + assert self.seq_len % self.hop_len == 0 + def __len__(self): return len(self.item_list) @@ -48,13 +51,12 @@ class WaveRNNDataset(Dataset): wavpath = self.item_list[index] audio = self.ap.load_wav(wavpath) + min_audio_len = 2 * self.seq_len + (2 * self.pad * self.hop_len) + if audio.shape[0] < min_audio_len: + print(" [!] Instance is too short! : {}".format(wavpath)) + audio = np.pad(audio, [0, min_audio_len - audio.shape[0] + self.hop_len]) mel = self.ap.melspectrogram(audio) - if mel.shape[-1] < 5: - print(" [!] Instance is too short! : {}".format(wavpath)) - self.item_list[index] = self.item_list[index + 1] - audio = self.ap.load_wav(wavpath) - mel = self.ap.melspectrogram(audio) if self.mode in ["gauss", "mold"]: x_input = audio elif isinstance(self.mode, int): @@ -68,7 +70,7 @@ class WaveRNNDataset(Dataset): wavpath, feat_path = self.item_list[index] mel = np.load(feat_path.replace("/quant/", "/mel/")) - if mel.shape[-1] < 5: + if mel.shape[-1] < self.mel_len + 2 * self.pad: print(" [!] Instance is too short! : {}".format(wavpath)) self.item_list[index] = self.item_list[index + 1] feat_path = self.item_list[index] @@ -80,12 +82,13 @@ class WaveRNNDataset(Dataset): else: raise RuntimeError("Unknown dataset mode - ", self.mode) - return mel, x_input + return mel, x_input, wavpath def collate(self, batch): mel_win = self.seq_len // self.hop_len + 2 * self.pad max_offsets = [x[0].shape[-1] - (mel_win + 2 * self.pad) for x in batch] + mel_offsets = [np.random.randint(0, offset) for offset in max_offsets] sig_offsets = [(offset + self.pad) * self.hop_len for offset in mel_offsets] From 9cef923d991f57b697f749295dc242e5c745ed1e Mon Sep 17 00:00:00 2001 From: erogol Date: Wed, 28 Oct 2020 15:24:18 +0100 Subject: [PATCH 43/98] ssim loss for tacotron models --- TTS/tts/configs/config.json | 10 +++- TTS/tts/layers/losses.py | 100 ++++++++++++++++++++++++++++----- TTS/tts/utils/generic_utils.py | 11 +++- tests/test_layers.py | 71 ++++++++++++++++++++++- 4 files changed, 174 insertions(+), 18 deletions(-) diff --git a/TTS/tts/configs/config.json b/TTS/tts/configs/config.json index 1b63b037..4d3e2674 100644 --- a/TTS/tts/configs/config.json +++ b/TTS/tts/configs/config.json @@ -69,10 +69,14 @@ // LOSS SETTINGS "loss_masking": true, // enable / disable loss masking against the sequence padding. - "decoder_loss_alpha": 0.5, // decoder loss weight. If > 0, it is enabled - "postnet_loss_alpha": 0.25, // postnet loss weight. If > 0, it is enabled + "decoder_loss_alpha": 0.5, // original decoder loss weight. If > 0, it is enabled + "postnet_loss_alpha": 0.25, // original postnet loss weight. If > 0, it is enabled + "postnet_diff_spec_alpha": 0.25, // differential spectral loss weight. If > 0, it is enabled + "decoder_diff_spec_alpha": 0.25, // differential spectral loss weight. If > 0, it is enabled + "decoder_ssim_alpha": 0.5, // decoder ssim loss weight. 
If > 0, it is enabled + "postnet_ssim_alpha": 0.25, // postnet ssim loss weight. If > 0, it is enabled "ga_alpha": 5.0, // weight for guided attention loss. If > 0, guided attention is enabled. - "diff_spec_alpha": 0.25, // differential spectral loss weight. If > 0, it is enabled + // VALIDATION "run_eval": true, diff --git a/TTS/tts/layers/losses.py b/TTS/tts/layers/losses.py index 67503a76..10ee3905 100644 --- a/TTS/tts/layers/losses.py +++ b/TTS/tts/layers/losses.py @@ -5,6 +5,7 @@ from torch import nn from inspect import signature from torch.nn import functional from TTS.tts.utils.generic_utils import sequence_mask +from TTS.tts.utils.ssim import ssim # pylint: disable=abstract-method Method @@ -25,6 +26,10 @@ class L1LossMasked(nn.Module): class for each corresponding step. length: A Variable containing a LongTensor of size (batch,) which contains the length of each data in a batch. + Shapes: + x: B x T X D + target: B x T x D + length: B Returns: loss: An average loss value in range [0, 1] masked by the length. """ @@ -63,6 +68,10 @@ class MSELossMasked(nn.Module): class for each corresponding step. length: A Variable containing a LongTensor of size (batch,) which contains the length of each data in a batch. + Shapes: + x: B x T X D + target: B x T x D + length: B Returns: loss: An average loss value in range [0, 1] masked by the length. """ @@ -87,6 +96,33 @@ class MSELossMasked(nn.Module): return loss +class SSIMLoss(torch.nn.Module): + """SSIM loss as explained here https://en.wikipedia.org/wiki/Structural_similarity""" + def __init__(self): + super().__init__() + self.loss_func = ssim + + def forward(self, y_hat, y, length=None): + """ + Args: + y_hat (tensor): model prediction values. + y (tensor): target values. + length (tensor): length of each sample in a batch. + Shapes: + y_hat: B x T X D + y: B x T x D + length: B + Returns: + loss: An average loss value in range [0, 1] masked by the length. + """ + if length is not None: + m = sequence_mask(sequence_length=length, + max_len=y.size(1)).unsqueeze(2).float().to( + y_hat.device) + y_hat, y = y_hat * m, y * m + return 1 - self.loss_func(y_hat.unsqueeze(1), y.unsqueeze(1)) + + class AttentionEntropyLoss(nn.Module): # pylint: disable=R0201 def forward(self, align): @@ -118,6 +154,10 @@ class BCELossMasked(nn.Module): class for each corresponding step. length: A Variable containing a LongTensor of size (batch,) which contains the length of each data in a batch. + Shapes: + x: B x T + target: B x T + length: B Returns: loss: An average loss value in range [0, 1] masked by the length. """ @@ -142,13 +182,20 @@ class DifferentailSpectralLoss(nn.Module): super().__init__() self.loss_func = loss_func - def forward(self, x, target, length): + def forward(self, x, target, length=None): + """ + Shapes: + x: B x T + target: B x T + length: B + Returns: + loss: An average loss value in range [0, 1] masked by the length. 
+ """ x_diff = x[:, 1:] - x[:, :-1] target_diff = target[:, 1:] - target[:, :-1] - if len(signature(self.loss_func).parameters) > 2: - return self.loss_func(x_diff, target_diff, length-1) - # if loss masking is not enabled - return self.loss_func(x_diff, target_diff) + if length is None: + return self.loss_func(x_diff, target_diff) + return self.loss_func(x_diff, target_diff, length-1) class GuidedAttentionLoss(torch.nn.Module): @@ -188,6 +235,7 @@ class GuidedAttentionLoss(torch.nn.Module): class TacotronLoss(torch.nn.Module): + """Collection of Tacotron set-up based on provided config.""" def __init__(self, c, stopnet_pos_weight=10, ga_sigma=0.4): super(TacotronLoss, self).__init__() self.stopnet_pos_weight = stopnet_pos_weight @@ -195,6 +243,7 @@ class TacotronLoss(torch.nn.Module): self.diff_spec_alpha = c.diff_spec_alpha self.decoder_alpha = c.decoder_loss_alpha self.postnet_alpha = c.postnet_loss_alpha + self.ssim_alpha = c.ssim_alpha self.config = c # postnet and decoder loss @@ -205,12 +254,15 @@ class TacotronLoss(torch.nn.Module): else: self.criterion = nn.L1Loss() if c.model in ["Tacotron" ] else nn.MSELoss() - # differential spectral loss - if c.diff_spec_alpha > 0: - self.criterion_diff_spec = DifferentailSpectralLoss(loss_func=self.criterion) # guided attention loss if c.ga_alpha > 0: self.criterion_ga = GuidedAttentionLoss(sigma=ga_sigma) + # differential spectral loss + if c.postnet_diff_spec_alpha > 0 or c.decoder_diff_spec_alpha > 0: + self.criterion_diff_spec = DifferentailSpectralLoss(loss_func=self.criterion) + # ssim loss + if c.postnet_ssim_alpha > 0 or c.decoder_ssim_alpha > 0: + self.criterion_ssim = SSIMLoss() # stopnet loss # pylint: disable=not-callable self.criterion_st = BCELossMasked( @@ -221,6 +273,9 @@ class TacotronLoss(torch.nn.Module): alignments, alignment_lens, alignments_backwards, input_lens): return_dict = {} + # remove lengths if no masking is applied + if not self.config.loss_masking: + output_lens = None # decoder and postnet losses if self.config.loss_masking: if self.decoder_alpha > 0: @@ -285,11 +340,30 @@ class TacotronLoss(torch.nn.Module): loss += ga_loss * self.ga_alpha return_dict['ga_loss'] = ga_loss * self.ga_alpha - # differential spectral loss - if self.config.diff_spec_alpha > 0: - diff_spec_loss = self.criterion_diff_spec(postnet_output, mel_input, output_lens) - loss += diff_spec_loss * self.diff_spec_alpha - return_dict['diff_spec_loss'] = diff_spec_loss + # decoder differential spectral loss + if self.config.decoder_diff_spec_alpha > 0: + decoder_diff_spec_loss = self.criterion_diff_spec(decoder_output, mel_input, output_lens) + loss += decoder_diff_spec_loss * self.decoder_diff_spec_alpha + return_dict['decoder_diff_spec_loss'] = decoder_diff_spec_loss + + # postnet differential spectral loss + if self.config.postnet_diff_spec_alpha > 0: + postnet_diff_spec_loss = self.criterion_diff_spec(postnet_output, mel_input, output_lens) + loss += postnet_diff_spec_loss * self.postnet_diff_spec_alpha + return_dict['postnet_diff_spec_loss'] = postnet_diff_spec_loss + + # decoder ssim loss + if self.config.decoder_ssim_alpha > 0: + decoder_ssim_loss = self.criterion_ssim(decoder_output, mel_input, output_lens) + loss += decoder_ssim_loss * self.postnet_ssim_alpha + return_dict['decoder_ssim_loss'] = decoder_ssim_loss + + # postnet ssim loss + if self.config.postnet_ssim_alpha > 0: + postnet_ssim_loss = self.criterion_ssim(postnet_output, mel_input, output_lens) + loss += postnet_ssim_loss * self.postnet_ssim_alpha + 
return_dict['postnet_ssim_loss'] = postnet_ssim_loss + return_dict['loss'] = loss return return_dict diff --git a/TTS/tts/utils/generic_utils.py b/TTS/tts/utils/generic_utils.py index 2361fa85..2c82611f 100644 --- a/TTS/tts/utils/generic_utils.py +++ b/TTS/tts/utils/generic_utils.py @@ -178,10 +178,19 @@ def check_config_tts(c): check_argument('eval_batch_size', c, restricted=True, val_type=int, min_val=1) check_argument('r', c, restricted=True, val_type=int, min_val=1) check_argument('gradual_training', c, restricted=False, val_type=list) - check_argument('loss_masking', c, restricted=True, val_type=bool) check_argument('apex_amp_level', c, restricted=False, val_type=str) # check_argument('grad_accum', c, restricted=True, val_type=int, min_val=1, max_val=100) + # loss parameters + check_argument('loss_masking', c, restricted=True, val_type=bool) + check_argument('decoder_loss_alpha', c, restricted=True, val_type=float, min_val=0) + check_argument('postnet_loss_alpha', c, restricted=True, val_type=float, min_val=0) + check_argument('postnet_diff_spec_alpha', c, restricted=True, val_type=float, min_val=0) + check_argument('decoder_diff_spec_alpha', c, restricted=True, val_type=float, min_val=0) + check_argument('decoder_ssim_alpha', c, restricted=True, val_type=float, min_val=0) + check_argument('postnet_ssim_alpha', c, restricted=True, val_type=float, min_val=0) + check_argument('ga_alpha', c, restricted=True, val_type=float, min_val=0) + # validation parameters check_argument('run_eval', c, restricted=True, val_type=bool) check_argument('test_delay_epochs', c, restricted=True, val_type=int, min_val=0) diff --git a/tests/test_layers.py b/tests/test_layers.py index 57be51e5..5426e195 100644 --- a/tests/test_layers.py +++ b/tests/test_layers.py @@ -2,7 +2,7 @@ import unittest import torch as T from TTS.tts.layers.tacotron import Prenet, CBHG, Decoder, Encoder -from TTS.tts.layers.losses import L1LossMasked +from TTS.tts.layers.losses import L1LossMasked, SSIMLoss from TTS.tts.utils.generic_utils import sequence_mask # pylint: disable=unused-variable @@ -149,3 +149,72 @@ class L1LossMaskedTests(unittest.TestCase): (sequence_mask(dummy_length).float() - 1.0) * 100.0).unsqueeze(2) output = layer(dummy_input + mask, dummy_target, dummy_length) assert output.item() == 0, "0 vs {}".format(output.item()) + + +class SSIMLossTests(unittest.TestCase): + def test_in_out(self): #pylint: disable=no-self-use + # test input == target + layer = SSIMLoss() + dummy_input = T.ones(4, 8, 128).float() + dummy_target = T.ones(4, 8, 128).float() + dummy_length = (T.ones(4) * 8).long() + output = layer(dummy_input, dummy_target, dummy_length) + assert output.item() == 0.0 + + # test input != target + dummy_input = T.ones(4, 8, 128).float() + dummy_target = T.zeros(4, 8, 128).float() + dummy_length = (T.ones(4) * 8).long() + output = layer(dummy_input, dummy_target, dummy_length) + assert abs(output.item() - 1.0) < 1e-4 , "1.0 vs {}".format(output.item()) + + # test if padded values of input makes any difference + dummy_input = T.ones(4, 8, 128).float() + dummy_target = T.zeros(4, 8, 128).float() + dummy_length = (T.arange(5, 9)).long() + mask = ( + (sequence_mask(dummy_length).float() - 1.0) * 100.0).unsqueeze(2) + output = layer(dummy_input + mask, dummy_target, dummy_length) + assert abs(output.item() - 1.0) < 1e-4, "1.0 vs {}".format(output.item()) + + dummy_input = T.rand(4, 8, 128).float() + dummy_target = dummy_input.detach() + dummy_length = (T.arange(5, 9)).long() + mask = ( + 
(sequence_mask(dummy_length).float() - 1.0) * 100.0).unsqueeze(2) + output = layer(dummy_input + mask, dummy_target, dummy_length) + assert output.item() == 0, "0 vs {}".format(output.item()) + + # seq_len_norm = True + # test input == target + layer = L1LossMasked(seq_len_norm=True) + dummy_input = T.ones(4, 8, 128).float() + dummy_target = T.ones(4, 8, 128).float() + dummy_length = (T.ones(4) * 8).long() + output = layer(dummy_input, dummy_target, dummy_length) + assert output.item() == 0.0 + + # test input != target + dummy_input = T.ones(4, 8, 128).float() + dummy_target = T.zeros(4, 8, 128).float() + dummy_length = (T.ones(4) * 8).long() + output = layer(dummy_input, dummy_target, dummy_length) + assert output.item() == 1.0, "1.0 vs {}".format(output.item()) + + # test if padded values of input makes any difference + dummy_input = T.ones(4, 8, 128).float() + dummy_target = T.zeros(4, 8, 128).float() + dummy_length = (T.arange(5, 9)).long() + mask = ( + (sequence_mask(dummy_length).float() - 1.0) * 100.0).unsqueeze(2) + output = layer(dummy_input + mask, dummy_target, dummy_length) + assert abs(output.item() - 1.0) < 1e-5, "1.0 vs {}".format(output.item()) + + dummy_input = T.rand(4, 8, 128).float() + dummy_target = dummy_input.detach() + dummy_length = (T.arange(5, 9)).long() + mask = ( + (sequence_mask(dummy_length).float() - 1.0) * 100.0).unsqueeze(2) + output = layer(dummy_input + mask, dummy_target, dummy_length) + assert output.item() == 0, "0 vs {}".format(output.item()) + From 59e1cf99d0dc5b00e276c72348737f2613f163e5 Mon Sep 17 00:00:00 2001 From: erogol Date: Wed, 28 Oct 2020 18:30:00 +0100 Subject: [PATCH 44/98] config update and ssim implementation --- TTS/tts/configs/config.json | 1 + TTS/tts/utils/ssim.py | 75 +++++++++++++++++++++++++++++++++++++ 2 files changed, 76 insertions(+) create mode 100644 TTS/tts/utils/ssim.py diff --git a/TTS/tts/configs/config.json b/TTS/tts/configs/config.json index 4d3e2674..2cad69c3 100644 --- a/TTS/tts/configs/config.json +++ b/TTS/tts/configs/config.json @@ -76,6 +76,7 @@ "decoder_ssim_alpha": 0.5, // decoder ssim loss weight. If > 0, it is enabled "postnet_ssim_alpha": 0.25, // postnet ssim loss weight. If > 0, it is enabled "ga_alpha": 5.0, // weight for guided attention loss. If > 0, guided attention is enabled. + "stopnet_pos_weight": 15.0, // pos class weight for stopnet loss since there are way more negative samples than positive samples. 
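The "stopnet_pos_weight" entry above compensates for the heavy class imbalance of stop-token targets: only the last frame(s) of an utterance are positive. A minimal sketch of its effect (not part of the patch; the logits and targets below are made up for illustration), mirroring how BCELossMasked forwards pos_weight to binary_cross_entropy_with_logits:

    import torch
    import torch.nn.functional as F

    logits = torch.tensor([[-2.0, -1.0, 0.5, 3.0]])  # stopnet outputs for 4 decoder frames
    targets = torch.tensor([[0.0, 0.0, 0.0, 1.0]])   # only the last frame is a stop frame
    pos_weight = torch.tensor([15.0])                # same value as the config entry above

    plain = F.binary_cross_entropy_with_logits(logits, targets, reduction='sum')
    weighted = F.binary_cross_entropy_with_logits(
        logits, targets, pos_weight=pos_weight, reduction='sum')
    # pos_weight scales only the positive-target term, so a missed stop frame
    # costs roughly 15x more than in the unweighted loss.
    print(plain.item(), weighted.item())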
// VALIDATION diff --git a/TTS/tts/utils/ssim.py b/TTS/tts/utils/ssim.py new file mode 100644 index 00000000..c370f5e5 --- /dev/null +++ b/TTS/tts/utils/ssim.py @@ -0,0 +1,75 @@ +# taken from https://github.com/Po-Hsun-Su/pytorch-ssim + +import torch +import torch.nn.functional as F +from torch.autograd import Variable +import numpy as np +from math import exp + +def gaussian(window_size, sigma): + gauss = torch.Tensor([exp(-(x - window_size//2)**2/float(2*sigma**2)) for x in range(window_size)]) + return gauss/gauss.sum() + +def create_window(window_size, channel): + _1D_window = gaussian(window_size, 1.5).unsqueeze(1) + _2D_window = _1D_window.mm(_1D_window.t()).float().unsqueeze(0).unsqueeze(0) + window = Variable(_2D_window.expand(channel, 1, window_size, window_size).contiguous()) + return window + +def _ssim(img1, img2, window, window_size, channel, size_average = True): + mu1 = F.conv2d(img1, window, padding = window_size//2, groups = channel) + mu2 = F.conv2d(img2, window, padding = window_size//2, groups = channel) + + mu1_sq = mu1.pow(2) + mu2_sq = mu2.pow(2) + mu1_mu2 = mu1*mu2 + + sigma1_sq = F.conv2d(img1*img1, window, padding = window_size//2, groups = channel) - mu1_sq + sigma2_sq = F.conv2d(img2*img2, window, padding = window_size//2, groups = channel) - mu2_sq + sigma12 = F.conv2d(img1*img2, window, padding = window_size//2, groups = channel) - mu1_mu2 + + C1 = 0.01**2 + C2 = 0.03**2 + + ssim_map = ((2*mu1_mu2 + C1)*(2*sigma12 + C2))/((mu1_sq + mu2_sq + C1)*(sigma1_sq + sigma2_sq + C2)) + + if size_average: + return ssim_map.mean() + else: + return ssim_map.mean(1).mean(1).mean(1) + +class SSIM(torch.nn.Module): + def __init__(self, window_size = 11, size_average = True): + super(SSIM, self).__init__() + self.window_size = window_size + self.size_average = size_average + self.channel = 1 + self.window = create_window(window_size, self.channel) + + def forward(self, img1, img2): + (_, channel, _, _) = img1.size() + + if channel == self.channel and self.window.data.type() == img1.data.type(): + window = self.window + else: + window = create_window(self.window_size, channel) + + if img1.is_cuda: + window = window.cuda(img1.get_device()) + window = window.type_as(img1) + + self.window = window + self.channel = channel + + + return _ssim(img1, img2, window, self.window_size, channel, self.size_average) + +def ssim(img1, img2, window_size = 11, size_average = True): + (_, channel, _, _) = img1.size() + window = create_window(window_size, channel) + + if img1.is_cuda: + window = window.cuda(img1.get_device()) + window = window.type_as(img1) + + return _ssim(img1, img2, window, window_size, channel, size_average) \ No newline at end of file From e49cc3bbcdd9d82e3556b1b2b59cc98a1742984e Mon Sep 17 00:00:00 2001 From: erogol Date: Wed, 28 Oct 2020 18:34:34 +0100 Subject: [PATCH 45/98] bug fix --- TTS/tts/layers/losses.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/TTS/tts/layers/losses.py b/TTS/tts/layers/losses.py index 10ee3905..efd0c2cb 100644 --- a/TTS/tts/layers/losses.py +++ b/TTS/tts/layers/losses.py @@ -240,10 +240,12 @@ class TacotronLoss(torch.nn.Module): super(TacotronLoss, self).__init__() self.stopnet_pos_weight = stopnet_pos_weight self.ga_alpha = c.ga_alpha - self.diff_spec_alpha = c.diff_spec_alpha + self.decoder_diff_spec_alpha = c.decoder_diff_spec_alpha + self.postnet_diff_spec_alpha = c.postnet_diff_spec_alpha self.decoder_alpha = c.decoder_loss_alpha self.postnet_alpha = c.postnet_loss_alpha - self.ssim_alpha = c.ssim_alpha + 
self.decoder_ssim_alpha = c.decoder_ssim_alpha + self.postnet_ssim_alpha = c.postnet_ssim_alpha self.config = c # postnet and decoder loss From fdaed45f58712427067c433d981ea270bd5f1d63 Mon Sep 17 00:00:00 2001 From: erogol Date: Wed, 28 Oct 2020 18:40:54 +0100 Subject: [PATCH 46/98] optional loss masking for stoptoken predictor --- TTS/tts/layers/losses.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/TTS/tts/layers/losses.py b/TTS/tts/layers/losses.py index efd0c2cb..f26cb884 100644 --- a/TTS/tts/layers/losses.py +++ b/TTS/tts/layers/losses.py @@ -163,14 +163,20 @@ class BCELossMasked(nn.Module): """ # mask: (batch, max_len, 1) target.requires_grad = False - mask = sequence_mask(sequence_length=length, - max_len=target.size(1)).float() + if length is not None: + mask = sequence_mask(sequence_length=length, + max_len=target.size(1)).float() + x = x * mask + target = target * mask + num_items = mask.sum() + else: + num_items = torch.numel(x) loss = functional.binary_cross_entropy_with_logits( - x * mask, - target * mask, + x, + target, pos_weight=self.pos_weight, reduction='sum') - loss = loss / mask.sum() + loss = loss / num_items return loss From e723b99888008b7b28a1e5fb50c331b1e314a273 Mon Sep 17 00:00:00 2001 From: erogol Date: Fri, 16 Oct 2020 16:12:38 +0200 Subject: [PATCH 47/98] handle distributed model as saving --- TTS/bin/train_glow_tts.py | 2 +- TTS/tts/utils/io.py | 8 ++++++-- TTS/{tts => }/utils/distribute.py | 0 TTS/vocoder/utils/io.py | 5 ++++- 4 files changed, 11 insertions(+), 4 deletions(-) rename TTS/{tts => }/utils/distribute.py (100%) diff --git a/TTS/bin/train_glow_tts.py b/TTS/bin/train_glow_tts.py index f4d04abb..535bf8fd 100644 --- a/TTS/bin/train_glow_tts.py +++ b/TTS/bin/train_glow_tts.py @@ -186,7 +186,7 @@ def train(model, criterion, optimizer, scheduler, loss_dict = criterion(z, y_mean, y_log_scale, logdet, mel_lengths, o_dur_log, o_total_dur, text_lengths) - # backward pass + # backward pass - DISTRIBUTED if amp is not None: with amp.scale_loss(loss_dict['loss'], optimizer) as scaled_loss: scaled_loss.backward() diff --git a/TTS/tts/utils/io.py b/TTS/tts/utils/io.py index 18f83746..f84445d9 100644 --- a/TTS/tts/utils/io.py +++ b/TTS/tts/utils/io.py @@ -6,6 +6,7 @@ import pickle as pickle_tts from TTS.utils.io import RenamingUnpickler + def load_checkpoint(model, checkpoint_path, amp=None, use_cuda=False): try: state = torch.load(checkpoint_path, map_location=torch.device('cpu')) @@ -25,9 +26,12 @@ def load_checkpoint(model, checkpoint_path, amp=None, use_cuda=False): def save_model(model, optimizer, current_step, epoch, r, output_path, amp_state_dict=None, **kwargs): - new_state_dict = model.state_dict() + if hasattr(model, 'module'): + model_state = model.module.state_dict() + else: + model_state = model.state_dict() state = { - 'model': new_state_dict, + 'model': model_state, 'optimizer': optimizer.state_dict() if optimizer is not None else None, 'step': current_step, 'epoch': epoch, diff --git a/TTS/tts/utils/distribute.py b/TTS/utils/distribute.py similarity index 100% rename from TTS/tts/utils/distribute.py rename to TTS/utils/distribute.py diff --git a/TTS/vocoder/utils/io.py b/TTS/vocoder/utils/io.py index 640334f1..c33d2cb9 100644 --- a/TTS/vocoder/utils/io.py +++ b/TTS/vocoder/utils/io.py @@ -20,7 +20,10 @@ def load_checkpoint(model, checkpoint_path, use_cuda=False): def save_model(model, optimizer, scheduler, model_disc, optimizer_disc, scheduler_disc, current_step, epoch, output_path, **kwargs): - model_state = 
model.state_dict() + if hasattr(model, 'module'): + model_state = model.module.state_dict() + else: + model_state = model.state_dict() model_disc_state = model_disc.state_dict()\ if model_disc is not None else None optimizer_state = optimizer.state_dict()\ From ac57eea9284a7179961602062d7a1a9cf8cd0f1c Mon Sep 17 00:00:00 2001 From: erogol Date: Fri, 16 Oct 2020 16:34:07 +0200 Subject: [PATCH 48/98] add wavegrad to vocoder generators --- TTS/vocoder/utils/generic_utils.py | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/TTS/vocoder/utils/generic_utils.py b/TTS/vocoder/utils/generic_utils.py index f9fbba52..d0eb0657 100644 --- a/TTS/vocoder/utils/generic_utils.py +++ b/TTS/vocoder/utils/generic_utils.py @@ -70,7 +70,7 @@ def setup_generator(c): MyModel = importlib.import_module('TTS.vocoder.models.' + c.generator_model.lower()) MyModel = getattr(MyModel, to_camel(c.generator_model)) - if c.generator_model in 'melgan_generator': + if c.generator_model.lower() in 'melgan_generator': model = MyModel( in_channels=c.audio['num_mels'], out_channels=1, @@ -81,7 +81,7 @@ def setup_generator(c): num_res_blocks=c.generator_model_params['num_res_blocks']) if c.generator_model in 'melgan_fb_generator': pass - if c.generator_model in 'multiband_melgan_generator': + if c.generator_model.lower() in 'multiband_melgan_generator': model = MyModel( in_channels=c.audio['num_mels'], out_channels=4, @@ -90,7 +90,7 @@ def setup_generator(c): upsample_factors=c.generator_model_params['upsample_factors'], res_kernel=3, num_res_blocks=c.generator_model_params['num_res_blocks']) - if c.generator_model in 'fullband_melgan_generator': + if c.generator_model.lower() in 'fullband_melgan_generator': model = MyModel( in_channels=c.audio['num_mels'], out_channels=1, @@ -99,7 +99,7 @@ def setup_generator(c): upsample_factors=c.generator_model_params['upsample_factors'], res_kernel=3, num_res_blocks=c.generator_model_params['num_res_blocks']) - if c.generator_model in 'parallel_wavegan_generator': + if c.generator_model.lower() in 'parallel_wavegan_generator': model = MyModel( in_channels=1, out_channels=1, @@ -114,6 +114,16 @@ def setup_generator(c): bias=True, use_weight_norm=True, upsample_factors=c.generator_model_params['upsample_factors']) + if c.generator_model.lower() in 'wavegrad': + model = MyModel( + in_channels=c['audio']['num_mels'], + out_channels=1, + x_conv_channels=c['model_params']['x_conv_channels'], + c_conv_channels=c['model_params']['c_conv_channels'], + dblock_out_channels=c['model_params']['dblock_out_channels'], + ublock_out_channels=c['model_params']['ublock_out_channels'], + upsample_factors=c['model_params']['upsample_factors'], + upsample_dilations=c['model_params']['upsample_dilations']) return model From e02cd6a22014cc328183e3f0f5f58822fc2d98fb Mon Sep 17 00:00:00 2001 From: erogol Date: Fri, 16 Oct 2020 16:35:25 +0200 Subject: [PATCH 49/98] initial wavegrad layers model and trainig script --- TTS/bin/train_wavegrad.py | 490 +++++++++++++++++++++ TTS/vocoder/configs/wavegrad_libritts.json | 103 +++++ TTS/vocoder/datasets/wavegrad_dataset.py | 113 +++++ TTS/vocoder/layers/wavegrad.py | 150 +++++++ TTS/vocoder/models/wavegrad.py | 131 ++++++ 5 files changed, 987 insertions(+) create mode 100644 TTS/bin/train_wavegrad.py create mode 100644 TTS/vocoder/configs/wavegrad_libritts.json create mode 100644 TTS/vocoder/datasets/wavegrad_dataset.py create mode 100644 TTS/vocoder/layers/wavegrad.py create mode 100644 TTS/vocoder/models/wavegrad.py diff --git 
a/TTS/bin/train_wavegrad.py b/TTS/bin/train_wavegrad.py new file mode 100644 index 00000000..469df638 --- /dev/null +++ b/TTS/bin/train_wavegrad.py @@ -0,0 +1,490 @@ +import argparse +import glob +import os +import sys +import time +import traceback +from inspect import signature + +import torch +from torch.utils.data import DataLoader +from torch.nn.parallel import DistributedDataParallel as DDP +from torch.utils.data.distributed import DistributedSampler + +from TTS.utils.audio import AudioProcessor +from TTS.utils.console_logger import ConsoleLogger +from TTS.utils.generic_utils import (KeepAverage, count_parameters, + create_experiment_folder, get_git_branch, + remove_experiment_folder, set_init_dict) +from TTS.utils.io import copy_config_file, load_config +from TTS.utils.radam import RAdam +from TTS.utils.tensorboard_logger import TensorboardLogger +from TTS.utils.training import setup_torch_training_env +from TTS.vocoder.datasets.wavegrad_dataset import WaveGradDataset +from TTS.vocoder.datasets.preprocess import load_wav_data, load_wav_feat_data +from TTS.utils.distribute import init_distributed, reduce_tensor +from TTS.vocoder.layers.losses import DiscriminatorLoss, GeneratorLoss +from TTS.vocoder.utils.generic_utils import (plot_results, setup_discriminator, + setup_generator) +from TTS.vocoder.utils.io import save_best_model, save_checkpoint + +use_cuda, num_gpus = setup_torch_training_env(True, True) + + +def setup_loader(ap, is_val=False, verbose=False): + if is_val and not c.run_eval: + loader = None + else: + dataset = WaveGradDataset(ap=ap, + items=eval_data if is_val else train_data, + seq_len=c.seq_len, + hop_len=ap.hop_length, + pad_short=c.pad_short, + conv_pad=c.conv_pad, + is_training=not is_val, + return_segments=True, + use_noise_augment=False, + use_cache=c.use_cache, + verbose=verbose) + sampler = DistributedSampler(dataset) if num_gpus > 1 else None + loader = DataLoader(dataset, + batch_size=c.batch_size, + shuffle=False if num_gpus > 1 else True, + drop_last=False, + sampler=sampler, + num_workers=c.num_val_loader_workers + if is_val else c.num_loader_workers, + pin_memory=False) + return loader + + +def format_data(data): + # return a whole audio segment + m, y = data + if use_cuda: + m = m.cuda(non_blocking=True) + y = y.cuda(non_blocking=True) + return m, y + + +def train(model, criterion, optimizer, + scheduler, ap, global_step, epoch, amp): + data_loader = setup_loader(ap, is_val=False, verbose=(epoch == 0)) + model.train() + epoch_time = 0 + keep_avg = KeepAverage() + if use_cuda: + batch_n_iter = int( + len(data_loader.dataset) / (c.batch_size * num_gpus)) + else: + batch_n_iter = int(len(data_loader.dataset) / c.batch_size) + end_time = time.time() + c_logger.print_train_start() + for num_iter, data in enumerate(data_loader): + start_time = time.time() + + # format data + m, y = format_data(data) + loader_time = time.time() - end_time + + global_step += 1 + + # compute noisy input + if hasattr(model, 'module'): + y_noisy, noise_scale = model.module.compute_noisy_x(y) + else: + y_noisy, noise_scale = model.compute_noisy_x(y) + + # forward pass + y_hat = model(y_noisy, m, noise_scale) + + # compute losses + loss = criterion(y_noisy, y_hat) + loss_wavegrad_dict = {'wavegrad_loss':loss} + + # backward pass with loss scaling + optimizer.zero_grad() + + if amp is not None: + with amp.scale_loss(loss, optimizer) as scaled_loss: + scaled_loss.backward() + else: + loss.backward() + + if amp: + amp_opt_params = amp.master_params(optimizer) + else: + amp_opt_params 
= None + + if c.clip_grad > 0: + grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), + c.clip_grad) + optimizer.step() + + # schedule update + if scheduler is not None: + scheduler.step() + + # disconnect loss values + loss_dict = dict() + for key, value in loss_wavegrad_dict.items(): + if isinstance(value, int): + loss_dict[key] = value + else: + loss_dict[key] = value.item() + + # epoch/step timing + step_time = time.time() - start_time + epoch_time += step_time + + # get current learning rates + current_lr = list(optimizer.param_groups)[0]['lr'] + + # update avg stats + update_train_values = dict() + for key, value in loss_dict.items(): + update_train_values['avg_' + key] = value + update_train_values['avg_loader_time'] = loader_time + update_train_values['avg_step_time'] = step_time + keep_avg.update_values(update_train_values) + + # print training stats + if global_step % c.print_step == 0: + log_dict = { + 'step_time': [step_time, 2], + 'loader_time': [loader_time, 4], + "current_lr": current_lr, + "grad_norm": grad_norm + } + c_logger.print_train_step(batch_n_iter, num_iter, global_step, + log_dict, loss_dict, keep_avg.avg_values) + + if args.rank == 0: + # plot step stats + if global_step % 10 == 0: + iter_stats = { + "lr": current_lr, + "grad_norm": grad_norm, + "step_time": step_time + } + iter_stats.update(loss_dict) + tb_logger.tb_train_iter_stats(global_step, iter_stats) + + # save checkpoint + if global_step % c.save_step == 0: + if c.checkpoint: + # save model + save_checkpoint(model, + optimizer, + scheduler, + None, + None, + None, + global_step, + epoch, + OUT_PATH, + model_losses=loss_dict) + + # compute spectrograms + figures = plot_results(y_hat[0], y[0], ap, global_step, 'train') + tb_logger.tb_train_figures(global_step, figures) + + # Sample audio + sample_voice = y_hat[0].squeeze(0).detach().cpu().numpy() + tb_logger.tb_train_audios(global_step, + {'train/audio': sample_voice}, + c.audio["sample_rate"]) + end_time = time.time() + + # print epoch stats + c_logger.print_train_epoch_end(global_step, epoch, epoch_time, keep_avg) + + # Plot Training Epoch Stats + epoch_stats = {"epoch_time": epoch_time} + epoch_stats.update(keep_avg.avg_values) + if args.rank == 0: + tb_logger.tb_train_epoch_stats(global_step, epoch_stats) + # TODO: plot model stats + # if c.tb_model_param_stats: + # tb_logger.tb_model_weights(model, global_step) + return keep_avg.avg_values, global_step + + +@torch.no_grad() +def evaluate(model, criterion, ap, global_step, epoch): + data_loader = setup_loader(ap, is_val=True, verbose=(epoch == 0)) + model.eval() + epoch_time = 0 + keep_avg = KeepAverage() + end_time = time.time() + c_logger.print_eval_start() + for num_iter, data in enumerate(data_loader): + start_time = time.time() + + # format data + m, y = format_data(data) + loader_time = time.time() - end_time + + global_step += 1 + + # compute noisy input + if hasattr(model, 'module'): + y_noisy, noise_scale = model.module.compute_noisy_x(y) + else: + y_noisy, noise_scale = model.compute_noisy_x(y) + + + # forward pass + y_hat = model(y_noisy, m, noise_scale) + + # compute losses + loss = criterion(y_noisy, y_hat) + loss_wavegrad_dict = {'wavegrad_loss':loss} + + + loss_dict = dict() + for key, value in loss_wavegrad_dict.items(): + if isinstance(value, (int, float)): + loss_dict[key] = value + else: + loss_dict[key] = value.item() + + step_time = time.time() - start_time + epoch_time += step_time + + # update avg stats + update_eval_values = dict() + for key, value in 
loss_dict.items(): + update_eval_values['avg_' + key] = value + update_eval_values['avg_loader_time'] = loader_time + update_eval_values['avg_step_time'] = step_time + keep_avg.update_values(update_eval_values) + + # print eval stats + if c.print_eval: + c_logger.print_eval_step(num_iter, loss_dict, keep_avg.avg_values) + + if args.rank == 0: + # compute spectrograms + figures = plot_results(y_hat, y, ap, global_step, 'eval') + tb_logger.tb_eval_figures(global_step, figures) + + # Sample audio + sample_voice = y_hat[0].squeeze(0).detach().cpu().numpy() + tb_logger.tb_eval_audios(global_step, {'eval/audio': sample_voice}, + c.audio["sample_rate"]) + + tb_logger.tb_eval_stats(global_step, keep_avg.avg_values) + + return keep_avg.avg_values + + +# FIXME: move args definition/parsing inside of main? +def main(args): # pylint: disable=redefined-outer-name + # pylint: disable=global-variable-undefined + global train_data, eval_data + print(f" > Loading wavs from: {c.data_path}") + if c.feature_path is not None: + print(f" > Loading features from: {c.feature_path}") + eval_data, train_data = load_wav_feat_data(c.data_path, c.feature_path, c.eval_split_size) + else: + eval_data, train_data = load_wav_data(c.data_path, c.eval_split_size) + + # setup audio processor + ap = AudioProcessor(**c.audio) + + # DISTRUBUTED + if num_gpus > 1: + init_distributed(args.rank, num_gpus, args.group_id, + c.distributed["backend"], c.distributed["url"]) + + # setup models + model = setup_generator(c) + + # setup optimizers + optimizer = RAdam(model.parameters(), lr=c.lr, weight_decay=0) + + # DISTRIBUTED + if c.apex_amp_level: + # pylint: disable=import-outside-toplevel + from apex import amp + from apex.parallel import DistributedDataParallel as DDP + model.cuda() + model, optimizer = amp.initialize(model, optimizer, opt_level=c.apex_amp_level) + else: + amp = None + + # schedulers + scheduler = None + if 'lr_scheduler' in c: + scheduler = getattr(torch.optim.lr_scheduler, c.lr_scheduler) + scheduler = scheduler(optimizer, **c.lr_scheduler_params) + + # setup criterion + criterion = torch.nn.L1Loss().cuda() + + if args.restore_path: + checkpoint = torch.load(args.restore_path, map_location='cpu') + try: + print(" > Restoring Model...") + model.load_state_dict(checkpoint['model']) + print(" > Restoring Optimizer...") + optimizer.load_state_dict(checkpoint['optimizer']) + if 'scheduler' in checkpoint: + print(" > Restoring LR Scheduler...") + scheduler.load_state_dict(checkpoint['scheduler']) + # NOTE: Not sure if necessary + scheduler.optimizer = optimizer + except RuntimeError: + # retore only matching layers. + print(" > Partial model initialization...") + model_dict = model.state_dict() + model_dict = set_init_dict(model_dict, checkpoint['model'], c) + model.load_state_dict(model_dict) + del model_dict + + # DISTRUBUTED + if amp and 'amp' in checkpoint: + amp.load_state_dict(checkpoint['amp']) + + # reset lr if not countinuining training. 
+ for group in optimizer.param_groups: + group['lr'] = c.lr + + print(" > Model restored from step %d" % checkpoint['step'], + flush=True) + args.restore_step = checkpoint['step'] + else: + args.restore_step = 0 + + if use_cuda: + model.cuda() + criterion.cuda() + + # DISTRUBUTED + if num_gpus > 1: + model = DDP(model) + + num_params = count_parameters(model) + print(" > WaveGrad has {} parameters".format(num_params), flush=True) + + if 'best_loss' not in locals(): + best_loss = float('inf') + + global_step = args.restore_step + for epoch in range(0, c.epochs): + c_logger.print_epoch_start(epoch, c.epochs) + _, global_step = train(model, criterion, optimizer, + scheduler, ap, global_step, + epoch, amp) + eval_avg_loss_dict = evaluate(model, criterion, ap, + global_step, epoch) + c_logger.print_epoch_end(epoch, eval_avg_loss_dict) + target_loss = eval_avg_loss_dict[c.target_loss] + best_loss = save_best_model(target_loss, + best_loss, + model, + optimizer, + scheduler, + None, + None, + None, + global_step, + epoch, + OUT_PATH, + model_losses=eval_avg_loss_dict, + amp_state_dict=amp.state_dict() if amp else None) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument( + '--continue_path', + type=str, + help= + 'Training output folder to continue training. Use to continue a training. If it is used, "config_path" is ignored.', + default='', + required='--config_path' not in sys.argv) + parser.add_argument( + '--restore_path', + type=str, + help='Model file to be restored. Use to finetune a model.', + default='') + parser.add_argument('--config_path', + type=str, + help='Path to config file for training.', + required='--continue_path' not in sys.argv) + parser.add_argument('--debug', + type=bool, + default=False, + help='Do not verify commit integrity to run training.') + + # DISTRUBUTED + parser.add_argument( + '--rank', + type=int, + default=0, + help='DISTRIBUTED: process rank for distributed training.') + parser.add_argument('--group_id', + type=str, + default="", + help='DISTRIBUTED: process group id.') + args = parser.parse_args() + + if args.continue_path != '': + args.output_path = args.continue_path + args.config_path = os.path.join(args.continue_path, 'config.json') + list_of_files = glob.glob( + args.continue_path + + "/*.pth.tar") # * means all if need specific format then *.csv + latest_model_file = max(list_of_files, key=os.path.getctime) + args.restore_path = latest_model_file + print(f" > Training continues for {args.restore_path}") + + # setup output paths and read configs + c = load_config(args.config_path) + # check_config(c) + _ = os.path.dirname(os.path.realpath(__file__)) + + # DISTRIBUTED + if c.apex_amp_level: + print(" > apex AMP level: ", c.apex_amp_level) + + OUT_PATH = args.continue_path + if args.continue_path == '': + OUT_PATH = create_experiment_folder(c.output_path, c.run_name, + args.debug) + + AUDIO_PATH = os.path.join(OUT_PATH, 'test_audios') + + c_logger = ConsoleLogger() + + if args.rank == 0: + os.makedirs(AUDIO_PATH, exist_ok=True) + new_fields = {} + if args.restore_path: + new_fields["restore_path"] = args.restore_path + new_fields["github_branch"] = get_git_branch() + copy_config_file(args.config_path, + os.path.join(OUT_PATH, 'config.json'), new_fields) + os.chmod(AUDIO_PATH, 0o775) + os.chmod(OUT_PATH, 0o775) + + LOG_DIR = OUT_PATH + tb_logger = TensorboardLogger(LOG_DIR, model_name='VOCODER') + + # write model desc to tensorboard + tb_logger.tb_add_text('model-description', c['run_description'], 0) + + try: + 
main(args) + except KeyboardInterrupt: + remove_experiment_folder(OUT_PATH) + try: + sys.exit(0) + except SystemExit: + os._exit(0) # pylint: disable=protected-access + except Exception: # pylint: disable=broad-except + remove_experiment_folder(OUT_PATH) + traceback.print_exc() + sys.exit(1) diff --git a/TTS/vocoder/configs/wavegrad_libritts.json b/TTS/vocoder/configs/wavegrad_libritts.json new file mode 100644 index 00000000..79672c71 --- /dev/null +++ b/TTS/vocoder/configs/wavegrad_libritts.json @@ -0,0 +1,103 @@ +{ + "run_name": "wavegrad-libritts", + "run_description": "wavegrad libritts", + + "audio":{ + "fft_size": 1024, // number of stft frequency levels. Size of the linear spectogram frame. + "win_length": 1024, // stft window length in ms. + "hop_length": 256, // stft window hop-lengh in ms. + "frame_length_ms": null, // stft window length in ms.If null, 'win_length' is used. + "frame_shift_ms": null, // stft window hop-lengh in ms. If null, 'hop_length' is used. + + // Audio processing parameters + "sample_rate": 24000, // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled. + "preemphasis": 0.0, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis. + "ref_level_db": 0, // reference level db, theoretically 20db is the sound of air. + + // Silence trimming + "do_trim_silence": true,// enable trimming of slience of audio as you load it. LJspeech (false), TWEB (false), Nancy (true) + "trim_db": 60, // threshold for timming silence. Set this according to your dataset. + + // MelSpectrogram parameters + "num_mels": 80, // size of the mel spec frame. + "mel_fmin": 50.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!! + "mel_fmax": 7600.0, // maximum freq level for mel-spec. Tune for dataset!! + "spec_gain": 1.0, // scaler value appplied after log transform of spectrogram. + + // Normalization parameters + "signal_norm": true, // normalize spec values. Mean-Var normalization if 'stats_path' is defined otherwise range normalization defined by the other params. + "min_level_db": -100, // lower bound for normalization + "symmetric_norm": true, // move normalization to range [-1, 1] + "max_norm": 4.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm] + "clip_norm": true, // clip normalized values into the range. + "stats_path": "/home/erogol/Data/libritts/LibriTTS/scale_stats.npy" // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based notmalization is used and other normalization params are ignored + }, + + // DISTRIBUTED TRAINING + "apex_amp_level": "O1", // amp optimization level. "O1" is currentl supported. + "distributed":{ + "backend": "nccl", + "url": "tcp:\/\/localhost:54321" + }, + + "target_loss": "avg_wavegrad_loss", // loss value to pick the best model to save after each epoch + + // MODEL PARAMETERS + "generator_model": "wavegrad", + "model_params":{ + "x_conv_channels":32, + "c_conv_channels":768, + "ublock_out_channels": [768, 512, 512, 256, 128], + "dblock_out_channels": [128, 128, 256, 512], + "upsample_factors": [4, 4, 4, 2, 2], + "upsample_dilations": [ + [1, 2, 1, 2], + [1, 2, 1, 2], + [1, 2, 4, 8], + [1, 2, 4, 8], + [1, 2, 4, 8]] + }, + + // DATASET + "data_path": "/home/erogol/Data/libritts/LibriTTS/train-clean-360/", // root data path. It finds all wav files recursively from there. 
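As a consistency note for the parameters above: the WaveGrad model defined later in this patch sets its hop length to the product of "upsample_factors", so that product has to equal the audio "hop_length", and "seq_len" (just below) has to be a multiple of it. A quick sanity-check sketch (not part of the patch), using the values from this config file:

    import numpy as np

    upsample_factors = [4, 4, 4, 2, 2]  # model_params above
    hop_length = 256                    # audio section above
    seq_len = 6144                      # dataset section below ("24 * hop_length")

    assert np.prod(upsample_factors) == hop_length  # 4*4*4*2*2 == 256
    assert seq_len % hop_length == 0                # enforced by WaveGradDataset.__init__
    print(seq_len // hop_length, "mel frames per training segment")  # prints 24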
+ "feature_path": null, // if you use precomputed features + "seq_len": 6144, // 24 * hop_length + "pad_short": 2000, // additional padding for short wavs + "conv_pad": 0, // additional padding against convolutions applied to spectrograms + "use_noise_augment": false, // add noise to the audio signal for augmentation + "use_cache": true, // use in memory cache to keep the computed features. This might cause OOM. + + "reinit_layers": [], // give a list of layer names to restore from the given checkpoint. If not defined, it reloads all heuristically matching layers. + + // TRAINING + "batch_size": 64, // Batch size for training. + + // VALIDATION + "run_eval": true, // enable/disable evaluation run + + // OPTIMIZER + "epochs": 10000, // total number of epochs to train. + "clip_grad": 1, // Generator gradient clipping threshold. Apply gradient clipping if > 0 + "lr_scheduler": "MultiStepLR", // one of the schedulers from https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate + "lr_scheduler_params": { + "gamma": 0.5, + "milestones": [100000, 200000, 300000, 400000, 500000, 600000] + }, + "lr": 1e-4, // Initial learning rate. If Noam decay is active, maximum learning rate. + + // TENSORBOARD and LOGGING + "print_step": 25, // Number of steps to log traning on console. + "print_eval": false, // If True, it prints loss values for each step in eval run. + "save_step": 10000, // Number of training steps expected to plot training stats on TB and save model checkpoints. + "checkpoint": true, // If true, it saves checkpoints per "save_step" + "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging. + + // DATA LOADING + "num_loader_workers": 4, // number of training data loader processes. Don't set it too big. 4-8 are good values. + "num_val_loader_workers": 4, // number of evaluation data loader processes. + "eval_split_size": 10, + + // PATHS + "output_path": "/home/erogol/Models/LJSpeech/" +} + diff --git a/TTS/vocoder/datasets/wavegrad_dataset.py b/TTS/vocoder/datasets/wavegrad_dataset.py new file mode 100644 index 00000000..4a70c252 --- /dev/null +++ b/TTS/vocoder/datasets/wavegrad_dataset.py @@ -0,0 +1,113 @@ +import os +import glob +import torch +import random +import numpy as np +from torch.utils.data import Dataset +from multiprocessing import Manager + + +class WaveGradDataset(Dataset): + """ + WaveGrad Dataset searchs for all the wav files under root path + and converts them to acoustic features on the fly and returns + random segments of (audio, feature) couples. + """ + def __init__(self, + ap, + items, + seq_len, + hop_len, + pad_short, + conv_pad=2, + is_training=True, + return_segments=True, + use_noise_augment=False, + use_cache=False, + verbose=False): + + self.ap = ap + self.item_list = items + self.compute_feat = not isinstance(items[0], (tuple, list)) + self.seq_len = seq_len + self.hop_len = hop_len + self.pad_short = pad_short + self.conv_pad = conv_pad + self.is_training = is_training + self.return_segments = return_segments + self.use_cache = use_cache + self.use_noise_augment = use_noise_augment + self.verbose = verbose + + assert seq_len % hop_len == 0, " [!] seq_len has to be a multiple of hop_len." 
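+ # feat_frame_len is the number of conditioning mel frames that matches one
+ # audio training segment, plus 2 * conv_pad extra frames of context for the
+ # convolutional padding (e.g. 6144 // 256 + 2 * 0 == 24 frames with the
+ # LibriTTS config above).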
+ self.feat_frame_len = seq_len // hop_len + (2 * conv_pad) + + # cache acoustic features + if use_cache: + self.create_feature_cache() + + def create_feature_cache(self): + self.manager = Manager() + self.cache = self.manager.list() + self.cache += [None for _ in range(len(self.item_list))] + + @staticmethod + def find_wav_files(path): + return glob.glob(os.path.join(path, '**', '*.wav'), recursive=True) + + def __len__(self): + return len(self.item_list) + + def __getitem__(self, idx): + item = self.load_item(idx) + return item + + def load_item(self, idx): + """ load (audio, feat) couple """ + if self.compute_feat: + # compute features from wav + wavpath = self.item_list[idx] + # print(wavpath) + + if self.use_cache and self.cache[idx] is not None: + audio, mel = self.cache[idx] + else: + audio = self.ap.load_wav(wavpath) + + if len(audio) < self.seq_len + self.pad_short: + audio = np.pad(audio, (0, self.seq_len + self.pad_short - len(audio)), \ + mode='constant', constant_values=0.0) + + mel = self.ap.melspectrogram(audio) + else: + + # load precomputed features + wavpath, feat_path = self.item_list[idx] + + if self.use_cache and self.cache[idx] is not None: + audio, mel = self.cache[idx] + else: + audio = self.ap.load_wav(wavpath) + mel = np.load(feat_path) + + # correct the audio length wrt padding applied in stft + audio = np.pad(audio, (0, self.hop_len), mode="edge") + audio = audio[:mel.shape[-1] * self.hop_len] + assert mel.shape[-1] * self.hop_len == audio.shape[-1], f' [!] {mel.shape[-1] * self.hop_len} vs {audio.shape[-1]}' + + audio = torch.from_numpy(audio).float().unsqueeze(0) + mel = torch.from_numpy(mel).float().squeeze(0) + + if self.return_segments: + max_mel_start = mel.shape[1] - self.feat_frame_len + mel_start = random.randint(0, max_mel_start) + mel_end = mel_start + self.feat_frame_len + mel = mel[:, mel_start:mel_end] + + audio_start = mel_start * self.hop_len + audio = audio[:, audio_start:audio_start + + self.seq_len] + + if self.use_noise_augment and self.is_training and self.return_segments: + audio = audio + (1 / 32768) * torch.randn_like(audio) + return (mel, audio) diff --git a/TTS/vocoder/layers/wavegrad.py b/TTS/vocoder/layers/wavegrad.py new file mode 100644 index 00000000..69bca0a8 --- /dev/null +++ b/TTS/vocoder/layers/wavegrad.py @@ -0,0 +1,150 @@ +import math +import torch +from torch import nn +from torch.nn import functional as F + + +class NoiseLevelEncoding(nn.Module): + """Noise level encoding applying same + encoding vector to all time steps. It is + different than the original implementation.""" + def __init__(self, n_channels): + super().__init__() + self.n_channels = n_channels + self.length = n_channels // 2 + assert n_channels % 2 == 0 + + enc = self.init_encoding(self.length) + self.register_buffer('enc', enc) + + def forward(self, x, noise_level): + """ + Shapes: + x: B x C x T + noise_level: B + """ + return (x + self.encoding(noise_level)[:, :, None]) + + def init_encoding(self, length): + div_by = torch.arange(length) / length + enc = torch.exp(-math.log(1e4) * div_by.unsqueeze(0)) + return enc + + def encoding(self, noise_level): + encoding = noise_level.unsqueeze(1) * self.enc + encoding = torch.cat( + [torch.sin(encoding), torch.cos(encoding)], dim=-1) + return encoding + + +class FiLM(nn.Module): + """Feature-wise Linear Modulation. It combines information from + both noisy waveform and input mel-spectrogram. 
The FiLM module + produces both scale and bias vectors given inputs, which are + used in a UBlock for feature-wise affine transformation.""" + + def __init__(self, in_channels, out_channels): + super().__init__() + self.encoding = NoiseLevelEncoding(in_channels) + self.conv_in = nn.Conv1d(in_channels, in_channels, 3, padding=1) + self.conv_out = nn.Conv1d(in_channels, out_channels * 2, 3, padding=1) + self._init_parameters() + + def _init_parameters(self): + nn.init.orthogonal_(self.conv_in.weight) + nn.init.orthogonal_(self.conv_out.weight) + + def forward(self, x, noise_scale): + x = self.conv_in(x) + x = F.leaky_relu(x, 0.2) + x = self.encoding(x, noise_scale) + shift, scale = torch.chunk(self.conv_out(x), 2, dim=1) + return shift, scale + + +@torch.jit.script +def shif_and_scale(x, scale, shift): + o = shift + scale * x + return o + + +class UBlock(nn.Module): + def __init__(self, in_channels, hid_channels, upsample_factor, dilations): + super().__init__() + assert len(dilations) == 4 + + self.upsample_factor = upsample_factor + self.shortcut_conv = nn.Conv1d(in_channels, hid_channels, 1) + self.main_block1 = nn.ModuleList([ + nn.Conv1d(in_channels, + hid_channels, + 3, + dilation=dilations[0], + padding=dilations[0]), + nn.Conv1d(hid_channels, + hid_channels, + 3, + dilation=dilations[1], + padding=dilations[1]) + ]) + self.main_block2 = nn.ModuleList([ + nn.Conv1d(hid_channels, + hid_channels, + 3, + dilation=dilations[2], + padding=dilations[2]), + nn.Conv1d(hid_channels, + hid_channels, + 3, + dilation=dilations[3], + padding=dilations[3]) + ]) + + def forward(self, x, shift, scale): + upsample_size = x.shape[-1] * self.upsample_factor + x = F.interpolate(x, size=upsample_size) + res = self.shortcut_conv(x) + + o = F.leaky_relu(x, 0.2) + o = self.main_block1[0](o) + o = shif_and_scale(o, scale, shift) + o = F.leaky_relu(o, 0.2) + o = self.main_block1[1](o) + + o = o + res + res = o + + o = shif_and_scale(o, scale, shift) + o = F.leaky_relu(o, 0.2) + o = self.main_block2[0](o) + o = shif_and_scale(o, scale, shift) + o = F.leaky_relu(o, 0.2) + o = self.main_block2[1](o) + + o = o + res + return o + + +class DBlock(nn.Module): + def __init__(self, in_channels, hid_channels, downsample_factor): + super().__init__() + self.downsample_factor = downsample_factor + self.res_conv = nn.Conv1d(in_channels, hid_channels, 1) + self.main_convs = nn.ModuleList([ + nn.Conv1d(in_channels, hid_channels, 3, dilation=1, padding=1), + nn.Conv1d(hid_channels, hid_channels, 3, dilation=2, padding=2), + nn.Conv1d(hid_channels, hid_channels, 3, dilation=4, padding=4), + ]) + + def forward(self, x): + size = x.shape[-1] // self.downsample_factor + + res = self.res_conv(x) + res = F.interpolate(res, size=size) + + o = F.interpolate(x, size=size) + for layer in self.main_convs: + o = F.leaky_relu(o, 0.2) + o = layer(o) + + return o + res diff --git a/TTS/vocoder/models/wavegrad.py b/TTS/vocoder/models/wavegrad.py new file mode 100644 index 00000000..6405bea8 --- /dev/null +++ b/TTS/vocoder/models/wavegrad.py @@ -0,0 +1,131 @@ +import numpy as np +import torch +from torch import nn + +from ..layers.wavegrad import DBlock, FiLM, UBlock + + +class Wavegrad(nn.Module): + # pylint: disable=dangerous-default-value + def __init__(self, + in_channels=80, + out_channels=1, + x_conv_channels=32, + c_conv_channels=768, + dblock_out_channels=[128, 128, 256, 512], + ublock_out_channels=[512, 512, 256, 128, 128], + upsample_factors=[5, 5, 3, 2, 2], + upsample_dilations=[[1, 2, 1, 2], [1, 2, 1, 2], [1, 2, 4, 8], + [1, 2, 4, 
8], [1, 2, 4, 8]]): + super().__init__() + + assert len(upsample_factors) == len(upsample_dilations) + assert len(upsample_factors) == len(ublock_out_channels) + + # inference time noise schedule params + self.S = 1000 + beta, alpha, alpha_cum, noise_level = self._setup_noise_level() + self.register_buffer('beta', beta) + self.register_buffer('alpha', alpha) + self.register_buffer('alpha_cum', alpha_cum) + self.register_buffer('noise_level', noise_level) + + # setup up-down sampling parameters + self.hop_length = np.prod(upsample_factors) + self.upsample_factors = upsample_factors + self.downsample_factors = upsample_factors[::-1][:-1] + + ### define DBlocks, FiLM layers ### + self.dblocks = nn.ModuleList([ + nn.Conv1d(out_channels, x_conv_channels, 5, padding=2), + ]) + ic = x_conv_channels + self.films = nn.ModuleList([]) + for oc, df in zip(dblock_out_channels, self.downsample_factors): + # print('dblock(', ic, ', ', oc, ', ', df, ")") + layer = DBlock(ic, oc, df) + self.dblocks.append(layer) + + # print('film(', ic, ', ', oc,")") + layer = FiLM(ic, oc) + self.films.append(layer) + ic = oc + # last FiLM block + # print('film(', ic, ', ', dblock_out_channels[-1],")") + self.films.append(FiLM(ic, dblock_out_channels[-1])) + + ### define UBlocks ### + self.c_conv = nn.Conv1d(in_channels, c_conv_channels, 3, padding=1) + self.ublocks = nn.ModuleList([]) + ic = c_conv_channels + for idx, (oc, uf) in enumerate(zip(ublock_out_channels, self.upsample_factors)): + # print('ublock(', ic, ', ', oc, ', ', uf, ")") + layer = UBlock(ic, oc, uf, upsample_dilations[idx]) + self.ublocks.append(layer) + ic = oc + + # define last layer + # print(ic, 'last_conv--', out_channels) + self.last_conv = nn.Conv1d(ic, out_channels, 3, padding=1) + + def _setup_noise_level(self, noise_schedule=None): + """compute noise schedule parameters""" + if noise_schedule is None: + beta = np.linspace(1e-6, 0.01, self.S) + else: + beta = noise_schedule + alpha = 1 - beta + alpha_cum = np.cumprod(alpha) + noise_level = np.concatenate([[1.0], alpha_cum ** 0.5], axis=0) + + beta = torch.from_numpy(beta) + alpha = torch.from_numpy(alpha) + alpha_cum = torch.from_numpy(alpha_cum) + noise_level = torch.from_numpy(noise_level.astype(np.float32)) + return beta, alpha, alpha_cum, noise_level + + def compute_noisy_x(self, x): + B = x.shape[0] + if len(x.shape) == 3: + x = x.squeeze(1) + s = torch.randint(1, self.S + 1, [B]).to(x).long() + l_a, l_b = self.noise_level[s-1], self.noise_level[s] + noise_scale = l_a + torch.rand(B).to(x) * (l_b - l_a) + noise_scale = noise_scale.unsqueeze(1) + noise = torch.randn_like(x) + noisy_x = noise_scale * x + (1.0 - noise_scale**2)**0.5 * noise + return noisy_x.unsqueeze(1), noise_scale[:, 0] + + def forward(self, x, c, noise_scale): + assert len(c.shape) == 3 # B, C, T + assert len(x.shape) == 3 # B, 1, T + o = x + shift_and_scales = [] + for film, dblock in zip(self.films, self.dblocks): + o = dblock(o) + shift_and_scales.append(film(o, noise_scale)) + + o = self.c_conv(c) + for ublock, (film_shift, film_scale) in zip(self.ublocks, + reversed(shift_and_scales)): + o = ublock(o, film_shift, film_scale) + o = self.last_conv(o) + return o + + def inference(self, c): + with torch.no_grad(): + x = torch.randn(c.shape[0], self.hop_length * c.shape[-1]).to(c) + noise_scale = torch.from_numpy( + self.alpha_cum**0.5).float().unsqueeze(1).to(c) + for n in range(len(self.alpha) - 1, -1, -1): + c1 = 1 / self.alpha[n]**0.5 + c2 = (1 - self.alpha[n]) / (1 - self.alpha_cum[n])**0.5 + x = c1 * (x - + c2 * 
self.forward(x, c, noise_scale[n]).squeeze(1)) + if n > 0: + noise = torch.randn_like(x) + sigma = ((1.0 - self.alpha_cum[n - 1]) / + (1.0 - self.alpha_cum[n]) * self.beta[n])**0.5 + x += sigma * noise + x = torch.clamp(x, -1.0, 1.0) + return x From 193b81b2738bab97143cfb8a3274934bd117e60f Mon Sep 17 00:00:00 2001 From: erogol Date: Fri, 16 Oct 2020 16:52:18 +0200 Subject: [PATCH 50/98] add universal_fullband_melgan config --- .../configs/universal_fullband_melgan.json | 138 ++++++++++++++++++ 1 file changed, 138 insertions(+) create mode 100644 TTS/vocoder/configs/universal_fullband_melgan.json diff --git a/TTS/vocoder/configs/universal_fullband_melgan.json b/TTS/vocoder/configs/universal_fullband_melgan.json new file mode 100644 index 00000000..8882c3e7 --- /dev/null +++ b/TTS/vocoder/configs/universal_fullband_melgan.json @@ -0,0 +1,138 @@ +{ + "run_name": "fullband-melgan", + "run_description": "fullband melgan mean-var scaling", + + // AUDIO PARAMETERS + "audio":{ + "fft_size": 1024, // number of stft frequency levels. Size of the linear spectogram frame. + "win_length": 1024, // stft window length in ms. + "hop_length": 256, // stft window hop-lengh in ms. + "frame_length_ms": null, // stft window length in ms.If null, 'win_length' is used. + "frame_shift_ms": null, // stft window hop-lengh in ms. If null, 'hop_length' is used. + + // Audio processing parameters + "sample_rate": 24000, // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled. + "preemphasis": 0.0, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis. + "ref_level_db": 0, // reference level db, theoretically 20db is the sound of air. + + // Silence trimming + "do_trim_silence": true,// enable trimming of slience of audio as you load it. LJspeech (false), TWEB (false), Nancy (true) + "trim_db": 60, // threshold for timming silence. Set this according to your dataset. + + // MelSpectrogram parameters + "num_mels": 80, // size of the mel spec frame. + "mel_fmin": 50.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!! + "mel_fmax": 7600.0, // maximum freq level for mel-spec. Tune for dataset!! + "spec_gain": 1.0, // scaler value appplied after log transform of spectrogram. + + // Normalization parameters + "signal_norm": true, // normalize spec values. Mean-Var normalization if 'stats_path' is defined otherwise range normalization defined by the other params. + "min_level_db": -100, // lower bound for normalization + "symmetric_norm": true, // move normalization to range [-1, 1] + "max_norm": 4.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm] + "clip_norm": true, // clip normalized values into the range. + "stats_path": "/data/rw/home/Data/LibriTTS/scale_stats.npy" // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. 
If it is defined, mean-std based notmalization is used and other normalization params are ignored + }, + + // DISTRIBUTED TRAINING + "distributed":{ + "backend": "nccl", + "url": "tcp:\/\/localhost:54321" + }, + + // MODEL PARAMETERS + "use_pqmf": false, + + // LOSS PARAMETERS + "use_stft_loss": true, + "use_subband_stft_loss": false, + "use_mse_gan_loss": true, + "use_hinge_gan_loss": false, + "use_feat_match_loss": false, // use only with melgan discriminators + + // loss weights + "stft_loss_weight": 0.5, + "subband_stft_loss_weight": 0.5, + "mse_G_loss_weight": 2.5, + "hinge_G_loss_weight": 2.5, + "feat_match_loss_weight": 25, + + // multiscale stft loss parameters + "stft_loss_params": { + "n_ffts": [1024, 2048, 512], + "hop_lengths": [120, 240, 50], + "win_lengths": [600, 1200, 240] + }, + + "target_loss": "avg_G_loss", // loss value to pick the best model to save after each epoch + + // DISCRIMINATOR + "discriminator_model": "melgan_multiscale_discriminator", + "discriminator_model_params":{ + "base_channels": 16, + "max_channels":512, + "downsample_factors":[4, 4, 4] + }, + "steps_to_start_discriminator": 200000, // steps required to start GAN trainining.1 + + // GENERATOR + "generator_model": "fullband_melgan_generator", + "generator_model_params": { + "upsample_factors":[8, 8, 4], + "num_res_blocks": 4 + }, + + // DATASET + "data_path": "/data5/rw/home/Data/LibriTTS/LibriTTS/train-clean-360/", + "feature_path": null, + "seq_len": 16384, + "pad_short": 2000, + "conv_pad": 0, + "use_noise_augment": false, + "use_cache": true, + + "reinit_layers": [], // give a list of layer names to restore from the given checkpoint. If not defined, it reloads all heuristically matching layers. + + // TRAINING + "batch_size": 48, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'. + + // VALIDATION + "run_eval": true, + "test_delay_epochs": 10, //Until attention is aligned, testing only wastes computation time. + "test_sentences_file": null, // set a file to load sentences to be used for testing. If it is null then we use default english sentences. + + // OPTIMIZER + "epochs": 10000, // total number of epochs to train. + "wd": 0.0, // Weight decay weight. + "gen_clip_grad": -1, // Generator gradient clipping threshold. Apply gradient clipping if > 0 + "disc_clip_grad": -1, // Discriminator gradient clipping threshold. + "lr_scheduler_gen": "MultiStepLR", // one of the schedulers from https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate + "lr_scheduler_gen_params": { + "gamma": 0.5, + "milestones": [100000, 200000, 300000, 400000, 500000, 600000] + }, + "lr_scheduler_disc": "MultiStepLR", // one of the schedulers from https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate + "lr_scheduler_disc_params": { + "gamma": 0.5, + "milestones": [100000, 200000, 300000, 400000, 500000, 600000] + }, + "lr_gen": 0.000015625, // Initial learning rate. If Noam decay is active, maximum learning rate. + "lr_disc": 0.000015625, + + // TENSORBOARD and LOGGING + "print_step": 25, // Number of steps to log traning on console. + "print_eval": false, // If True, it prints loss values for each step in eval run. + "save_step": 25000, // Number of training steps expected to plot training stats on TB and save model checkpoints. + "checkpoint": true, // If true, it saves checkpoints per "save_step" + "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. 
Might be memory consuming, but good for debugging. + + // DATA LOADING + "num_loader_workers": 4, // number of training data loader processes. Don't set it too big. 4-8 are good values. + "num_val_loader_workers": 4, // number of evaluation data loader processes. + "eval_split_size": 10, + + // PATHS + "output_path": "/data4/rw/home/Trainings/LJSpeech/" +} + + From a1582a0e12385ba2e1c74f625c9ed22665245155 Mon Sep 17 00:00:00 2001 From: erogol Date: Fri, 16 Oct 2020 17:53:05 +0200 Subject: [PATCH 51/98] fix distributed training for train_* scripts --- TTS/bin/train_gan_vocoder.py | 113 ++++++++++-------- TTS/bin/train_glow_tts.py | 16 ++- TTS/bin/train_tts.py | 2 + TTS/bin/train_wavegrad.py | 33 +++-- TTS/tts/configs/glow_tts_gated_conv.json | 3 +- .../configs/universal_fullband_melgan.json | 8 +- TTS/vocoder/configs/wavegrad_libritts.json | 6 +- 7 files changed, 99 insertions(+), 82 deletions(-) diff --git a/TTS/bin/train_gan_vocoder.py b/TTS/bin/train_gan_vocoder.py index 12edf048..9ede3647 100644 --- a/TTS/bin/train_gan_vocoder.py +++ b/TTS/bin/train_gan_vocoder.py @@ -19,13 +19,16 @@ from TTS.utils.tensorboard_logger import TensorboardLogger from TTS.utils.training import setup_torch_training_env from TTS.vocoder.datasets.gan_dataset import GANDataset from TTS.vocoder.datasets.preprocess import load_wav_data, load_wav_feat_data -# from distribute import (DistributedSampler, apply_gradient_allreduce, -# init_distributed, reduce_tensor) from TTS.vocoder.layers.losses import DiscriminatorLoss, GeneratorLoss from TTS.vocoder.utils.generic_utils import (plot_results, setup_discriminator, setup_generator) from TTS.vocoder.utils.io import save_best_model, save_checkpoint +# DISTRIBUTED +from torch.nn.parallel import DistributedDataParallel as DDP_th +from torch.utils.data.distributed import DistributedSampler +from TTS.utils.distribute import init_distributed + use_cuda, num_gpus = setup_torch_training_env(True, True) @@ -45,12 +48,12 @@ def setup_loader(ap, is_val=False, verbose=False): use_cache=c.use_cache, verbose=verbose) dataset.shuffle_mapping() - # sampler = DistributedSampler(dataset) if num_gpus > 1 else None + sampler = DistributedSampler(dataset, shuffle=True) if num_gpus > 1 else None loader = DataLoader(dataset, batch_size=1 if is_val else c.batch_size, - shuffle=True, + shuffle=False if num_gpus > 1 else True, drop_last=False, - sampler=None, + sampler=sampler, num_workers=c.num_val_loader_workers if is_val else c.num_loader_workers, pin_memory=False) @@ -243,41 +246,42 @@ def train(model_G, criterion_G, optimizer_G, model_D, criterion_D, optimizer_D, c_logger.print_train_step(batch_n_iter, num_iter, global_step, log_dict, loss_dict, keep_avg.avg_values) - # plot step stats - if global_step % 10 == 0: - iter_stats = { - "lr_G": current_lr_G, - "lr_D": current_lr_D, - "step_time": step_time - } - iter_stats.update(loss_dict) - tb_logger.tb_train_iter_stats(global_step, iter_stats) + if args.rank == 0: + # plot step stats + if global_step % 10 == 0: + iter_stats = { + "lr_G": current_lr_G, + "lr_D": current_lr_D, + "step_time": step_time + } + iter_stats.update(loss_dict) + tb_logger.tb_train_iter_stats(global_step, iter_stats) - # save checkpoint - if global_step % c.save_step == 0: - if c.checkpoint: - # save model - save_checkpoint(model_G, - optimizer_G, - scheduler_G, - model_D, - optimizer_D, - scheduler_D, - global_step, - epoch, - OUT_PATH, - model_losses=loss_dict) + # save checkpoint + if global_step % c.save_step == 0: + if c.checkpoint: + # save model + 
save_checkpoint(model_G, + optimizer_G, + scheduler_G, + model_D, + optimizer_D, + scheduler_D, + global_step, + epoch, + OUT_PATH, + model_losses=loss_dict) - # compute spectrograms - figures = plot_results(y_hat_vis, y_G, ap, global_step, - 'train') - tb_logger.tb_train_figures(global_step, figures) + # compute spectrograms + figures = plot_results(y_hat_vis, y_G, ap, global_step, + 'train') + tb_logger.tb_train_figures(global_step, figures) - # Sample audio - sample_voice = y_hat_vis[0].squeeze(0).detach().cpu().numpy() - tb_logger.tb_train_audios(global_step, - {'train/audio': sample_voice}, - c.audio["sample_rate"]) + # Sample audio + sample_voice = y_hat_vis[0].squeeze(0).detach().cpu().numpy() + tb_logger.tb_train_audios(global_step, + {'train/audio': sample_voice}, + c.audio["sample_rate"]) end_time = time.time() # print epoch stats @@ -286,7 +290,8 @@ def train(model_G, criterion_G, optimizer_G, model_D, criterion_D, optimizer_D, # Plot Training Epoch Stats epoch_stats = {"epoch_time": epoch_time} epoch_stats.update(keep_avg.avg_values) - tb_logger.tb_train_epoch_stats(global_step, epoch_stats) + if args.rank == 0: + tb_logger.tb_train_epoch_stats(global_step, epoch_stats) # TODO: plot model stats # if c.tb_model_param_stats: # tb_logger.tb_model_weights(model, global_step) @@ -417,20 +422,21 @@ def evaluate(model_G, criterion_G, model_D, criterion_D, ap, global_step, epoch) if c.print_eval: c_logger.print_eval_step(num_iter, loss_dict, keep_avg.avg_values) - # compute spectrograms - figures = plot_results(y_hat, y_G, ap, global_step, 'eval') - tb_logger.tb_eval_figures(global_step, figures) + if args.rank == 0: + # compute spectrograms + figures = plot_results(y_hat, y_G, ap, global_step, 'eval') + tb_logger.tb_eval_figures(global_step, figures) - # Sample audio - sample_voice = y_hat[0].squeeze(0).detach().cpu().numpy() - tb_logger.tb_eval_audios(global_step, {'eval/audio': sample_voice}, - c.audio["sample_rate"]) + # Sample audio + sample_voice = y_hat[0].squeeze(0).detach().cpu().numpy() + tb_logger.tb_eval_audios(global_step, {'eval/audio': sample_voice}, + c.audio["sample_rate"]) - # synthesize a full voice + tb_logger.tb_eval_stats(global_step, keep_avg.avg_values) + + # synthesize a full voice data_loader.return_segments = False - tb_logger.tb_eval_stats(global_step, keep_avg.avg_values) - return keep_avg.avg_values @@ -450,9 +456,9 @@ def main(args): # pylint: disable=redefined-outer-name ap = AudioProcessor(**c.audio) # DISTRUBUTED - # if num_gpus > 1: - # init_distributed(args.rank, num_gpus, args.group_id, - # c.distributed["backend"], c.distributed["url"]) + if num_gpus > 1: + init_distributed(args.rank, num_gpus, args.group_id, + c.distributed["backend"], c.distributed["url"]) # setup models model_gen = setup_generator(c) @@ -532,8 +538,9 @@ def main(args): # pylint: disable=redefined-outer-name criterion_disc.cuda() # DISTRUBUTED - # if num_gpus > 1: - # model = apply_gradient_allreduce(model) + if num_gpus > 1: + model_gen = DDP_th(model_gen, device_ids=[args.rank]) + model_disc = DDP_th(model_disc, device_ids=[args.rank]) num_params = count_parameters(model_gen) print(" > Generator has {} parameters".format(num_params), flush=True) diff --git a/TTS/bin/train_glow_tts.py b/TTS/bin/train_glow_tts.py index 535bf8fd..e30ddc59 100644 --- a/TTS/bin/train_glow_tts.py +++ b/TTS/bin/train_glow_tts.py @@ -11,6 +11,7 @@ import traceback import torch from random import randrange from torch.utils.data import DataLoader + from TTS.tts.datasets.preprocess import load_meta_data 
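# ---------------------------------------------------------------------------
# [Illustrative sketch, not part of the patch above.] The distributed changes
# in this series follow the standard one-process-per-GPU PyTorch recipe:
# initialize the process group, shard the dataset with DistributedSampler,
# wrap the model in DistributedDataParallel, and let only rank 0 log/save.
# A minimal, self-contained version of that pattern is sketched below; the
# toy Linear model, the TensorDataset and the tcp:// URL are assumptions for
# the example and are not the TTS.utils.distribute helpers themselves.
import torch
import torch.distributed as dist
import torch.multiprocessing as mp
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.utils.data import DataLoader, TensorDataset
from torch.utils.data.distributed import DistributedSampler


def run(rank, world_size, url="tcp://localhost:54321"):
    dist.init_process_group("nccl", init_method=url,
                            world_size=world_size, rank=rank)
    torch.cuda.set_device(rank)

    model = torch.nn.Linear(80, 1).cuda(rank)              # stand-in model
    model = DDP(model, device_ids=[rank])                   # as in the patch

    dataset = TensorDataset(torch.randn(256, 80), torch.randn(256, 1))
    sampler = DistributedSampler(dataset, shuffle=True)     # per-rank shard
    loader = DataLoader(dataset, batch_size=16,
                        shuffle=False, sampler=sampler)     # sampler shuffles

    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
    for epoch in range(2):
        sampler.set_epoch(epoch)                            # reshuffle shards
        for x, y in loader:
            loss = torch.nn.functional.l1_loss(model(x.cuda(rank)), y.cuda(rank))
            optimizer.zero_grad()
            loss.backward()                                 # DDP syncs grads here
            optimizer.step()
        if rank == 0:                                       # rank-0-only logging
            print(f"epoch {epoch} loss {loss.item():.4f}")
    dist.destroy_process_group()


if __name__ == "__main__":
    n_gpus = torch.cuda.device_count()
    mp.spawn(run, args=(n_gpus,), nprocs=n_gpus)
# ---------------------------------------------------------------------------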
from TTS.tts.datasets.TTSDataset import MyDataset from TTS.tts.layers.losses import GlowTTSLoss @@ -34,6 +35,13 @@ from TTS.utils.tensorboard_logger import TensorboardLogger from TTS.utils.training import (NoamLR, check_update, setup_torch_training_env) +# DISTRIBUTED +from apex.parallel import DistributedDataParallel as DDP_apex +from torch.nn.parallel import DistributedDataParallel as DDP_th +from torch.utils.data.distributed import DistributedSampler +from TTS.utils.distribute import init_distributed, reduce_tensor + + use_cuda, num_gpus = setup_torch_training_env(True, False) def setup_loader(ap, r, is_val=False, verbose=False, speaker_mapping=None): @@ -481,10 +489,9 @@ def main(args): # pylint: disable=redefined-outer-name optimizer = RAdam(model.parameters(), lr=c.lr, weight_decay=0, betas=(0.9, 0.98), eps=1e-9) criterion = GlowTTSLoss() - if c.apex_amp_level: + if c.apex_amp_level is not None: # pylint: disable=import-outside-toplevel from apex import amp - from apex.parallel import DistributedDataParallel as DDP model.cuda() model, optimizer = amp.initialize(model, optimizer, opt_level=c.apex_amp_level) else: @@ -523,7 +530,10 @@ def main(args): # pylint: disable=redefined-outer-name # DISTRUBUTED if num_gpus > 1: - model = DDP(model) + if c.apex_amp_level is not None: + model = DDP_apex(model) + else: + model = DDP_th(model, device_ids=[args.rank]) if c.noam_schedule: scheduler = NoamLR(optimizer, diff --git a/TTS/bin/train_tts.py b/TTS/bin/train_tts.py index e4f8bf7a..8029ab21 100644 --- a/TTS/bin/train_tts.py +++ b/TTS/bin/train_tts.py @@ -38,8 +38,10 @@ from TTS.utils.training import (NoamLR, adam_weight_decay, check_update, gradual_training_scheduler, set_weight_decay, setup_torch_training_env) + use_cuda, num_gpus = setup_torch_training_env(True, False) + def setup_loader(ap, r, is_val=False, verbose=False, speaker_mapping=None): if is_val and not c.run_eval: loader = None diff --git a/TTS/bin/train_wavegrad.py b/TTS/bin/train_wavegrad.py index 469df638..6d17b4f2 100644 --- a/TTS/bin/train_wavegrad.py +++ b/TTS/bin/train_wavegrad.py @@ -4,12 +4,9 @@ import os import sys import time import traceback -from inspect import signature import torch from torch.utils.data import DataLoader -from torch.nn.parallel import DistributedDataParallel as DDP -from torch.utils.data.distributed import DistributedSampler from TTS.utils.audio import AudioProcessor from TTS.utils.console_logger import ConsoleLogger @@ -20,14 +17,18 @@ from TTS.utils.io import copy_config_file, load_config from TTS.utils.radam import RAdam from TTS.utils.tensorboard_logger import TensorboardLogger from TTS.utils.training import setup_torch_training_env -from TTS.vocoder.datasets.wavegrad_dataset import WaveGradDataset from TTS.vocoder.datasets.preprocess import load_wav_data, load_wav_feat_data -from TTS.utils.distribute import init_distributed, reduce_tensor -from TTS.vocoder.layers.losses import DiscriminatorLoss, GeneratorLoss -from TTS.vocoder.utils.generic_utils import (plot_results, setup_discriminator, - setup_generator) +from TTS.vocoder.datasets.wavegrad_dataset import WaveGradDataset +from TTS.vocoder.utils.generic_utils import plot_results, setup_generator from TTS.vocoder.utils.io import save_best_model, save_checkpoint +# DISTRIBUTED +from apex.parallel import DistributedDataParallel as DDP_apex +from torch.nn.parallel import DistributedDataParallel as DDP_th +from torch.utils.data.distributed import DistributedSampler +from TTS.utils.distribute import init_distributed + + use_cuda, num_gpus = 
setup_torch_training_env(True, True) @@ -111,11 +112,6 @@ def train(model, criterion, optimizer, else: loss.backward() - if amp: - amp_opt_params = amp.master_params(optimizer) - else: - amp_opt_params = None - if c.clip_grad > 0: grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), c.clip_grad) @@ -279,7 +275,6 @@ def evaluate(model, criterion, ap, global_step, epoch): return keep_avg.avg_values -# FIXME: move args definition/parsing inside of main? def main(args): # pylint: disable=redefined-outer-name # pylint: disable=global-variable-undefined global train_data, eval_data @@ -305,10 +300,9 @@ def main(args): # pylint: disable=redefined-outer-name optimizer = RAdam(model.parameters(), lr=c.lr, weight_decay=0) # DISTRIBUTED - if c.apex_amp_level: + if c.apex_amp_level is not None: # pylint: disable=import-outside-toplevel from apex import amp - from apex.parallel import DistributedDataParallel as DDP model.cuda() model, optimizer = amp.initialize(model, optimizer, opt_level=c.apex_amp_level) else: @@ -363,7 +357,10 @@ def main(args): # pylint: disable=redefined-outer-name # DISTRUBUTED if num_gpus > 1: - model = DDP(model) + if c.apex_amp_level is not None: + model = DDP_apex(model) + else: + model = DDP_th(model, device_ids=[args.rank]) num_params = count_parameters(model) print(" > WaveGrad has {} parameters".format(num_params), flush=True) @@ -447,7 +444,7 @@ if __name__ == '__main__': _ = os.path.dirname(os.path.realpath(__file__)) # DISTRIBUTED - if c.apex_amp_level: + if c.apex_amp_level is not None: print(" > apex AMP level: ", c.apex_amp_level) OUT_PATH = args.continue_path diff --git a/TTS/tts/configs/glow_tts_gated_conv.json b/TTS/tts/configs/glow_tts_gated_conv.json index 5c30e0bc..dbcdbbde 100644 --- a/TTS/tts/configs/glow_tts_gated_conv.json +++ b/TTS/tts/configs/glow_tts_gated_conv.json @@ -54,9 +54,10 @@ "add_blank": false, // if true add a new token after each token of the sentence. This increases the size of the input sequence, but has considerably improved the prosody of the GlowTTS model. // DISTRIBUTED TRAINING + "apex_amp_level": null, // APEX amp optimization level. "O1" is currently supported. "distributed":{ "backend": "nccl", - "url": "tcp:\/\/localhost:54321" + "url": "tcp:\/\/localhost:54323" }, "reinit_layers": [], // give a list of layer names to restore from the given checkpoint. If not defined, it reloads all heuristically matching layers. diff --git a/TTS/vocoder/configs/universal_fullband_melgan.json b/TTS/vocoder/configs/universal_fullband_melgan.json index 8882c3e7..fe4433c2 100644 --- a/TTS/vocoder/configs/universal_fullband_melgan.json +++ b/TTS/vocoder/configs/universal_fullband_melgan.json @@ -31,13 +31,13 @@ "symmetric_norm": true, // move normalization to range [-1, 1] "max_norm": 4.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm] "clip_norm": true, // clip normalized values into the range. - "stats_path": "/data/rw/home/Data/LibriTTS/scale_stats.npy" // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based notmalization is used and other normalization params are ignored + "stats_path": "/home/erogol/Data/libritts/LibriTTS/scale_stats.npy" // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. 
If it is defined, mean-std based notmalization is used and other normalization params are ignored }, // DISTRIBUTED TRAINING "distributed":{ "backend": "nccl", - "url": "tcp:\/\/localhost:54321" + "url": "tcp:\/\/localhost:54324" }, // MODEL PARAMETERS @@ -83,7 +83,7 @@ }, // DATASET - "data_path": "/data5/rw/home/Data/LibriTTS/LibriTTS/train-clean-360/", + "data_path": "/home/erogol/Data/libritts/LibriTTS/train-clean-360/", "feature_path": null, "seq_len": 16384, "pad_short": 2000, @@ -132,7 +132,7 @@ "eval_split_size": 10, // PATHS - "output_path": "/data4/rw/home/Trainings/LJSpeech/" + "output_path": "/home/erogol/Models/" } diff --git a/TTS/vocoder/configs/wavegrad_libritts.json b/TTS/vocoder/configs/wavegrad_libritts.json index 79672c71..98de36c7 100644 --- a/TTS/vocoder/configs/wavegrad_libritts.json +++ b/TTS/vocoder/configs/wavegrad_libritts.json @@ -34,10 +34,10 @@ }, // DISTRIBUTED TRAINING - "apex_amp_level": "O1", // amp optimization level. "O1" is currentl supported. + "apex_amp_level": null, // APEX amp optimization level. "O1" is currently supported. "distributed":{ "backend": "nccl", - "url": "tcp:\/\/localhost:54321" + "url": "tcp:\/\/localhost:54322" }, "target_loss": "avg_wavegrad_loss", // loss value to pick the best model to save after each epoch @@ -47,7 +47,7 @@ "model_params":{ "x_conv_channels":32, "c_conv_channels":768, - "ublock_out_channels": [768, 512, 512, 256, 128], + "ublock_out_channels": [512, 512, 256, 128, 128], "dblock_out_channels": [128, 128, 256, 512], "upsample_factors": [4, 4, 4, 2, 2], "upsample_dilations": [ From 7bcdb7ac3540da3a9377ff8a47ff227041a06963 Mon Sep 17 00:00:00 2001 From: erogol Date: Mon, 19 Oct 2020 15:44:07 +0200 Subject: [PATCH 52/98] wavegrad updates --- TTS/bin/train_wavegrad.py | 84 +++++++++++++------ .../multiband_melgan_config_mozilla.json | 14 +++- TTS/vocoder/configs/wavegrad_libritts.json | 10 +++ TTS/vocoder/datasets/wavegrad_dataset.py | 7 ++ TTS/vocoder/layers/wavegrad.py | 35 +++++--- TTS/vocoder/models/wavegrad.py | 45 +++++----- 6 files changed, 129 insertions(+), 66 deletions(-) diff --git a/TTS/bin/train_wavegrad.py b/TTS/bin/train_wavegrad.py index 6d17b4f2..e167a4cb 100644 --- a/TTS/bin/train_wavegrad.py +++ b/TTS/bin/train_wavegrad.py @@ -50,22 +50,35 @@ def setup_loader(ap, is_val=False, verbose=False): sampler = DistributedSampler(dataset) if num_gpus > 1 else None loader = DataLoader(dataset, batch_size=c.batch_size, - shuffle=False if num_gpus > 1 else True, + shuffle=num_gpus <= 1, drop_last=False, sampler=sampler, num_workers=c.num_val_loader_workers if is_val else c.num_loader_workers, pin_memory=False) + + return loader def format_data(data): # return a whole audio segment - m, y = data + m, x = data if use_cuda: m = m.cuda(non_blocking=True) - y = y.cuda(non_blocking=True) - return m, y + x = x.cuda(non_blocking=True) + return m, x + + +def format_test_data(data): + # return a whole audio segment + m, x = data + m = m.unsqueeze(0) + x = x.unsqueeze(0) + if use_cuda: + m = m.cuda(non_blocking=True) + x = x.cuda(non_blocking=True) + return m, x def train(model, criterion, optimizer, @@ -81,26 +94,36 @@ def train(model, criterion, optimizer, batch_n_iter = int(len(data_loader.dataset) / c.batch_size) end_time = time.time() c_logger.print_train_start() + # setup noise schedule + noise_schedule = c['train_noise_schedule'] + if hasattr(model, 'module'): + model.module.init_noise_schedule(noise_schedule['num_steps'], + noise_schedule['min_val'], + noise_schedule['max_val']) + else: + 
model.init_noise_schedule(noise_schedule['num_steps'], + noise_schedule['min_val'], + noise_schedule['max_val']) for num_iter, data in enumerate(data_loader): start_time = time.time() # format data - m, y = format_data(data) + m, x = format_data(data) loader_time = time.time() - end_time global_step += 1 # compute noisy input if hasattr(model, 'module'): - y_noisy, noise_scale = model.module.compute_noisy_x(y) + noise, x_noisy, noise_scale = model.module.compute_noisy_x(x) else: - y_noisy, noise_scale = model.compute_noisy_x(y) + noise, x_noisy, noise_scale = model.compute_noisy_x(x) # forward pass - y_hat = model(y_noisy, m, noise_scale) + noise_hat = model(x_noisy, m, noise_scale) # compute losses - loss = criterion(y_noisy, y_hat) + loss = criterion(noise, noise_hat) loss_wavegrad_dict = {'wavegrad_loss':loss} # backward pass with loss scaling @@ -181,15 +204,6 @@ def train(model, criterion, optimizer, OUT_PATH, model_losses=loss_dict) - # compute spectrograms - figures = plot_results(y_hat[0], y[0], ap, global_step, 'train') - tb_logger.tb_train_figures(global_step, figures) - - # Sample audio - sample_voice = y_hat[0].squeeze(0).detach().cpu().numpy() - tb_logger.tb_train_audios(global_step, - {'train/audio': sample_voice}, - c.audio["sample_rate"]) end_time = time.time() # print epoch stats @@ -218,23 +232,23 @@ def evaluate(model, criterion, ap, global_step, epoch): start_time = time.time() # format data - m, y = format_data(data) + m, x = format_data(data) loader_time = time.time() - end_time global_step += 1 # compute noisy input if hasattr(model, 'module'): - y_noisy, noise_scale = model.module.compute_noisy_x(y) + noise, x_noisy, noise_scale = model.module.compute_noisy_x(x) else: - y_noisy, noise_scale = model.compute_noisy_x(y) + noise, x_noisy, noise_scale = model.compute_noisy_x(x) # forward pass - y_hat = model(y_noisy, m, noise_scale) + noise_hat = model(x_noisy, m, noise_scale) # compute losses - loss = criterion(y_noisy, y_hat) + loss = criterion(noise, noise_hat) loss_wavegrad_dict = {'wavegrad_loss':loss} @@ -261,14 +275,32 @@ def evaluate(model, criterion, ap, global_step, epoch): c_logger.print_eval_step(num_iter, loss_dict, keep_avg.avg_values) if args.rank == 0: + samples = data_loader.dataset.load_test_samples(1) + m, x = format_test_data(samples[0]) + + # setup noise schedule and inference + noise_schedule = c['test_noise_schedule'] + if hasattr(model, 'module'): + model.module.init_noise_schedule(noise_schedule['num_steps'], + noise_schedule['min_val'], + noise_schedule['max_val']) + # compute voice + x_pred = model.module.inference(m) + else: + model.init_noise_schedule(noise_schedule['num_steps'], + noise_schedule['min_val'], + noise_schedule['max_val']) + # compute voice + x_pred = model.inference(m) + # compute spectrograms - figures = plot_results(y_hat, y, ap, global_step, 'eval') + figures = plot_results(x_pred, x, ap, global_step, 'eval') tb_logger.tb_eval_figures(global_step, figures) # Sample audio - sample_voice = y_hat[0].squeeze(0).detach().cpu().numpy() + sample_voice = x_pred[0].squeeze(0).detach().cpu().numpy() tb_logger.tb_eval_audios(global_step, {'eval/audio': sample_voice}, - c.audio["sample_rate"]) + c.audio["sample_rate"]) tb_logger.tb_eval_stats(global_step, keep_avg.avg_values) diff --git a/TTS/vocoder/configs/multiband_melgan_config_mozilla.json b/TTS/vocoder/configs/multiband_melgan_config_mozilla.json index 35f1642a..4978d42f 100644 --- a/TTS/vocoder/configs/multiband_melgan_config_mozilla.json +++ 
b/TTS/vocoder/configs/multiband_melgan_config_mozilla.json @@ -92,8 +92,8 @@ // DATASET "data_path": "/home/erogol/Data/MozillaMerged22050/wavs/", "feature_path": null, - "seq_len": 16384, - "pad_short": 2000, + "seq_len": 6144, + "pad_short": 500, "conv_pad": 0, "use_noise_augment": false, "use_cache": true, @@ -102,6 +102,16 @@ // TRAINING "batch_size": 64, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'. + "train_noise_schedule":{ + "min_val": 1e-6, + "max_val": 1e-2, + "num_steps": 1000 + }, + "test_noise_schedule":{ + "min_val": 1e-6, + "max_val": 1e-2, + "num_steps": 50 + } // VALIDATION "run_eval": true, diff --git a/TTS/vocoder/configs/wavegrad_libritts.json b/TTS/vocoder/configs/wavegrad_libritts.json index 98de36c7..9bb1154b 100644 --- a/TTS/vocoder/configs/wavegrad_libritts.json +++ b/TTS/vocoder/configs/wavegrad_libritts.json @@ -71,6 +71,16 @@ // TRAINING "batch_size": 64, // Batch size for training. + "train_noise_schedule":{ + "min_val": 1e-6, + "max_val": 1e-2, + "num_steps": 1000 + }, + "test_noise_schedule":{ + "min_val": 1e-6, + "max_val": 1e-2, + "num_steps": 50 + }, // VALIDATION "run_eval": true, // enable/disable evaluation run diff --git a/TTS/vocoder/datasets/wavegrad_dataset.py b/TTS/vocoder/datasets/wavegrad_dataset.py index 4a70c252..1e4f9e11 100644 --- a/TTS/vocoder/datasets/wavegrad_dataset.py +++ b/TTS/vocoder/datasets/wavegrad_dataset.py @@ -62,6 +62,13 @@ class WaveGradDataset(Dataset): item = self.load_item(idx) return item + def load_test_samples(self, num_samples): + samples = [] + for idx in range(num_samples): + mel, audio = self.load_item(idx) + samples.append([mel, audio]) + return samples + def load_item(self, idx): """ load (audio, feat) couple """ if self.compute_feat: diff --git a/TTS/vocoder/layers/wavegrad.py b/TTS/vocoder/layers/wavegrad.py index 69bca0a8..c7549676 100644 --- a/TTS/vocoder/layers/wavegrad.py +++ b/TTS/vocoder/layers/wavegrad.py @@ -4,6 +4,16 @@ from torch import nn from torch.nn import functional as F +class Conv1d(nn.Conv1d): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.reset_parameters() + + def reset_parameters(self): + nn.init.orthogonal_(self.weight) + nn.init.zeros_(self.bias) + + class NoiseLevelEncoding(nn.Module): """Noise level encoding applying same encoding vector to all time steps. 
It is @@ -25,7 +35,8 @@ class NoiseLevelEncoding(nn.Module): """ return (x + self.encoding(noise_level)[:, :, None]) - def init_encoding(self, length): + @staticmethod + def init_encoding(length): div_by = torch.arange(length) / length enc = torch.exp(-math.log(1e4) * div_by.unsqueeze(0)) return enc @@ -46,8 +57,8 @@ class FiLM(nn.Module): def __init__(self, in_channels, out_channels): super().__init__() self.encoding = NoiseLevelEncoding(in_channels) - self.conv_in = nn.Conv1d(in_channels, in_channels, 3, padding=1) - self.conv_out = nn.Conv1d(in_channels, out_channels * 2, 3, padding=1) + self.conv_in = Conv1d(in_channels, in_channels, 3, padding=1) + self.conv_out = Conv1d(in_channels, out_channels * 2, 3, padding=1) self._init_parameters() def _init_parameters(self): @@ -74,26 +85,26 @@ class UBlock(nn.Module): assert len(dilations) == 4 self.upsample_factor = upsample_factor - self.shortcut_conv = nn.Conv1d(in_channels, hid_channels, 1) + self.shortcut_conv = Conv1d(in_channels, hid_channels, 1) self.main_block1 = nn.ModuleList([ - nn.Conv1d(in_channels, + Conv1d(in_channels, hid_channels, 3, dilation=dilations[0], padding=dilations[0]), - nn.Conv1d(hid_channels, + Conv1d(hid_channels, hid_channels, 3, dilation=dilations[1], padding=dilations[1]) ]) self.main_block2 = nn.ModuleList([ - nn.Conv1d(hid_channels, + Conv1d(hid_channels, hid_channels, 3, dilation=dilations[2], padding=dilations[2]), - nn.Conv1d(hid_channels, + Conv1d(hid_channels, hid_channels, 3, dilation=dilations[3], @@ -129,11 +140,11 @@ class DBlock(nn.Module): def __init__(self, in_channels, hid_channels, downsample_factor): super().__init__() self.downsample_factor = downsample_factor - self.res_conv = nn.Conv1d(in_channels, hid_channels, 1) + self.res_conv = Conv1d(in_channels, hid_channels, 1) self.main_convs = nn.ModuleList([ - nn.Conv1d(in_channels, hid_channels, 3, dilation=1, padding=1), - nn.Conv1d(hid_channels, hid_channels, 3, dilation=2, padding=2), - nn.Conv1d(hid_channels, hid_channels, 3, dilation=4, padding=4), + Conv1d(in_channels, hid_channels, 3, dilation=1, padding=1), + Conv1d(hid_channels, hid_channels, 3, dilation=2, padding=2), + Conv1d(hid_channels, hid_channels, 3, dilation=4, padding=4), ]) def forward(self, x): diff --git a/TTS/vocoder/models/wavegrad.py b/TTS/vocoder/models/wavegrad.py index 6405bea8..95e5b03a 100644 --- a/TTS/vocoder/models/wavegrad.py +++ b/TTS/vocoder/models/wavegrad.py @@ -22,14 +22,6 @@ class Wavegrad(nn.Module): assert len(upsample_factors) == len(upsample_dilations) assert len(upsample_factors) == len(ublock_out_channels) - # inference time noise schedule params - self.S = 1000 - beta, alpha, alpha_cum, noise_level = self._setup_noise_level() - self.register_buffer('beta', beta) - self.register_buffer('alpha', alpha) - self.register_buffer('alpha_cum', alpha_cum) - self.register_buffer('noise_level', noise_level) - # setup up-down sampling parameters self.hop_length = np.prod(upsample_factors) self.upsample_factors = upsample_factors @@ -68,21 +60,23 @@ class Wavegrad(nn.Module): # print(ic, 'last_conv--', out_channels) self.last_conv = nn.Conv1d(ic, out_channels, 3, padding=1) - def _setup_noise_level(self, noise_schedule=None): - """compute noise schedule parameters""" - if noise_schedule is None: - beta = np.linspace(1e-6, 0.01, self.S) - else: - beta = noise_schedule - alpha = 1 - beta - alpha_cum = np.cumprod(alpha) - noise_level = np.concatenate([[1.0], alpha_cum ** 0.5], axis=0) + # inference time noise schedule params + self.S = 1000 + 
self.init_noise_schedule(self.S) - beta = torch.from_numpy(beta) - alpha = torch.from_numpy(alpha) - alpha_cum = torch.from_numpy(alpha_cum) - noise_level = torch.from_numpy(noise_level.astype(np.float32)) - return beta, alpha, alpha_cum, noise_level + + def init_noise_schedule(self, num_iter, min_val=1e-6, max_val=0.01): + """compute noise schedule parameters""" + device = self.last_conv.weight.device + beta = torch.linspace(min_val, max_val, num_iter).to(device) + alpha = 1 - beta + alpha_cum = alpha.cumprod(dim=0) + noise_level = torch.cat([torch.FloatTensor([1]).to(device), alpha_cum ** 0.5]) + + self.register_buffer('beta', beta) + self.register_buffer('alpha', alpha) + self.register_buffer('alpha_cum', alpha_cum) + self.register_buffer('noise_level', noise_level) def compute_noisy_x(self, x): B = x.shape[0] @@ -94,7 +88,7 @@ class Wavegrad(nn.Module): noise_scale = noise_scale.unsqueeze(1) noise = torch.randn_like(x) noisy_x = noise_scale * x + (1.0 - noise_scale**2)**0.5 * noise - return noisy_x.unsqueeze(1), noise_scale[:, 0] + return noise.unsqueeze(1), noisy_x.unsqueeze(1), noise_scale[:, 0] def forward(self, x, c, noise_scale): assert len(c.shape) == 3 # B, C, T @@ -114,9 +108,8 @@ class Wavegrad(nn.Module): def inference(self, c): with torch.no_grad(): - x = torch.randn(c.shape[0], self.hop_length * c.shape[-1]).to(c) - noise_scale = torch.from_numpy( - self.alpha_cum**0.5).float().unsqueeze(1).to(c) + x = torch.randn(c.shape[0], 1, self.hop_length * c.shape[-1]).to(c) + noise_scale = (self.alpha_cum**0.5).unsqueeze(1).to(c) for n in range(len(self.alpha) - 1, -1, -1): c1 = 1 / self.alpha[n]**0.5 c2 = (1 - self.alpha[n]) / (1 - self.alpha_cum[n])**0.5 From f79bbbbd00ff16b70bd087fb79b26323f7dc8358 Mon Sep 17 00:00:00 2001 From: erogol Date: Mon, 19 Oct 2020 17:56:14 +0200 Subject: [PATCH 53/98] use Adam for wavegras instead of RAdam --- TTS/bin/train_wavegrad.py | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/TTS/bin/train_wavegrad.py b/TTS/bin/train_wavegrad.py index e167a4cb..04af1595 100644 --- a/TTS/bin/train_wavegrad.py +++ b/TTS/bin/train_wavegrad.py @@ -6,15 +6,19 @@ import time import traceback import torch +# DISTRIBUTED +from apex.parallel import DistributedDataParallel as DDP_apex +from torch.nn.parallel import DistributedDataParallel as DDP_th +from torch.optim import Adam from torch.utils.data import DataLoader - +from torch.utils.data.distributed import DistributedSampler from TTS.utils.audio import AudioProcessor from TTS.utils.console_logger import ConsoleLogger +from TTS.utils.distribute import init_distributed from TTS.utils.generic_utils import (KeepAverage, count_parameters, create_experiment_folder, get_git_branch, remove_experiment_folder, set_init_dict) from TTS.utils.io import copy_config_file, load_config -from TTS.utils.radam import RAdam from TTS.utils.tensorboard_logger import TensorboardLogger from TTS.utils.training import setup_torch_training_env from TTS.vocoder.datasets.preprocess import load_wav_data, load_wav_feat_data @@ -22,13 +26,6 @@ from TTS.vocoder.datasets.wavegrad_dataset import WaveGradDataset from TTS.vocoder.utils.generic_utils import plot_results, setup_generator from TTS.vocoder.utils.io import save_best_model, save_checkpoint -# DISTRIBUTED -from apex.parallel import DistributedDataParallel as DDP_apex -from torch.nn.parallel import DistributedDataParallel as DDP_th -from torch.utils.data.distributed import DistributedSampler -from TTS.utils.distribute import init_distributed - - use_cuda, 
num_gpus = setup_torch_training_env(True, True) @@ -329,7 +326,7 @@ def main(args): # pylint: disable=redefined-outer-name model = setup_generator(c) # setup optimizers - optimizer = RAdam(model.parameters(), lr=c.lr, weight_decay=0) + optimizer = Adam(model.parameters(), lr=c.lr, weight_decay=0) # DISTRIBUTED if c.apex_amp_level is not None: From 670f44aa187ed6812352266baa7089533d37fee6 Mon Sep 17 00:00:00 2001 From: erogol Date: Mon, 26 Oct 2020 16:45:11 +0100 Subject: [PATCH 54/98] enable compute stats by vocoder config --- TTS/bin/compute_statistics.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/TTS/bin/compute_statistics.py b/TTS/bin/compute_statistics.py index ca089d3e..7642f86b 100755 --- a/TTS/bin/compute_statistics.py +++ b/TTS/bin/compute_statistics.py @@ -2,6 +2,7 @@ # -*- coding: utf-8 -*- import os +import glob import argparse import numpy as np @@ -31,7 +32,10 @@ def main(): ap = AudioProcessor(**CONFIG.audio) # load the meta data of target dataset - dataset_items = load_meta_data(CONFIG.datasets)[0] # take only train data + if 'data_path' in CONFIG.keys(): + dataset_items = glob.glob(os.path.join(CONFIG.data_path, '**', '*.wav'), recursive=True) + else: + dataset_items = load_meta_data(CONFIG.datasets)[0] # take only train data print(f" > There are {len(dataset_items)} files.") mel_sum = 0 @@ -41,7 +45,7 @@ def main(): N = 0 for item in tqdm(dataset_items): # compute features - wav = ap.load_wav(item[1]) + wav = ap.load_wav(item if isinstance(item, str) else item[1]) linear = ap.spectrogram(wav) mel = ap.melspectrogram(wav) @@ -57,7 +61,7 @@ def main(): linear_mean = linear_sum / N linear_scale = np.sqrt(linear_square_sum / N - linear_mean ** 2) - output_file_path = os.path.join(args.out_path, "scale_stats.npy") + output_file_path = args.out_path stats = {} stats['mel_mean'] = mel_mean stats['mel_std'] = mel_scale @@ -79,7 +83,7 @@ def main(): del CONFIG.audio['clip_norm'] stats['audio_config'] = CONFIG.audio np.save(output_file_path, stats, allow_pickle=True) - print(f' > scale_stats.npy is saved to {output_file_path}') + print(f' > stats saved to {output_file_path}') if __name__ == "__main__": From c8a4c771a883774cc5195cb71f0be279f5d64013 Mon Sep 17 00:00:00 2001 From: erogol Date: Mon, 26 Oct 2020 16:46:26 +0100 Subject: [PATCH 55/98] train wavegrad updates --- TTS/bin/train_wavegrad.py | 36 ++++++++++++++++++++++-------------- 1 file changed, 22 insertions(+), 14 deletions(-) diff --git a/TTS/bin/train_wavegrad.py b/TTS/bin/train_wavegrad.py index 04af1595..db961047 100644 --- a/TTS/bin/train_wavegrad.py +++ b/TTS/bin/train_wavegrad.py @@ -7,8 +7,10 @@ import traceback import torch # DISTRIBUTED -from apex.parallel import DistributedDataParallel as DDP_apex -from torch.nn.parallel import DistributedDataParallel as DDP_th +try: + from apex.parallel import DistributedDataParallel as DDP_apex +except: + from torch.nn.parallel import DistributedDataParallel as DDP_th from torch.optim import Adam from torch.utils.data import DataLoader from torch.utils.data.distributed import DistributedSampler @@ -61,6 +63,7 @@ def setup_loader(ap, is_val=False, verbose=False): def format_data(data): # return a whole audio segment m, x = data + x = x.unsqueeze(1) if use_cuda: m = m.cuda(non_blocking=True) x = x.cuda(non_blocking=True) @@ -70,8 +73,8 @@ def format_data(data): def format_test_data(data): # return a whole audio segment m, x = data - m = m.unsqueeze(0) - x = x.unsqueeze(0) + m = m[None, ...] + x = x[None, None, ...] 
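# ---------------------------------------------------------------------------
# [Illustrative sketch, not part of the patch above.] The compute_statistics
# change in this patch walks a directory of wavs and accumulates running sums
# so that mean = S / N and scale = sqrt(S2 / N - mean^2), the formula used for
# linear_mean / linear_scale above. The exact accumulation is elided in the
# hunk, so the per-bin version below is an assumption consistent with that
# formula; the random (num_mels, T) arrays stand in for real mel spectrograms.
import numpy as np

num_mels = 80
fake_mels = [np.random.randn(num_mels, t) for t in (120, 95, 210)]

mel_sum = np.zeros(num_mels)
mel_square_sum = np.zeros(num_mels)
n_frames = 0
for mel in fake_mels:
    mel_sum += mel.sum(axis=1)                 # per-bin sum over frames
    mel_square_sum += (mel ** 2).sum(axis=1)   # per-bin sum of squares
    n_frames += mel.shape[1]

mel_mean = mel_sum / n_frames
mel_scale = np.sqrt(mel_square_sum / n_frames - mel_mean ** 2)
print(mel_mean.shape, mel_scale.shape)         # (80,) (80,)
# ---------------------------------------------------------------------------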
if use_cuda: m = m.cuda(non_blocking=True) x = x.cuda(non_blocking=True) @@ -94,11 +97,11 @@ def train(model, criterion, optimizer, # setup noise schedule noise_schedule = c['train_noise_schedule'] if hasattr(model, 'module'): - model.module.init_noise_schedule(noise_schedule['num_steps'], + model.module.compute_noise_level(noise_schedule['num_steps'], noise_schedule['min_val'], noise_schedule['max_val']) else: - model.init_noise_schedule(noise_schedule['num_steps'], + model.compute_noise_level(noise_schedule['num_steps'], noise_schedule['min_val'], noise_schedule['max_val']) for num_iter, data in enumerate(data_loader): @@ -112,15 +115,17 @@ def train(model, criterion, optimizer, # compute noisy input if hasattr(model, 'module'): - noise, x_noisy, noise_scale = model.module.compute_noisy_x(x) + noise, x_noisy, noise_scale = model.module.compute_y_n(x) else: - noise, x_noisy, noise_scale = model.compute_noisy_x(x) + noise, x_noisy, noise_scale = model.compute_y_n(x) # forward pass noise_hat = model(x_noisy, m, noise_scale) # compute losses loss = criterion(noise, noise_hat) + # if loss.item() > 100: + # breakpoint() loss_wavegrad_dict = {'wavegrad_loss':loss} # backward pass with loss scaling @@ -212,8 +217,8 @@ def train(model, criterion, optimizer, if args.rank == 0: tb_logger.tb_train_epoch_stats(global_step, epoch_stats) # TODO: plot model stats - # if c.tb_model_param_stats: - # tb_logger.tb_model_weights(model, global_step) + if c.tb_model_param_stats: + tb_logger.tb_model_weights(model, global_step) return keep_avg.avg_values, global_step @@ -236,9 +241,9 @@ def evaluate(model, criterion, ap, global_step, epoch): # compute noisy input if hasattr(model, 'module'): - noise, x_noisy, noise_scale = model.module.compute_noisy_x(x) + noise, x_noisy, noise_scale = model.module.compute_y_n(x) else: - noise, x_noisy, noise_scale = model.compute_noisy_x(x) + noise, x_noisy, noise_scale = model.compute_y_n(x) # forward pass @@ -272,19 +277,20 @@ def evaluate(model, criterion, ap, global_step, epoch): c_logger.print_eval_step(num_iter, loss_dict, keep_avg.avg_values) if args.rank == 0: + data_loader.dataset.return_segments = False samples = data_loader.dataset.load_test_samples(1) m, x = format_test_data(samples[0]) # setup noise schedule and inference noise_schedule = c['test_noise_schedule'] if hasattr(model, 'module'): - model.module.init_noise_schedule(noise_schedule['num_steps'], + model.module.compute_noise_level(noise_schedule['num_steps'], noise_schedule['min_val'], noise_schedule['max_val']) # compute voice x_pred = model.module.inference(m) else: - model.init_noise_schedule(noise_schedule['num_steps'], + model.compute_noise_level(noise_schedule['num_steps'], noise_schedule['min_val'], noise_schedule['max_val']) # compute voice @@ -300,6 +306,7 @@ def evaluate(model, criterion, ap, global_step, epoch): c.audio["sample_rate"]) tb_logger.tb_eval_stats(global_step, keep_avg.avg_values) + data_loader.dataset.return_segments = True return keep_avg.avg_values @@ -333,6 +340,7 @@ def main(args): # pylint: disable=redefined-outer-name # pylint: disable=import-outside-toplevel from apex import amp model.cuda() + # optimizer.cuda() model, optimizer = amp.initialize(model, optimizer, opt_level=c.apex_amp_level) else: amp = None From 5b5b9fcfdde67899031ac3eab7d2f5b52de6dd40 Mon Sep 17 00:00:00 2001 From: erogol Date: Mon, 26 Oct 2020 16:46:50 +0100 Subject: [PATCH 56/98] wavegrad config updates --- TTS/vocoder/configs/wavegrad_libritts.json | 22 +++++++++++----------- 1 file changed, 11 
insertions(+), 11 deletions(-) diff --git a/TTS/vocoder/configs/wavegrad_libritts.json b/TTS/vocoder/configs/wavegrad_libritts.json index 9bb1154b..64958da2 100644 --- a/TTS/vocoder/configs/wavegrad_libritts.json +++ b/TTS/vocoder/configs/wavegrad_libritts.json @@ -30,11 +30,11 @@ "symmetric_norm": true, // move normalization to range [-1, 1] "max_norm": 4.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm] "clip_norm": true, // clip normalized values into the range. - "stats_path": "/home/erogol/Data/libritts/LibriTTS/scale_stats.npy" // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based notmalization is used and other normalization params are ignored + "stats_path": "/home/erogol/Data/libritts/LibriTTS/scale_stats_wavegrad.npy" // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based notmalization is used and other normalization params are ignored }, // DISTRIBUTED TRAINING - "apex_amp_level": null, // APEX amp optimization level. "O1" is currently supported. + "apex_amp_level": "O1", // APEX amp optimization level. "O1" is currently supported. "distributed":{ "backend": "nccl", "url": "tcp:\/\/localhost:54322" @@ -45,8 +45,8 @@ // MODEL PARAMETERS "generator_model": "wavegrad", "model_params":{ - "x_conv_channels":32, - "c_conv_channels":768, + "y_conv_channels":32, + "x_conv_channels":768, "ublock_out_channels": [512, 512, 256, 128, 128], "dblock_out_channels": [128, 128, 256, 512], "upsample_factors": [4, 4, 4, 2, 2], @@ -62,15 +62,15 @@ "data_path": "/home/erogol/Data/libritts/LibriTTS/train-clean-360/", // root data path. It finds all wav files recursively from there. "feature_path": null, // if you use precomputed features "seq_len": 6144, // 24 * hop_length - "pad_short": 2000, // additional padding for short wavs + "pad_short": 0, // additional padding for short wavs "conv_pad": 0, // additional padding against convolutions applied to spectrograms "use_noise_augment": false, // add noise to the audio signal for augmentation - "use_cache": true, // use in memory cache to keep the computed features. This might cause OOM. + "use_cache": false, // use in memory cache to keep the computed features. This might cause OOM. "reinit_layers": [], // give a list of layer names to restore from the given checkpoint. If not defined, it reloads all heuristically matching layers. // TRAINING - "batch_size": 64, // Batch size for training. + "batch_size": 96, // Batch size for training. "train_noise_schedule":{ "min_val": 1e-6, "max_val": 1e-2, @@ -87,7 +87,7 @@ // OPTIMIZER "epochs": 10000, // total number of epochs to train. - "clip_grad": 1, // Generator gradient clipping threshold. Apply gradient clipping if > 0 + "clip_grad": 1.0, // Generator gradient clipping threshold. Apply gradient clipping if > 0 "lr_scheduler": "MultiStepLR", // one of the schedulers from https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate "lr_scheduler_params": { "gamma": 0.5, @@ -96,16 +96,16 @@ "lr": 1e-4, // Initial learning rate. If Noam decay is active, maximum learning rate. // TENSORBOARD and LOGGING - "print_step": 25, // Number of steps to log traning on console. + "print_step": 50, // Number of steps to log traning on console. "print_eval": false, // If True, it prints loss values for each step in eval run. "save_step": 10000, // Number of training steps expected to plot training stats on TB and save model checkpoints. 
"checkpoint": true, // If true, it saves checkpoints per "save_step" - "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging. + "tb_model_param_stats": true, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging. // DATA LOADING "num_loader_workers": 4, // number of training data loader processes. Don't set it too big. 4-8 are good values. "num_val_loader_workers": 4, // number of evaluation data loader processes. - "eval_split_size": 10, + "eval_split_size": 256, // PATHS "output_path": "/home/erogol/Models/LJSpeech/" From dc2825dfb20dbbd78a763b2db777d1bcbd32e9e6 Mon Sep 17 00:00:00 2001 From: erogol Date: Mon, 26 Oct 2020 16:47:09 +0100 Subject: [PATCH 57/98] wavegrad dataset update --- TTS/vocoder/datasets/wavegrad_dataset.py | 65 ++++----- TTS/vocoder/layers/wavegrad.py | 175 +++++++++++------------ 2 files changed, 111 insertions(+), 129 deletions(-) diff --git a/TTS/vocoder/datasets/wavegrad_dataset.py b/TTS/vocoder/datasets/wavegrad_dataset.py index 1e4f9e11..c7b07b0d 100644 --- a/TTS/vocoder/datasets/wavegrad_dataset.py +++ b/TTS/vocoder/datasets/wavegrad_dataset.py @@ -28,7 +28,6 @@ class WaveGradDataset(Dataset): self.ap = ap self.item_list = items - self.compute_feat = not isinstance(items[0], (tuple, list)) self.seq_len = seq_len self.hop_len = hop_len self.pad_short = pad_short @@ -64,57 +63,49 @@ class WaveGradDataset(Dataset): def load_test_samples(self, num_samples): samples = [] + return_segments = self.return_segments + self.return_segments = False for idx in range(num_samples): mel, audio = self.load_item(idx) samples.append([mel, audio]) + self.return_segments = return_segments return samples def load_item(self, idx): """ load (audio, feat) couple """ - if self.compute_feat: - # compute features from wav - wavpath = self.item_list[idx] - # print(wavpath) + # compute features from wav + wavpath = self.item_list[idx] - if self.use_cache and self.cache[idx] is not None: - audio, mel = self.cache[idx] - else: - audio = self.ap.load_wav(wavpath) - - if len(audio) < self.seq_len + self.pad_short: - audio = np.pad(audio, (0, self.seq_len + self.pad_short - len(audio)), \ - mode='constant', constant_values=0.0) - - mel = self.ap.melspectrogram(audio) + if self.use_cache and self.cache[idx] is not None: + audio = self.cache[idx] else: + audio = self.ap.load_wav(wavpath) - # load precomputed features - wavpath, feat_path = self.item_list[idx] + # correct audio length wrt segment length + if audio.shape[-1] < self.seq_len + self.pad_short: + audio = np.pad(audio, (0, self.seq_len + self.pad_short - len(audio)), \ + mode='constant', constant_values=0.0) + assert audio.shape[-1] >= self.seq_len + self.pad_short, f"{audio.shape[-1]} vs {self.seq_len + self.pad_short}" - if self.use_cache and self.cache[idx] is not None: - audio, mel = self.cache[idx] - else: - audio = self.ap.load_wav(wavpath) - mel = np.load(feat_path) + # correct the audio length wrt hop length + p = (audio.shape[-1] // self.hop_len + 1) * self.hop_len - audio.shape[-1] + audio = np.pad(audio, (0, p), mode='constant', constant_values=0.0) - # correct the audio length wrt padding applied in stft - audio = np.pad(audio, (0, self.hop_len), mode="edge") - audio = audio[:mel.shape[-1] * self.hop_len] - assert mel.shape[-1] * self.hop_len == audio.shape[-1], f' [!] 
{mel.shape[-1] * self.hop_len} vs {audio.shape[-1]}' - - audio = torch.from_numpy(audio).float().unsqueeze(0) - mel = torch.from_numpy(mel).float().squeeze(0) + if self.use_cache: + self.cache[idx] = audio if self.return_segments: - max_mel_start = mel.shape[1] - self.feat_frame_len - mel_start = random.randint(0, max_mel_start) - mel_end = mel_start + self.feat_frame_len - mel = mel[:, mel_start:mel_end] - - audio_start = mel_start * self.hop_len - audio = audio[:, audio_start:audio_start + - self.seq_len] + max_start = len(audio) - self.seq_len + start = random.randint(0, max_start) + end = start + self.seq_len + audio = audio[start:end] if self.use_noise_augment and self.is_training and self.return_segments: audio = audio + (1 / 32768) * torch.randn_like(audio) + + mel = self.ap.melspectrogram(audio) + mel = mel[..., :-1] + + audio = torch.from_numpy(audio).float() + mel = torch.from_numpy(mel).float().squeeze(0) return (mel, audio) diff --git a/TTS/vocoder/layers/wavegrad.py b/TTS/vocoder/layers/wavegrad.py index c7549676..0b9dde48 100644 --- a/TTS/vocoder/layers/wavegrad.py +++ b/TTS/vocoder/layers/wavegrad.py @@ -1,32 +1,25 @@ -import math +import numpy as np import torch -from torch import nn -from torch.nn import functional as F +import torch.nn as nn +import torch.nn.functional as F + +from math import log as ln class Conv1d(nn.Conv1d): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - self.reset_parameters() - - def reset_parameters(self): nn.init.orthogonal_(self.weight) nn.init.zeros_(self.bias) -class NoiseLevelEncoding(nn.Module): - """Noise level encoding applying same - encoding vector to all time steps. It is - different than the original implementation.""" +class PositionalEncoding(nn.Module): def __init__(self, n_channels): super().__init__() self.n_channels = n_channels self.length = n_channels // 2 assert n_channels % 2 == 0 - enc = self.init_encoding(self.length) - self.register_buffer('enc', enc) - def forward(self, x, noise_level): """ Shapes: @@ -35,41 +28,34 @@ class NoiseLevelEncoding(nn.Module): """ return (x + self.encoding(noise_level)[:, :, None]) - @staticmethod - def init_encoding(length): - div_by = torch.arange(length) / length - enc = torch.exp(-math.log(1e4) * div_by.unsqueeze(0)) - return enc - def encoding(self, noise_level): - encoding = noise_level.unsqueeze(1) * self.enc - encoding = torch.cat( - [torch.sin(encoding), torch.cos(encoding)], dim=-1) + step = torch.arange( + self.length, dtype=noise_level.dtype, device=noise_level.device) / self.length + encoding = noise_level.unsqueeze(1) * torch.exp( + -ln(1e4) * step.unsqueeze(0)) + encoding = torch.cat([torch.sin(encoding), torch.cos(encoding)], dim=-1) return encoding class FiLM(nn.Module): - """Feature-wise Linear Modulation. It combines information from - both noisy waveform and input mel-spectrogram. 
The FiLM module - produces both scale and bias vectors given inputs, which are - used in a UBlock for feature-wise affine transformation.""" - - def __init__(self, in_channels, out_channels): + def __init__(self, input_size, output_size): super().__init__() - self.encoding = NoiseLevelEncoding(in_channels) - self.conv_in = Conv1d(in_channels, in_channels, 3, padding=1) - self.conv_out = Conv1d(in_channels, out_channels * 2, 3, padding=1) - self._init_parameters() + self.encoding = PositionalEncoding(input_size) + self.input_conv = nn.Conv1d(input_size, input_size, 3, padding=1) + self.output_conv = nn.Conv1d(input_size, output_size * 2, 3, padding=1) + self.ini_parameters() - def _init_parameters(self): - nn.init.orthogonal_(self.conv_in.weight) - nn.init.orthogonal_(self.conv_out.weight) + def ini_parameters(self): + nn.init.xavier_uniform_(self.input_conv.weight) + nn.init.xavier_uniform_(self.output_conv.weight) + nn.init.zeros_(self.input_conv.bias) + nn.init.zeros_(self.output_conv.bias) def forward(self, x, noise_scale): - x = self.conv_in(x) + x = self.input_conv(x) x = F.leaky_relu(x, 0.2) x = self.encoding(x, noise_scale) - shift, scale = torch.chunk(self.conv_out(x), 2, dim=1) + shift, scale = torch.chunk(self.output_conv(x), 2, dim=1) return shift, scale @@ -80,82 +66,87 @@ def shif_and_scale(x, scale, shift): class UBlock(nn.Module): - def __init__(self, in_channels, hid_channels, upsample_factor, dilations): + def __init__(self, input_size, hidden_size, factor, dilation): super().__init__() - assert len(dilations) == 4 + assert isinstance(dilation, (list, tuple)) + assert len(dilation) == 4 - self.upsample_factor = upsample_factor - self.shortcut_conv = Conv1d(in_channels, hid_channels, 1) - self.main_block1 = nn.ModuleList([ - Conv1d(in_channels, - hid_channels, + self.factor = factor + self.block1 = Conv1d(input_size, hidden_size, 1) + self.block2 = nn.ModuleList([ + Conv1d(input_size, + hidden_size, 3, - dilation=dilations[0], - padding=dilations[0]), - Conv1d(hid_channels, - hid_channels, + dilation=dilation[0], + padding=dilation[0]), + Conv1d(hidden_size, + hidden_size, 3, - dilation=dilations[1], - padding=dilations[1]) + dilation=dilation[1], + padding=dilation[1]) ]) - self.main_block2 = nn.ModuleList([ - Conv1d(hid_channels, - hid_channels, + self.block3 = nn.ModuleList([ + Conv1d(hidden_size, + hidden_size, 3, - dilation=dilations[2], - padding=dilations[2]), - Conv1d(hid_channels, - hid_channels, + dilation=dilation[2], + padding=dilation[2]), + Conv1d(hidden_size, + hidden_size, 3, - dilation=dilations[3], - padding=dilations[3]) + dilation=dilation[3], + padding=dilation[3]) ]) def forward(self, x, shift, scale): - upsample_size = x.shape[-1] * self.upsample_factor - x = F.interpolate(x, size=upsample_size) - res = self.shortcut_conv(x) + block1 = F.interpolate(x, size=x.shape[-1] * self.factor) + block1 = self.block1(block1) - o = F.leaky_relu(x, 0.2) - o = self.main_block1[0](o) - o = shif_and_scale(o, scale, shift) - o = F.leaky_relu(o, 0.2) - o = self.main_block1[1](o) + block2 = F.leaky_relu(x, 0.2) + block2 = F.interpolate(block2, size=x.shape[-1] * self.factor) + block2 = self.block2[0](block2) + # block2 = film_shift + film_scale * block2 + block2 = shif_and_scale(block2, scale, shift) + block2 = F.leaky_relu(block2, 0.2) + block2 = self.block2[1](block2) - o = o + res - res = o + x = block1 + block2 - o = shif_and_scale(o, scale, shift) - o = F.leaky_relu(o, 0.2) - o = self.main_block2[0](o) - o = shif_and_scale(o, scale, shift) - o = F.leaky_relu(o, 
0.2) - o = self.main_block2[1](o) + # block3 = film_shift + film_scale * x + block3 = shif_and_scale(x, scale, shift) + block3 = F.leaky_relu(block3, 0.2) + block3 = self.block3[0](block3) + # block3 = film_shift + film_scale * block3 + block3 = shif_and_scale(block3, scale, shift) + block3 = F.leaky_relu(block3, 0.2) + block3 = self.block3[1](block3) - o = o + res - return o + x = x + block3 + return x class DBlock(nn.Module): - def __init__(self, in_channels, hid_channels, downsample_factor): + def __init__(self, input_size, hidden_size, factor): super().__init__() - self.downsample_factor = downsample_factor - self.res_conv = Conv1d(in_channels, hid_channels, 1) - self.main_convs = nn.ModuleList([ - Conv1d(in_channels, hid_channels, 3, dilation=1, padding=1), - Conv1d(hid_channels, hid_channels, 3, dilation=2, padding=2), - Conv1d(hid_channels, hid_channels, 3, dilation=4, padding=4), + self.factor = factor + self.residual_dense = Conv1d(input_size, hidden_size, 1) + self.conv = nn.ModuleList([ + Conv1d(input_size, hidden_size, 3, dilation=1, padding=1), + Conv1d(hidden_size, hidden_size, 3, dilation=2, padding=2), + Conv1d(hidden_size, hidden_size, 3, dilation=4, padding=4), ]) def forward(self, x): - size = x.shape[-1] // self.downsample_factor + size = x.shape[-1] // self.factor - res = self.res_conv(x) - res = F.interpolate(res, size=size) + residual = self.residual_dense(x) + residual = F.interpolate(residual, size=size) + + x = F.interpolate(x, size=size) + for layer in self.conv: + x = F.leaky_relu(x, 0.2) + x = layer(x) + + return x + residual - o = F.interpolate(x, size=size) - for layer in self.main_convs: - o = F.leaky_relu(o, 0.2) - o = layer(o) - return o + res From b76a0be97a8c67df494d5767e75d211184cb2787 Mon Sep 17 00:00:00 2001 From: erogol Date: Mon, 26 Oct 2020 16:47:18 +0100 Subject: [PATCH 58/98] wavegrad model and layers refactoring --- TTS/vocoder/models/wavegrad.py | 171 +++++++++++++---------------- TTS/vocoder/utils/generic_utils.py | 2 +- 2 files changed, 77 insertions(+), 96 deletions(-) diff --git a/TTS/vocoder/models/wavegrad.py b/TTS/vocoder/models/wavegrad.py index 95e5b03a..cbdb1205 100644 --- a/TTS/vocoder/models/wavegrad.py +++ b/TTS/vocoder/models/wavegrad.py @@ -2,7 +2,7 @@ import numpy as np import torch from torch import nn -from ..layers.wavegrad import DBlock, FiLM, UBlock +from ..layers.wavegrad import DBlock, FiLM, UBlock, Conv1d class Wavegrad(nn.Module): @@ -10,8 +10,8 @@ class Wavegrad(nn.Module): def __init__(self, in_channels=80, out_channels=1, - x_conv_channels=32, - c_conv_channels=768, + y_conv_channels=32, + x_conv_channels=768, dblock_out_channels=[128, 128, 256, 512], ublock_out_channels=[512, 512, 256, 128, 128], upsample_factors=[5, 5, 3, 2, 2], @@ -19,106 +19,87 @@ class Wavegrad(nn.Module): [1, 2, 4, 8], [1, 2, 4, 8]]): super().__init__() - assert len(upsample_factors) == len(upsample_dilations) - assert len(upsample_factors) == len(ublock_out_channels) + self.hop_len = np.prod(upsample_factors) - # setup up-down sampling parameters - self.hop_length = np.prod(upsample_factors) - self.upsample_factors = upsample_factors - self.downsample_factors = upsample_factors[::-1][:-1] - - ### define DBlocks, FiLM layers ### + # dblocks self.dblocks = nn.ModuleList([ - nn.Conv1d(out_channels, x_conv_channels, 5, padding=2), + Conv1d(1, y_conv_channels, 5, padding=2), ]) - ic = x_conv_channels - self.films = nn.ModuleList([]) - for oc, df in zip(dblock_out_channels, self.downsample_factors): - # print('dblock(', ic, ', ', oc, ', ', df, ")") 
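# ---------------------------------------------------------------------------
# [Illustrative sketch, not part of the patch above.] The refactored model is
# conditioned on a continuous noise level: beta is a linear schedule,
# alpha = 1 - beta, alpha_hat = cumprod(alpha), noise_level = sqrt(alpha_hat).
# At train time a level is drawn between two adjacent schedule entries and the
# clean audio is mixed with Gaussian noise as y_noisy = a*y + sqrt(1 - a^2)*noise,
# with the network trained to predict `noise` (see compute_noise_level and
# compute_y_n later in this patch). Batch size, segment length and the random
# audio below are assumptions for the example.
import torch

num_steps, batch, samples = 1000, 4, 6144
beta = torch.linspace(1e-6, 1e-2, num_steps)
alpha = 1.0 - beta
alpha_hat = torch.cumprod(alpha, dim=0)
noise_level = torch.cat([torch.ones(1), alpha_hat ** 0.5])   # len = num_steps + 1

y0 = torch.randn(batch, samples)                 # stand-in clean audio segments
s = torch.randint(1, num_steps + 1, (batch,))    # random diffusion step per item
l_a, l_b = noise_level[s - 1], noise_level[s]
a = (l_a + torch.rand(batch) * (l_b - l_a)).unsqueeze(1)     # continuous level
noise = torch.randn_like(y0)
y_noisy = a * y0 + (1.0 - a ** 2) ** 0.5 * noise
print(y_noisy.shape, a.squeeze(1).shape)         # [4, 6144] and [4]
# ---------------------------------------------------------------------------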
- layer = DBlock(ic, oc, df) - self.dblocks.append(layer) - - # print('film(', ic, ', ', oc,")") - layer = FiLM(ic, oc) - self.films.append(layer) + ic = y_conv_channels + for oc, df in zip(dblock_out_channels, reversed(upsample_factors)): + self.dblocks.append(DBlock(ic, oc, df)) ic = oc - # last FiLM block - # print('film(', ic, ', ', dblock_out_channels[-1],")") - self.films.append(FiLM(ic, dblock_out_channels[-1])) - ### define UBlocks ### - self.c_conv = nn.Conv1d(in_channels, c_conv_channels, 3, padding=1) + # film + self.film = nn.ModuleList([]) + ic = y_conv_channels + for oc in reversed(ublock_out_channels): + self.film.append(FiLM(ic, oc)) + ic = oc + + # ublocks self.ublocks = nn.ModuleList([]) - ic = c_conv_channels - for idx, (oc, uf) in enumerate(zip(ublock_out_channels, self.upsample_factors)): - # print('ublock(', ic, ', ', oc, ', ', uf, ")") - layer = UBlock(ic, oc, uf, upsample_dilations[idx]) - self.ublocks.append(layer) + ic = x_conv_channels + for oc, uf, ud in zip(ublock_out_channels, upsample_factors, upsample_dilations): + self.ublocks.append(UBlock(ic, oc, uf, ud)) ic = oc - # define last layer - # print(ic, 'last_conv--', out_channels) - self.last_conv = nn.Conv1d(ic, out_channels, 3, padding=1) + self.x_conv = Conv1d(in_channels, x_conv_channels, 3, padding=1) + self.out_conv = Conv1d(oc, out_channels, 3, padding=1) - # inference time noise schedule params - self.S = 1000 - self.init_noise_schedule(self.S) + def forward(self, x, spectrogram, noise_scale): + downsampled = [] + for film, layer in zip(self.film, self.dblocks): + x = layer(x) + downsampled.append(film(x, noise_scale)) - - def init_noise_schedule(self, num_iter, min_val=1e-6, max_val=0.01): - """compute noise schedule parameters""" - device = self.last_conv.weight.device - beta = torch.linspace(min_val, max_val, num_iter).to(device) - alpha = 1 - beta - alpha_cum = alpha.cumprod(dim=0) - noise_level = torch.cat([torch.FloatTensor([1]).to(device), alpha_cum ** 0.5]) - - self.register_buffer('beta', beta) - self.register_buffer('alpha', alpha) - self.register_buffer('alpha_cum', alpha_cum) - self.register_buffer('noise_level', noise_level) - - def compute_noisy_x(self, x): - B = x.shape[0] - if len(x.shape) == 3: - x = x.squeeze(1) - s = torch.randint(1, self.S + 1, [B]).to(x).long() - l_a, l_b = self.noise_level[s-1], self.noise_level[s] - noise_scale = l_a + torch.rand(B).to(x) * (l_b - l_a) - noise_scale = noise_scale.unsqueeze(1) - noise = torch.randn_like(x) - noisy_x = noise_scale * x + (1.0 - noise_scale**2)**0.5 * noise - return noise.unsqueeze(1), noisy_x.unsqueeze(1), noise_scale[:, 0] - - def forward(self, x, c, noise_scale): - assert len(c.shape) == 3 # B, C, T - assert len(x.shape) == 3 # B, 1, T - o = x - shift_and_scales = [] - for film, dblock in zip(self.films, self.dblocks): - o = dblock(o) - shift_and_scales.append(film(o, noise_scale)) - - o = self.c_conv(c) - for ublock, (film_shift, film_scale) in zip(self.ublocks, - reversed(shift_and_scales)): - o = ublock(o, film_shift, film_scale) - o = self.last_conv(o) - return o - - def inference(self, c): - with torch.no_grad(): - x = torch.randn(c.shape[0], 1, self.hop_length * c.shape[-1]).to(c) - noise_scale = (self.alpha_cum**0.5).unsqueeze(1).to(c) - for n in range(len(self.alpha) - 1, -1, -1): - c1 = 1 / self.alpha[n]**0.5 - c2 = (1 - self.alpha[n]) / (1 - self.alpha_cum[n])**0.5 - x = c1 * (x - - c2 * self.forward(x, c, noise_scale[n]).squeeze(1)) - if n > 0: - noise = torch.randn_like(x) - sigma = ((1.0 - self.alpha_cum[n - 1]) / - 
(1.0 - self.alpha_cum[n]) * self.beta[n])**0.5 - x += sigma * noise - x = torch.clamp(x, -1.0, 1.0) + x = self.x_conv(spectrogram) + for layer, (film_shift, film_scale) in zip(self.ublocks, + reversed(downsampled)): + x = layer(x, film_shift, film_scale) + x = self.out_conv(x) return x + + @torch.no_grad() + def inference(self, x): + y_n = torch.randn(x.shape[0], 1, self.hop_len * x.shape[-1], dtype=torch.float32).to(x) + sqrt_alpha_hat = self.noise_level.unsqueeze(1).to(x) + for n in range(len(self.alpha) - 1, -1, -1): + y_n = self.c1[n] * (y_n - + self.c2[n] * self.forward(y_n, x, sqrt_alpha_hat[n]).squeeze(1)) + if n > 0: + z = torch.randn_like(y_n) + y_n += self.sigma[n - 1] * z + y_n.clamp_(-1.0, 1.0) + return y_n + + + def compute_y_n(self, y_0): + self.noise_level = self.noise_level.to(y_0) + if len(y_0.shape) == 3: + y_0 = y_0.squeeze(1) + s = torch.randint(1, self.num_steps + 1, [y_0.shape[0]]) + l_a, l_b = self.noise_level[s-1], self.noise_level[s] + noise_scale = l_a + torch.rand(y_0.shape[0]).to(y_0) * (l_b - l_a) + noise_scale = noise_scale.unsqueeze(1) + noise = torch.randn_like(y_0) + noisy_audio = noise_scale * y_0 + (1.0 - noise_scale**2)**0.5 * noise + return noise.unsqueeze(1), noisy_audio.unsqueeze(1), noise_scale[:, 0] + + def compute_noise_level(self, num_steps, min_val, max_val): + beta = np.linspace(min_val, max_val, num_steps) + alpha = 1 - beta + alpha_hat = np.cumprod(alpha) + noise_level = np.concatenate([[1.0], alpha_hat ** 0.5], axis=0) + + self.num_steps = num_steps + self.beta = torch.tensor(beta.astype(np.float32)) + self.alpha = torch.tensor(alpha.astype(np.float32)) + self.alpha_hat = torch.tensor(alpha_hat.astype(np.float32)) + self.noise_level = torch.tensor(noise_level.astype(np.float32)) + + self.c1 = 1 / self.alpha**0.5 + self.c2 = (1 - self.alpha) / (1 - self.alpha_hat)**0.5 + self.sigma = ((1.0 - self.alpha_hat[:-1]) / (1.0 - self.alpha_hat[1:]) * self.beta[1:])**0.5 + + diff --git a/TTS/vocoder/utils/generic_utils.py b/TTS/vocoder/utils/generic_utils.py index d0eb0657..761b14d7 100644 --- a/TTS/vocoder/utils/generic_utils.py +++ b/TTS/vocoder/utils/generic_utils.py @@ -119,7 +119,7 @@ def setup_generator(c): in_channels=c['audio']['num_mels'], out_channels=1, x_conv_channels=c['model_params']['x_conv_channels'], - c_conv_channels=c['model_params']['c_conv_channels'], + y_conv_channels=c['model_params']['y_conv_channels'], dblock_out_channels=c['model_params']['dblock_out_channels'], ublock_out_channels=c['model_params']['ublock_out_channels'], upsample_factors=c['model_params']['upsample_factors'], From a3213762ae9deabcc6dc6e0ca6af4acfcd89718b Mon Sep 17 00:00:00 2001 From: erogol Date: Mon, 26 Oct 2020 17:23:28 +0100 Subject: [PATCH 59/98] update wavegrad tests --- tests/test_wavegrad_layers.py | 80 +++++++++++++++++++++++++++++++++++ tests/test_wavegrad_train.py | 57 +++++++++++++++++++++++++ 2 files changed, 137 insertions(+) create mode 100644 tests/test_wavegrad_layers.py create mode 100644 tests/test_wavegrad_train.py diff --git a/tests/test_wavegrad_layers.py b/tests/test_wavegrad_layers.py new file mode 100644 index 00000000..a1c6a7e5 --- /dev/null +++ b/tests/test_wavegrad_layers.py @@ -0,0 +1,80 @@ +import torch + +from TTS.vocoder.layers.wavegrad import PositionalEncoding, FiLM, UBlock, DBlock +from TTS.vocoder.models.wavegrad import Wavegrad + + +def test_positional_encoding(): + layer = PositionalEncoding(50) + inp = torch.rand(32, 50, 100) + nl = torch.rand(32) + o = layer(inp, nl) + + assert o.shape[0] == 32 + assert o.shape[1] == 
50 + assert o.shape[2] == 100 + assert isinstance(o, torch.FloatTensor) + + +def test_film(): + layer = FiLM(50, 76) + inp = torch.rand(32, 50, 100) + nl = torch.rand(32) + shift, scale = layer(inp, nl) + + assert shift.shape[0] == 32 + assert shift.shape[1] == 76 + assert shift.shape[2] == 100 + assert isinstance(shift, torch.FloatTensor) + + assert scale.shape[0] == 32 + assert scale.shape[1] == 76 + assert scale.shape[2] == 100 + assert isinstance(scale, torch.FloatTensor) + + +def test_ublock(): + inp1 = torch.rand(32, 50, 100) + inp2 = torch.rand(32, 50, 50) + nl = torch.rand(32) + + layer_film = FiLM(50, 100) + layer = UBlock(50, 100, 2, [1, 2, 4, 8]) + + scale, shift = layer_film(inp1, nl) + o = layer(inp2, shift, scale) + + assert o.shape[0] == 32 + assert o.shape[1] == 100 + assert o.shape[2] == 100 + assert isinstance(o, torch.FloatTensor) + + +def test_dblock(): + inp = torch.rand(32, 50, 130) + layer = DBlock(50, 100, 2) + o = layer(inp) + + assert o.shape[0] == 32 + assert o.shape[1] == 100 + assert o.shape[2] == 65 + assert isinstance(o, torch.FloatTensor) + + +def test_wavegrad_forward(): + x = torch.rand(32, 1, 20 * 300) + c = torch.rand(32, 80, 20) + noise_scale = torch.rand(32) + + model = Wavegrad(in_channels=80, + out_channels=1, + upsample_factors=[5, 5, 3, 2, 2], + upsample_dilations=[[1, 2, 1, 2], [1, 2, 1, 2], + [1, 2, 4, 8], [1, 2, 4, 8], + [1, 2, 4, 8]]) + o = model.forward(x, c, noise_scale) + + assert o.shape[0] == 32 + assert o.shape[1] == 1 + assert o.shape[2] == 20 * 300 + assert isinstance(o, torch.FloatTensor) diff --git a/tests/test_wavegrad_train.py b/tests/test_wavegrad_train.py new file mode 100644 index 00000000..1fd1d10e --- /dev/null +++ b/tests/test_wavegrad_train.py @@ -0,0 +1,57 @@ +import copy +import os +import unittest + +import torch +from tests import get_tests_input_path +from torch import nn, optim + +from TTS.vocoder.models.wavegrad import Wavegrad +from TTS.utils.io import load_config +from TTS.utils.audio import AudioProcessor + +#pylint: disable=unused-variable + +torch.manual_seed(1) +use_cuda = torch.cuda.is_available() +device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") + + +class WavegradTrainTest(unittest.TestCase): + def test_train_step(self): # pylint: disable=no-self-use + """Test if all layers are updated in a basic training cycle""" + input_dummy = torch.rand(8, 1, 20 * 300).to(device) + mel_spec = torch.rand(8, 80, 20).to(device) + + criterion = torch.nn.L1Loss().to(device) + model = Wavegrad(in_channels=80, + out_channels=1, + upsample_factors=[5, 5, 3, 2, 2], + upsample_dilations=[[1, 2, 1, 2], [1, 2, 1, 2], + [1, 2, 4, 8], [1, 2, 4, 8], + [1, 2, 4, 8]]) + model.train() + model.to(device) + model_ref = copy.deepcopy(model) + count = 0 + for param, param_ref in zip(model.parameters(), + model_ref.parameters()): + assert (param - param_ref).sum() == 0, param + count += 1 + optimizer = optim.Adam(model.parameters(), lr=0.001) + for i in range(5): + y_hat = model.forward(input_dummy, mel_spec, torch.rand(8).to(device)) + optimizer.zero_grad() + loss = criterion(y_hat, input_dummy) + loss.backward() + optimizer.step() + # check parameter changes + count = 0 + for param, param_ref in zip(model.parameters(), + model_ref.parameters()): + # ignore pre-higway layer since it works conditional + # if count not in [145, 59]: + assert (param != param_ref).any( + ), "param {} with shape {} not updated!! 
\n{}\n{}".format( + count, param.shape, param, param_ref) + count += 1 \ No newline at end of file From 14c2381207c5972359b2af450a233730ff877ee1 Mon Sep 17 00:00:00 2001 From: erogol Date: Tue, 27 Oct 2020 12:06:57 +0100 Subject: [PATCH 60/98] weight norm and torch based amp training for wavegrad --- TTS/bin/train_wavegrad.py | 90 ++++++++++------------ TTS/vocoder/configs/wavegrad_libritts.json | 4 +- TTS/vocoder/layers/wavegrad.py | 86 +++++++++++++-------- 3 files changed, 97 insertions(+), 83 deletions(-) diff --git a/TTS/bin/train_wavegrad.py b/TTS/bin/train_wavegrad.py index db961047..83e5d78b 100644 --- a/TTS/bin/train_wavegrad.py +++ b/TTS/bin/train_wavegrad.py @@ -7,10 +7,7 @@ import traceback import torch # DISTRIBUTED -try: - from apex.parallel import DistributedDataParallel as DDP_apex -except: - from torch.nn.parallel import DistributedDataParallel as DDP_th +from torch.nn.parallel import DistributedDataParallel as DDP_th from torch.optim import Adam from torch.utils.data import DataLoader from torch.utils.data.distributed import DistributedSampler @@ -82,7 +79,7 @@ def format_test_data(data): def train(model, criterion, optimizer, - scheduler, ap, global_step, epoch, amp): + scheduler, ap, global_step, epoch): data_loader = setup_loader(ap, is_val=False, verbose=(epoch == 0)) model.train() epoch_time = 0 @@ -104,6 +101,7 @@ def train(model, criterion, optimizer, model.compute_noise_level(noise_schedule['num_steps'], noise_schedule['min_val'], noise_schedule['max_val']) + scaler = torch.cuda.amp.GradScaler() for num_iter, data in enumerate(data_loader): start_time = time.time() @@ -113,39 +111,46 @@ def train(model, criterion, optimizer, global_step += 1 - # compute noisy input - if hasattr(model, 'module'): - noise, x_noisy, noise_scale = model.module.compute_y_n(x) - else: - noise, x_noisy, noise_scale = model.compute_y_n(x) + with torch.cuda.amp.autocast(): + # compute noisy input + if hasattr(model, 'module'): + noise, x_noisy, noise_scale = model.module.compute_y_n(x) + else: + noise, x_noisy, noise_scale = model.compute_y_n(x) - # forward pass - noise_hat = model(x_noisy, m, noise_scale) + # forward pass + noise_hat = model(x_noisy, m, noise_scale) - # compute losses - loss = criterion(noise, noise_hat) - # if loss.item() > 100: - # breakpoint() + # compute losses + loss = criterion(noise, noise_hat) loss_wavegrad_dict = {'wavegrad_loss':loss} - # backward pass with loss scaling + # check nan loss + if torch.isnan(loss).any(): + raise RuntimeError(f'Detected NaN loss at step {self.step}.') + optimizer.zero_grad() - if amp is not None: - with amp.scale_loss(loss, optimizer) as scaled_loss: - scaled_loss.backward() - else: - loss.backward() - - if c.clip_grad > 0: - grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), - c.clip_grad) - optimizer.step() - - # schedule update + # schedule update if scheduler is not None: scheduler.step() + # backward pass with loss scaling + if c.mixed_precision: + scaler.scale(loss).backward() + scaler.unscale_(optimizer) + grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), + c.clip_grad) + scaler.step(optimizer) + scaler.update() + else: + loss.backward() + grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), + c.clip_grad) + optimizer.step() + + + # disconnect loss values loss_dict = dict() for key, value in loss_wavegrad_dict.items(): @@ -175,7 +180,7 @@ def train(model, criterion, optimizer, 'step_time': [step_time, 2], 'loader_time': [loader_time, 4], "current_lr": current_lr, - "grad_norm": grad_norm + 
"grad_norm": grad_norm.item() } c_logger.print_train_step(batch_n_iter, num_iter, global_step, log_dict, loss_dict, keep_avg.avg_values) @@ -185,7 +190,7 @@ def train(model, criterion, optimizer, if global_step % 10 == 0: iter_stats = { "lr": current_lr, - "grad_norm": grad_norm, + "grad_norm": grad_norm.item(), "step_time": step_time } iter_stats.update(loss_dict) @@ -335,16 +340,6 @@ def main(args): # pylint: disable=redefined-outer-name # setup optimizers optimizer = Adam(model.parameters(), lr=c.lr, weight_decay=0) - # DISTRIBUTED - if c.apex_amp_level is not None: - # pylint: disable=import-outside-toplevel - from apex import amp - model.cuda() - # optimizer.cuda() - model, optimizer = amp.initialize(model, optimizer, opt_level=c.apex_amp_level) - else: - amp = None - # schedulers scheduler = None if 'lr_scheduler' in c: @@ -374,10 +369,6 @@ def main(args): # pylint: disable=redefined-outer-name model.load_state_dict(model_dict) del model_dict - # DISTRUBUTED - if amp and 'amp' in checkpoint: - amp.load_state_dict(checkpoint['amp']) - # reset lr if not countinuining training. for group in optimizer.param_groups: group['lr'] = c.lr @@ -410,7 +401,7 @@ def main(args): # pylint: disable=redefined-outer-name c_logger.print_epoch_start(epoch, c.epochs) _, global_step = train(model, criterion, optimizer, scheduler, ap, global_step, - epoch, amp) + epoch) eval_avg_loss_dict = evaluate(model, criterion, ap, global_step, epoch) c_logger.print_epoch_end(epoch, eval_avg_loss_dict) @@ -426,8 +417,7 @@ def main(args): # pylint: disable=redefined-outer-name global_step, epoch, OUT_PATH, - model_losses=eval_avg_loss_dict, - amp_state_dict=amp.state_dict() if amp else None) + model_losses=eval_avg_loss_dict) if __name__ == '__main__': @@ -481,8 +471,8 @@ if __name__ == '__main__': _ = os.path.dirname(os.path.realpath(__file__)) # DISTRIBUTED - if c.apex_amp_level is not None: - print(" > apex AMP level: ", c.apex_amp_level) + if c.mixed_precision: + print(" > Mixed precision is enabled") OUT_PATH = args.continue_path if args.continue_path == '': diff --git a/TTS/vocoder/configs/wavegrad_libritts.json b/TTS/vocoder/configs/wavegrad_libritts.json index 64958da2..5720a482 100644 --- a/TTS/vocoder/configs/wavegrad_libritts.json +++ b/TTS/vocoder/configs/wavegrad_libritts.json @@ -34,7 +34,7 @@ }, // DISTRIBUTED TRAINING - "apex_amp_level": "O1", // APEX amp optimization level. "O1" is currently supported. + "mixed_precision": true, // enable torch mixed precision training (true, false) "distributed":{ "backend": "nccl", "url": "tcp:\/\/localhost:54322" @@ -98,7 +98,7 @@ // TENSORBOARD and LOGGING "print_step": 50, // Number of steps to log traning on console. "print_eval": false, // If True, it prints loss values for each step in eval run. - "save_step": 10000, // Number of training steps expected to plot training stats on TB and save model checkpoints. + "save_step": 5000, // Number of training steps expected to plot training stats on TB and save model checkpoints. "checkpoint": true, // If true, it saves checkpoints per "save_step" "tb_model_param_stats": true, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging. 
diff --git a/TTS/vocoder/layers/wavegrad.py b/TTS/vocoder/layers/wavegrad.py index 0b9dde48..a72f2837 100644 --- a/TTS/vocoder/layers/wavegrad.py +++ b/TTS/vocoder/layers/wavegrad.py @@ -2,6 +2,7 @@ import numpy as np import torch import torch.nn as nn import torch.nn.functional as F +from torch.nn.utils import weight_norm from math import log as ln @@ -13,36 +14,59 @@ class Conv1d(nn.Conv1d): nn.init.zeros_(self.bias) +# class PositionalEncoding(nn.Module): +# def __init__(self, n_channels): +# super().__init__() +# self.n_channels = n_channels +# self.length = n_channels // 2 +# assert n_channels % 2 == 0 + +# def forward(self, x, noise_level): +# """ +# Shapes: +# x: B x C x T +# noise_level: B +# """ +# return (x + self.encoding(noise_level)[:, :, None]) + +# def encoding(self, noise_level): +# step = torch.arange( +# self.length, dtype=noise_level.dtype, device=noise_level.device) / self.length +# encoding = noise_level.unsqueeze(1) * torch.exp( +# -ln(1e4) * step.unsqueeze(0)) +# encoding = torch.cat([torch.sin(encoding), torch.cos(encoding)], dim=-1) +# return encoding + + class PositionalEncoding(nn.Module): - def __init__(self, n_channels): + def __init__(self, n_channels, max_len=10000): super().__init__() self.n_channels = n_channels - self.length = n_channels // 2 - assert n_channels % 2 == 0 + self.max_len = max_len + self.C = 5000 + self.pe = torch.zeros(0, 0) def forward(self, x, noise_level): - """ - Shapes: - x: B x C x T - noise_level: B - """ - return (x + self.encoding(noise_level)[:, :, None]) + if x.shape[2] > self.pe.shape[1]: + self.init_pe_matrix(x.shape[1] ,x.shape[2], x) + return x + noise_level[..., None, None] + self.pe[:, :x.size(2)].repeat(x.shape[0], 1, 1) / self.C - def encoding(self, noise_level): - step = torch.arange( - self.length, dtype=noise_level.dtype, device=noise_level.device) / self.length - encoding = noise_level.unsqueeze(1) * torch.exp( - -ln(1e4) * step.unsqueeze(0)) - encoding = torch.cat([torch.sin(encoding), torch.cos(encoding)], dim=-1) - return encoding + def init_pe_matrix(self, n_channels, max_len, x): + pe = torch.zeros(max_len, n_channels) + position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1) + div_term = torch.pow(10000, torch.arange(0, n_channels, 2).float() / n_channels) + + pe[:, 0::2] = torch.sin(position / div_term) + pe[:, 1::2] = torch.cos(position / div_term) + self.pe = pe.transpose(0, 1).to(x) class FiLM(nn.Module): def __init__(self, input_size, output_size): super().__init__() self.encoding = PositionalEncoding(input_size) - self.input_conv = nn.Conv1d(input_size, input_size, 3, padding=1) - self.output_conv = nn.Conv1d(input_size, output_size * 2, 3, padding=1) + self.input_conv = weight_norm(nn.Conv1d(input_size, input_size, 3, padding=1)) + self.output_conv = weight_norm(nn.Conv1d(input_size, output_size * 2, 3, padding=1)) self.ini_parameters() def ini_parameters(self): @@ -72,30 +96,30 @@ class UBlock(nn.Module): assert len(dilation) == 4 self.factor = factor - self.block1 = Conv1d(input_size, hidden_size, 1) + self.block1 = weight_norm(Conv1d(input_size, hidden_size, 1)) self.block2 = nn.ModuleList([ - Conv1d(input_size, + weight_norm(Conv1d(input_size, hidden_size, 3, dilation=dilation[0], - padding=dilation[0]), - Conv1d(hidden_size, + padding=dilation[0])), + weight_norm(Conv1d(hidden_size, hidden_size, 3, dilation=dilation[1], - padding=dilation[1]) + padding=dilation[1])) ]) self.block3 = nn.ModuleList([ - Conv1d(hidden_size, + weight_norm(Conv1d(hidden_size, hidden_size, 3, 
dilation=dilation[2], - padding=dilation[2]), - Conv1d(hidden_size, + padding=dilation[2])), + weight_norm(Conv1d(hidden_size, hidden_size, 3, dilation=dilation[3], - padding=dilation[3]) + padding=dilation[3])) ]) def forward(self, x, shift, scale): @@ -129,11 +153,11 @@ class DBlock(nn.Module): def __init__(self, input_size, hidden_size, factor): super().__init__() self.factor = factor - self.residual_dense = Conv1d(input_size, hidden_size, 1) + self.residual_dense = weight_norm(Conv1d(input_size, hidden_size, 1)) self.conv = nn.ModuleList([ - Conv1d(input_size, hidden_size, 3, dilation=1, padding=1), - Conv1d(hidden_size, hidden_size, 3, dilation=2, padding=2), - Conv1d(hidden_size, hidden_size, 3, dilation=4, padding=4), + weight_norm(Conv1d(input_size, hidden_size, 3, dilation=1, padding=1)), + weight_norm(Conv1d(hidden_size, hidden_size, 3, dilation=2, padding=2)), + weight_norm(Conv1d(hidden_size, hidden_size, 3, dilation=4, padding=4)), ]) def forward(self, x): From 946a0c0fb9f9b8543077dff38946e36ed867365a Mon Sep 17 00:00:00 2001 From: erogol Date: Thu, 29 Oct 2020 15:45:50 +0100 Subject: [PATCH 61/98] bug fixes for single speaker glow-tts, enable torch based amp. Make amp optional for wavegrad. Bug fixes for synthesis setup for glow-tts --- TTS/bin/train_glow_tts.py | 95 +++++++++++++++------------------- TTS/bin/train_wavegrad.py | 9 ++-- TTS/tts/layers/tacotron2.py | 2 +- TTS/tts/utils/generic_utils.py | 2 +- TTS/tts/utils/synthesis.py | 2 +- TTS/utils/generic_utils.py | 17 ++++-- TTS/vocoder/layers/wavegrad.py | 3 -- 7 files changed, 65 insertions(+), 65 deletions(-) diff --git a/TTS/bin/train_glow_tts.py b/TTS/bin/train_glow_tts.py index e30ddc59..9358deb2 100644 --- a/TTS/bin/train_glow_tts.py +++ b/TTS/bin/train_glow_tts.py @@ -15,8 +15,6 @@ from torch.utils.data import DataLoader from TTS.tts.datasets.preprocess import load_meta_data from TTS.tts.datasets.TTSDataset import MyDataset from TTS.tts.layers.losses import GlowTTSLoss -from TTS.tts.utils.distribute import (DistributedSampler, init_distributed, - reduce_tensor) from TTS.tts.utils.generic_utils import setup_model, check_config_tts from TTS.tts.utils.io import save_best_model, save_checkpoint from TTS.tts.utils.measures import alignment_diagonal_score @@ -28,7 +26,8 @@ from TTS.utils.audio import AudioProcessor from TTS.utils.console_logger import ConsoleLogger from TTS.utils.generic_utils import (KeepAverage, count_parameters, create_experiment_folder, get_git_branch, - remove_experiment_folder, set_init_dict) + remove_experiment_folder, set_init_dict, + set_amp_context) from TTS.utils.io import copy_config_file, load_config from TTS.utils.radam import RAdam from TTS.utils.tensorboard_logger import TensorboardLogger @@ -36,7 +35,6 @@ from TTS.utils.training import (NoamLR, check_update, setup_torch_training_env) # DISTRIBUTED -from apex.parallel import DistributedDataParallel as DDP_apex from torch.nn.parallel import DistributedDataParallel as DDP_th from torch.utils.data.distributed import DistributedSampler from TTS.utils.distribute import init_distributed, reduce_tensor @@ -157,7 +155,7 @@ def data_depended_init(model, ap, speaker_mapping=None): def train(model, criterion, optimizer, scheduler, - ap, global_step, epoch, amp, speaker_mapping=None): + ap, global_step, epoch, speaker_mapping=None): data_loader = setup_loader(ap, 1, is_val=False, verbose=(epoch == 0), speaker_mapping=speaker_mapping) model.train() @@ -170,6 +168,7 @@ def train(model, criterion, optimizer, scheduler, batch_n_iter = 
int(len(data_loader.dataset) / c.batch_size) end_time = time.time() c_logger.print_train_start() + scaler = torch.cuda.amp.GradScaler() if c.mixed_precision else None for num_iter, data in enumerate(data_loader): start_time = time.time() @@ -180,33 +179,38 @@ def train(model, criterion, optimizer, scheduler, loader_time = time.time() - end_time global_step += 1 + optimizer.zero_grad() + + # forward pass model + with set_amp_context(c.mixed_precision): + z, logdet, y_mean, y_log_scale, alignments, o_dur_log, o_total_dur = model.forward( + text_input, text_lengths, mel_input, mel_lengths, attn_mask, g=speaker_ids) + + # compute loss + loss_dict = criterion(z, y_mean, y_log_scale, logdet, mel_lengths, + o_dur_log, o_total_dur, text_lengths) + + # backward pass with loss scaling + if c.mixed_precision: + scaler.scale(loss_dict['loss']).backward() + scaler.unscale_(optimizer) + grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), + c.grad_clip) + scaler.step(optimizer) + scaler.update() + else: + loss_dict['loss'].backward() + grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), + c.grad_clip) + optimizer.step() + + + grad_norm, _ = check_update(model, c.grad_clip, ignore_stopnet=True) + optimizer.step() # setup lr if c.noam_schedule: scheduler.step() - optimizer.zero_grad() - - # forward pass model - z, logdet, y_mean, y_log_scale, alignments, o_dur_log, o_total_dur = model.forward( - text_input, text_lengths, mel_input, mel_lengths, attn_mask, g=speaker_ids) - - # compute loss - loss_dict = criterion(z, y_mean, y_log_scale, logdet, mel_lengths, - o_dur_log, o_total_dur, text_lengths) - - # backward pass - DISTRIBUTED - if amp is not None: - with amp.scale_loss(loss_dict['loss'], optimizer) as scaled_loss: - scaled_loss.backward() - else: - loss_dict['loss'].backward() - - if amp: - amp_opt_params = amp.master_params(optimizer) - else: - amp_opt_params = None - grad_norm, _ = check_update(model, c.grad_clip, ignore_stopnet=True, amp_opt_params=amp_opt_params) - optimizer.step() # current_lr current_lr = optimizer.param_groups[0]['lr'] @@ -269,12 +273,12 @@ def train(model, criterion, optimizer, scheduler, if c.checkpoint: # save model save_checkpoint(model, optimizer, global_step, epoch, 1, OUT_PATH, - model_loss=loss_dict['loss'], - amp_state_dict=amp.state_dict() if amp else None) + model_loss=loss_dict['loss']) # Diagnostic visualizations # direct pass on model for spec predictions - spec_pred, *_ = model.inference(text_input[:1], text_lengths[:1], g=speaker_ids[:1]) + target_speaker = None if speaker_ids is None else speaker_ids[:1] + spec_pred, *_ = model.inference(text_input[:1], text_lengths[:1], g=target_speaker) spec_pred = spec_pred.permute(0, 2, 1) gt_spec = mel_input.permute(0, 2, 1) const_spec = spec_pred[0].data.cpu().numpy() @@ -367,10 +371,11 @@ def evaluate(model, criterion, ap, global_step, epoch, speaker_mapping): if args.rank == 0: # Diagnostic visualizations # direct pass on model for spec predictions + target_speaker = None if speaker_ids is None else speaker_ids[:1] if hasattr(model, 'module'): - spec_pred, *_ = model.module.inference(text_input[:1], text_lengths[:1], g=speaker_ids[:1]) + spec_pred, *_ = model.module.inference(text_input[:1], text_lengths[:1], g=target_speaker) else: - spec_pred, *_ = model.inference(text_input[:1], text_lengths[:1], g=speaker_ids[:1]) + spec_pred, *_ = model.inference(text_input[:1], text_lengths[:1], g=target_speaker) spec_pred = spec_pred.permute(0, 2, 1) gt_spec = mel_input.permute(0, 2, 1) @@ -489,14 +494,6 @@ 
def main(args): # pylint: disable=redefined-outer-name optimizer = RAdam(model.parameters(), lr=c.lr, weight_decay=0, betas=(0.9, 0.98), eps=1e-9) criterion = GlowTTSLoss() - if c.apex_amp_level is not None: - # pylint: disable=import-outside-toplevel - from apex import amp - model.cuda() - model, optimizer = amp.initialize(model, optimizer, opt_level=c.apex_amp_level) - else: - amp = None - if args.restore_path: checkpoint = torch.load(args.restore_path, map_location='cpu') try: @@ -513,9 +510,6 @@ def main(args): # pylint: disable=redefined-outer-name model.load_state_dict(model_dict) del model_dict - if amp and 'amp' in checkpoint: - amp.load_state_dict(checkpoint['amp']) - for group in optimizer.param_groups: group['initial_lr'] = c.lr print(" > Model restored from step %d" % checkpoint['step'], @@ -530,10 +524,7 @@ def main(args): # pylint: disable=redefined-outer-name # DISTRUBUTED if num_gpus > 1: - if c.apex_amp_level is not None: - model = DDP_apex(model) - else: - model = DDP_th(model, device_ids=[args.rank]) + model = DDP_th(model, device_ids=[args.rank]) if c.noam_schedule: scheduler = NoamLR(optimizer, @@ -554,14 +545,14 @@ def main(args): # pylint: disable=redefined-outer-name c_logger.print_epoch_start(epoch, c.epochs) train_avg_loss_dict, global_step = train(model, criterion, optimizer, scheduler, ap, global_step, - epoch, amp, speaker_mapping) + epoch, speaker_mapping) eval_avg_loss_dict = evaluate(model, criterion, ap, global_step, epoch, speaker_mapping=speaker_mapping) c_logger.print_epoch_end(epoch, eval_avg_loss_dict) target_loss = train_avg_loss_dict['avg_loss'] if c.run_eval: target_loss = eval_avg_loss_dict['avg_loss'] best_loss = save_best_model(target_loss, best_loss, model, optimizer, global_step, epoch, c.r, - OUT_PATH, amp_state_dict=amp.state_dict() if amp else None) + OUT_PATH) if __name__ == '__main__': @@ -614,8 +605,8 @@ if __name__ == '__main__': check_config_tts(c) _ = os.path.dirname(os.path.realpath(__file__)) - if c.apex_amp_level: - print(" > apex AMP level: ", c.apex_amp_level) + if c.mixed_precision: + print(" > Mixed precision enabled.") OUT_PATH = args.continue_path if args.continue_path == '': diff --git a/TTS/bin/train_wavegrad.py b/TTS/bin/train_wavegrad.py index 83e5d78b..13434979 100644 --- a/TTS/bin/train_wavegrad.py +++ b/TTS/bin/train_wavegrad.py @@ -16,7 +16,8 @@ from TTS.utils.console_logger import ConsoleLogger from TTS.utils.distribute import init_distributed from TTS.utils.generic_utils import (KeepAverage, count_parameters, create_experiment_folder, get_git_branch, - remove_experiment_folder, set_init_dict) + remove_experiment_folder, set_init_dict, + set_amp_context) from TTS.utils.io import copy_config_file, load_config from TTS.utils.tensorboard_logger import TensorboardLogger from TTS.utils.training import setup_torch_training_env @@ -101,7 +102,7 @@ def train(model, criterion, optimizer, model.compute_noise_level(noise_schedule['num_steps'], noise_schedule['min_val'], noise_schedule['max_val']) - scaler = torch.cuda.amp.GradScaler() + scaler = torch.cuda.amp.GradScaler() if c.mixed_precision else None for num_iter, data in enumerate(data_loader): start_time = time.time() @@ -111,7 +112,7 @@ def train(model, criterion, optimizer, global_step += 1 - with torch.cuda.amp.autocast(): + with set_amp_context(c.mixed_precision): # compute noisy input if hasattr(model, 'module'): noise, x_noisy, noise_scale = model.module.compute_y_n(x) @@ -127,7 +128,7 @@ def train(model, criterion, optimizer, # check nan loss if 
torch.isnan(loss).any(): - raise RuntimeError(f'Detected NaN loss at step {self.step}.') + raise RuntimeError(f'Detected NaN loss at step {global_step}.') optimizer.zero_grad() diff --git a/TTS/tts/layers/tacotron2.py b/TTS/tts/layers/tacotron2.py index 490f3728..a02db784 100644 --- a/TTS/tts/layers/tacotron2.py +++ b/TTS/tts/layers/tacotron2.py @@ -102,7 +102,7 @@ class Encoder(nn.Module): o = layer(o) o = o.transpose(1, 2) o = nn.utils.rnn.pack_padded_sequence(o, - input_lengths, + input_lengths.cpu(), batch_first=True) self.lstm.flatten_parameters() o, _ = self.lstm(o) diff --git a/TTS/tts/utils/generic_utils.py b/TTS/tts/utils/generic_utils.py index 2361fa85..d43edcbf 100644 --- a/TTS/tts/utils/generic_utils.py +++ b/TTS/tts/utils/generic_utils.py @@ -248,7 +248,7 @@ def check_config_tts(c): check_argument('use_external_speaker_embedding_file', c, restricted=True if c['use_speaker_embedding'] else False, val_type=bool) check_argument('external_speaker_embedding_file', c, restricted=True if c['use_external_speaker_embedding_file'] else False, val_type=str) check_argument('use_gst', c, restricted=is_tacotron(c), val_type=bool) - if c['use_gst']: + if c['model'].lower() in ['tacotron', 'tacotron2'] and c['use_gst']: check_argument('gst', c, restricted=is_tacotron(c), val_type=dict) check_argument('gst_style_input', c['gst'], restricted=is_tacotron(c), val_type=[str, dict]) check_argument('gst_embedding_dim', c['gst'], restricted=is_tacotron(c), val_type=int, min_val=0, max_val=1000) diff --git a/TTS/tts/utils/synthesis.py b/TTS/tts/utils/synthesis.py index 3d2dd13c..cad1d21f 100644 --- a/TTS/tts/utils/synthesis.py +++ b/TTS/tts/utils/synthesis.py @@ -210,7 +210,7 @@ def synthesis(model, """ # GST processing style_mel = None - if CONFIG.use_gst and style_wav is not None: + if 'use_gst' in CONFIG.keys() and CONFIG.use_gst and style_wav is not None: if isinstance(style_wav, dict): style_mel = style_wav else: diff --git a/TTS/utils/generic_utils.py b/TTS/utils/generic_utils.py index dcfbbdc3..686a3453 100644 --- a/TTS/utils/generic_utils.py +++ b/TTS/utils/generic_utils.py @@ -1,8 +1,19 @@ -import os -import glob -import shutil import datetime +import glob +import os +import shutil import subprocess +from contextlib import nullcontext + +import torch + + +def set_amp_context(mixed_precision): + if mixed_precision: + cm = torch.cuda.amp.autocast() + else: + cm = nullcontext() + return cm def get_git_branch(): diff --git a/TTS/vocoder/layers/wavegrad.py b/TTS/vocoder/layers/wavegrad.py index a72f2837..c6c20eb5 100644 --- a/TTS/vocoder/layers/wavegrad.py +++ b/TTS/vocoder/layers/wavegrad.py @@ -1,11 +1,8 @@ -import numpy as np import torch import torch.nn as nn import torch.nn.functional as F from torch.nn.utils import weight_norm -from math import log as ln - class Conv1d(nn.Conv1d): def __init__(self, *args, **kwargs): From 39c71ee8a98bcbfea242e6b203556150ee64205b Mon Sep 17 00:00:00 2001 From: erogol Date: Thu, 29 Oct 2020 15:47:15 +0100 Subject: [PATCH 62/98] wavegrad refactoring, fixing tests for glow-tts and wavegrad --- TTS/vocoder/layers/wavegrad.py | 65 ++++++++++------------------------ TTS/vocoder/models/wavegrad.py | 13 +++++-- run_tests.sh | 6 ++-- tests/test_encoder.py | 4 +-- tests/test_server_package.sh | 6 ++-- tests/test_wavegrad_train.py | 11 +++++- 6 files changed, 49 insertions(+), 56 deletions(-) diff --git a/TTS/vocoder/layers/wavegrad.py b/TTS/vocoder/layers/wavegrad.py index c6c20eb5..2c781fd6 100644 --- a/TTS/vocoder/layers/wavegrad.py +++ 
b/TTS/vocoder/layers/wavegrad.py @@ -11,31 +11,8 @@ class Conv1d(nn.Conv1d): nn.init.zeros_(self.bias) -# class PositionalEncoding(nn.Module): -# def __init__(self, n_channels): -# super().__init__() -# self.n_channels = n_channels -# self.length = n_channels // 2 -# assert n_channels % 2 == 0 - -# def forward(self, x, noise_level): -# """ -# Shapes: -# x: B x C x T -# noise_level: B -# """ -# return (x + self.encoding(noise_level)[:, :, None]) - -# def encoding(self, noise_level): -# step = torch.arange( -# self.length, dtype=noise_level.dtype, device=noise_level.device) / self.length -# encoding = noise_level.unsqueeze(1) * torch.exp( -# -ln(1e4) * step.unsqueeze(0)) -# encoding = torch.cat([torch.sin(encoding), torch.cos(encoding)], dim=-1) -# return encoding - - class PositionalEncoding(nn.Module): + """Positional encoding with noise level conditioning""" def __init__(self, n_channels, max_len=10000): super().__init__() self.n_channels = n_channels @@ -64,9 +41,7 @@ class FiLM(nn.Module): self.encoding = PositionalEncoding(input_size) self.input_conv = weight_norm(nn.Conv1d(input_size, input_size, 3, padding=1)) self.output_conv = weight_norm(nn.Conv1d(input_size, output_size * 2, 3, padding=1)) - self.ini_parameters() - def ini_parameters(self): nn.init.xavier_uniform_(self.input_conv.weight) nn.init.xavier_uniform_(self.output_conv.weight) nn.init.zeros_(self.input_conv.bias) @@ -120,30 +95,28 @@ class UBlock(nn.Module): ]) def forward(self, x, shift, scale): - block1 = F.interpolate(x, size=x.shape[-1] * self.factor) - block1 = self.block1(block1) + o1 = F.interpolate(x, size=x.shape[-1] * self.factor) + o1 = self.block1(o1) - block2 = F.leaky_relu(x, 0.2) - block2 = F.interpolate(block2, size=x.shape[-1] * self.factor) - block2 = self.block2[0](block2) - # block2 = film_shift + film_scale * block2 - block2 = shif_and_scale(block2, scale, shift) - block2 = F.leaky_relu(block2, 0.2) - block2 = self.block2[1](block2) + o2 = F.leaky_relu(x, 0.2) + o2 = F.interpolate(o2, size=x.shape[-1] * self.factor) + o2 = self.block2[0](o2) + o2 = shif_and_scale(o2, scale, shift) + o2 = F.leaky_relu(o2, 0.2) + o2 = self.block2[1](o2) - x = block1 + block2 + x = o1 + o2 - # block3 = film_shift + film_scale * x - block3 = shif_and_scale(x, scale, shift) - block3 = F.leaky_relu(block3, 0.2) - block3 = self.block3[0](block3) - # block3 = film_shift + film_scale * block3 - block3 = shif_and_scale(block3, scale, shift) - block3 = F.leaky_relu(block3, 0.2) - block3 = self.block3[1](block3) + o3 = shif_and_scale(x, scale, shift) + o3 = F.leaky_relu(o3, 0.2) + o3 = self.block3[0](o3) - x = x + block3 - return x + o3 = shif_and_scale(o3, scale, shift) + o3 = F.leaky_relu(o3, 0.2) + o3 = self.block3[1](o3) + + o = x + o3 + return o class DBlock(nn.Module): diff --git a/TTS/vocoder/models/wavegrad.py b/TTS/vocoder/models/wavegrad.py index cbdb1205..120f0de0 100644 --- a/TTS/vocoder/models/wavegrad.py +++ b/TTS/vocoder/models/wavegrad.py @@ -20,6 +20,15 @@ class Wavegrad(nn.Module): super().__init__() self.hop_len = np.prod(upsample_factors) + self.noise_level = None + self.num_steps = None + self.beta = None + self.alpha = None + self.alpha_hat = None + self.noise_level = None + self.c1 = None + self.c2 = None + self.sigma = None # dblocks self.dblocks = nn.ModuleList([ @@ -75,6 +84,7 @@ class Wavegrad(nn.Module): def compute_y_n(self, y_0): + """Compute noisy audio based on noise schedule""" self.noise_level = self.noise_level.to(y_0) if len(y_0.shape) == 3: y_0 = y_0.squeeze(1) @@ -87,6 +97,7 @@ class 
Wavegrad(nn.Module): return noise.unsqueeze(1), noisy_audio.unsqueeze(1), noise_scale[:, 0] def compute_noise_level(self, num_steps, min_val, max_val): + """Compute noise schedule parameters""" beta = np.linspace(min_val, max_val, num_steps) alpha = 1 - beta alpha_hat = np.cumprod(alpha) @@ -101,5 +112,3 @@ class Wavegrad(nn.Module): self.c1 = 1 / self.alpha**0.5 self.c2 = (1 - self.alpha) / (1 - self.alpha_hat)**0.5 self.sigma = ((1.0 - self.alpha_hat[:-1]) / (1.0 - self.alpha_hat[1:]) * self.beta[1:])**0.5 - - diff --git a/run_tests.sh b/run_tests.sh index 27f54b24..998d8ec4 100755 --- a/run_tests.sh +++ b/run_tests.sh @@ -1,12 +1,14 @@ TF_CPP_MIN_LOG_LEVEL=3 # tests -nosetests tests -x &&\ +# nosetests tests -x &&\ # runtime tests ./tests/test_server_package.sh && \ ./tests/test_tts_train.sh && \ -./tests/test_vocoder_train.sh && \ +./tests/test_vocoder_gan_train.sh && \ +./tests/test_vocoder_wavernn_train.sh && \ +./tests/test_glow-tts_train.sh && \ # linter check cardboardlinter --refspec master \ No newline at end of file diff --git a/tests/test_encoder.py b/tests/test_encoder.py index a646eaa6..4d4dbba1 100644 --- a/tests/test_encoder.py +++ b/tests/test_encoder.py @@ -62,7 +62,7 @@ class GE2ELossTests(unittest.TestCase): assert output.item() >= 0.0 # check speaker loss with orthogonal d-vectors dummy_input = T.empty(3, 64) - dummy_input = T.nn.init.orthogonal(dummy_input) + dummy_input = T.nn.init.orthogonal_(dummy_input) dummy_input = T.cat( [ dummy_input[0].repeat(5, 1, 1).transpose(0, 1), @@ -91,7 +91,7 @@ class AngleProtoLossTests(unittest.TestCase): # check speaker loss with orthogonal d-vectors dummy_input = T.empty(3, 64) - dummy_input = T.nn.init.orthogonal(dummy_input) + dummy_input = T.nn.init.orthogonal_(dummy_input) dummy_input = T.cat( [ dummy_input[0].repeat(5, 1, 1).transpose(0, 1), diff --git a/tests/test_server_package.sh b/tests/test_server_package.sh index 83ffc6f0..7e75415a 100755 --- a/tests/test_server_package.sh +++ b/tests/test_server_package.sh @@ -6,12 +6,12 @@ if [[ ! 
-f tests/outputs/checkpoint_10.pth.tar ]]; then exit 1 fi +rm -f dist/*.whl +python setup.py --quiet bdist_wheel --checkpoint tests/outputs/checkpoint_10.pth.tar --model_config tests/outputs/dummy_model_config.json + python -m venv /tmp/venv source /tmp/venv/bin/activate pip install --quiet --upgrade pip setuptools wheel - -rm -f dist/*.whl -python setup.py --quiet bdist_wheel --checkpoint tests/outputs/checkpoint_10.pth.tar --model_config tests/outputs/dummy_model_config.json pip install --quiet dist/TTS*.whl # this is related to https://github.com/librosa/librosa/issues/1160 diff --git a/tests/test_wavegrad_train.py b/tests/test_wavegrad_train.py index 1fd1d10e..d517b66b 100644 --- a/tests/test_wavegrad_train.py +++ b/tests/test_wavegrad_train.py @@ -30,9 +30,18 @@ class WavegradTrainTest(unittest.TestCase): upsample_dilations=[[1, 2, 1, 2], [1, 2, 1, 2], [1, 2, 4, 8], [1, 2, 4, 8], [1, 2, 4, 8]]) + + model_ref = Wavegrad(in_channels=80, + out_channels=1, + upsample_factors=[5, 5, 3, 2, 2], + upsample_dilations=[[1, 2, 1, 2], [1, 2, 1, 2], + [1, 2, 4, 8], [1, 2, 4, 8], + [1, 2, 4, 8]]) model.train() model.to(device) - model_ref = copy.deepcopy(model) + model.compute_noise_level(1000, 1e-6, 1e-2) + model_ref.load_state_dict(model.state_dict()) + model_ref.to(device) count = 0 for param, param_ref in zip(model.parameters(), model_ref.parameters()): From 73581cd94cea080ffb09be443f3817d2ef3fb7dc Mon Sep 17 00:00:00 2001 From: erogol Date: Thu, 29 Oct 2020 16:50:07 +0100 Subject: [PATCH 63/98] renaming train scripts and updating tests --- README.md | 16 +++++++++------- TTS/bin/{train_tts.py => train_tacotron.py} | 13 +++++-------- ...train_gan_vocoder.py => train_vocoder_gan.py} | 0 ...ain_wavegrad.py => train_vocoder_wavegrad.py} | 8 +++----- ...vernn_vocoder.py => train_vocoder_wavernn.py} | 0 TTS/tts/configs/config.json | 9 ++++++--- run_tests.sh | 3 ++- tests/inputs/test_train_config.json | 10 ++++++++++ ...{test_tts_train.sh => test_tacotron_train.sh} | 0 tests/test_vocoder_gan_train.sh | 4 ++-- tests/test_vocoder_wavernn_train.sh | 4 ++-- 11 files changed, 39 insertions(+), 28 deletions(-) rename TTS/bin/{train_tts.py => train_tacotron.py} (98%) rename TTS/bin/{train_gan_vocoder.py => train_vocoder_gan.py} (100%) rename TTS/bin/{train_wavegrad.py => train_vocoder_wavegrad.py} (99%) rename TTS/bin/{train_wavernn_vocoder.py => train_vocoder_wavernn.py} (100%) rename tests/{test_tts_train.sh => test_tacotron_train.sh} (100%) diff --git a/README.md b/README.md index 5b048c42..7488103c 100644 --- a/README.md +++ b/README.md @@ -150,23 +150,25 @@ head -n 12000 metadata_shuf.csv > metadata_train.csv tail -n 1100 metadata_shuf.csv > metadata_val.csv ``` -To train a new model, you need to define your own ```config.json``` file (check the example) and call with the command below. You also set the model architecture in ```config.json```. +To train a new model, you need to define your own ```config.json``` to define model details, trainin configuration and more (check the examples). Then call the corressponding train script. -```python TTS/bin/train_tts.py --config_path TTS/tts/configs/config.json``` +For instance, in order to train a tacotron or tacotron2 model on LJSpeech dataset, follow these steps. + +```python TTS/bin/train_tacotron.py --config_path TTS/tts/configs/config.json``` To fine-tune a model, use ```--restore_path```. 
-```python TTS/bin/train_tts.py --config_path TTS/tts/configs/config.json --restore_path /path/to/your/model.pth.tar``` +```python TTS/bin/train_tacotron.py --config_path TTS/tts/configs/config.json --restore_path /path/to/your/model.pth.tar``` To continue an old training run, use ```--continue_path```. -```python TTS/bin/train_tts.py --continue_path /path/to/your/run_folder/``` +```python TTS/bin/train_tacotron.py --continue_path /path/to/your/run_folder/``` -For multi-GPU training use ```distribute.py```. It enables process based multi-GPU training where each process uses a single GPU. +For multi-GPU training, call ```distribute.py```. It runs any provided train script in multi-GPU setting. -```CUDA_VISIBLE_DEVICES="0,1,4" TTS/bin/distribute.py --script train_tts.py --config_path TTS/tts/configs/config.json``` +```CUDA_VISIBLE_DEVICES="0,1,4" python TTS/bin/distribute.py --script train_tacotron.py --config_path TTS/tts/configs/config.json``` -Each run creates a new output folder and ```config.json``` is copied under this folder. +Each run creates a new output folder accomodating used ```config.json```, model checkpoints and tensorboard logs. In case of any error or intercepted execution, if there is no checkpoint yet under the output folder, the whole folder is going to be removed. diff --git a/TTS/bin/train_tts.py b/TTS/bin/train_tacotron.py similarity index 98% rename from TTS/bin/train_tts.py rename to TTS/bin/train_tacotron.py index 8029ab21..dd9f0e55 100644 --- a/TTS/bin/train_tts.py +++ b/TTS/bin/train_tacotron.py @@ -7,27 +7,25 @@ import os import sys import time import traceback +from random import randrange import numpy as np import torch - -from random import randrange from torch.utils.data import DataLoader from TTS.tts.datasets.preprocess import load_meta_data from TTS.tts.datasets.TTSDataset import MyDataset from TTS.tts.layers.losses import TacotronLoss -from TTS.tts.utils.distribute import (DistributedSampler, - apply_gradient_allreduce, - init_distributed, reduce_tensor) -from TTS.tts.utils.generic_utils import setup_model, check_config_tts +from TTS.tts.utils.generic_utils import check_config_tts, setup_model from TTS.tts.utils.io import save_best_model, save_checkpoint from TTS.tts.utils.measures import alignment_diagonal_score -from TTS.tts.utils.speakers import parse_speakers, load_speaker_mapping +from TTS.tts.utils.speakers import load_speaker_mapping, parse_speakers from TTS.tts.utils.synthesis import synthesis from TTS.tts.utils.text.symbols import make_symbols, phonemes, symbols from TTS.tts.utils.visual import plot_alignment, plot_spectrogram from TTS.utils.audio import AudioProcessor from TTS.utils.console_logger import ConsoleLogger +from TTS.utils.distribute import (DistributedSampler, apply_gradient_allreduce, + init_distributed, reduce_tensor) from TTS.utils.generic_utils import (KeepAverage, count_parameters, create_experiment_folder, get_git_branch, remove_experiment_folder, set_init_dict) @@ -38,7 +36,6 @@ from TTS.utils.training import (NoamLR, adam_weight_decay, check_update, gradual_training_scheduler, set_weight_decay, setup_torch_training_env) - use_cuda, num_gpus = setup_torch_training_env(True, False) diff --git a/TTS/bin/train_gan_vocoder.py b/TTS/bin/train_vocoder_gan.py similarity index 100% rename from TTS/bin/train_gan_vocoder.py rename to TTS/bin/train_vocoder_gan.py diff --git a/TTS/bin/train_wavegrad.py b/TTS/bin/train_vocoder_wavegrad.py similarity index 99% rename from TTS/bin/train_wavegrad.py rename to TTS/bin/train_vocoder_wavegrad.py 
index 13434979..96191569 100644 --- a/TTS/bin/train_wavegrad.py +++ b/TTS/bin/train_vocoder_wavegrad.py @@ -132,10 +132,6 @@ def train(model, criterion, optimizer, optimizer.zero_grad() - # schedule update - if scheduler is not None: - scheduler.step() - # backward pass with loss scaling if c.mixed_precision: scaler.scale(loss).backward() @@ -150,7 +146,9 @@ def train(model, criterion, optimizer, c.clip_grad) optimizer.step() - + # schedule update + if scheduler is not None: + scheduler.step() # disconnect loss values loss_dict = dict() diff --git a/TTS/bin/train_wavernn_vocoder.py b/TTS/bin/train_vocoder_wavernn.py similarity index 100% rename from TTS/bin/train_wavernn_vocoder.py rename to TTS/bin/train_vocoder_wavernn.py diff --git a/TTS/tts/configs/config.json b/TTS/tts/configs/config.json index 1b63b037..55f9306c 100644 --- a/TTS/tts/configs/config.json +++ b/TTS/tts/configs/config.json @@ -68,11 +68,14 @@ "apex_amp_level": null, // level of optimization with NVIDIA's apex feature for automatic mixed FP16/FP32 precision (AMP), NOTE: currently only O1 is supported, and use "O1" to activate. // LOSS SETTINGS - "loss_masking": true, // enable / disable loss masking against the sequence padding. + "loss_masking": false, // enable / disable loss masking against the sequence padding. "decoder_loss_alpha": 0.5, // decoder loss weight. If > 0, it is enabled "postnet_loss_alpha": 0.25, // postnet loss weight. If > 0, it is enabled - "ga_alpha": 5.0, // weight for guided attention loss. If > 0, guided attention is enabled. - "diff_spec_alpha": 0.25, // differential spectral loss weight. If > 0, it is enabled + "ga_alpha": 10.0, // weight for guided attention loss. If > 0, guided attention is enabled. + "decoder_diff_spec_alpha": 0.25, // differential spectral loss weight. If > 0, it is enabled + "postnet_diff_spec_alpha": 0.25, // differential spectral loss weight. If > 0, it is enabled + "decoder_ssim_alpha": 0.5, // differential spectral loss weight. If > 0, it is enabled + "postnet_ssim_alpha": 0.25, // differential spectral loss weight. If > 0, it is enabled // VALIDATION "run_eval": true, diff --git a/run_tests.sh b/run_tests.sh index 998d8ec4..46f18f01 100755 --- a/run_tests.sh +++ b/run_tests.sh @@ -6,9 +6,10 @@ TF_CPP_MIN_LOG_LEVEL=3 # runtime tests ./tests/test_server_package.sh && \ ./tests/test_tts_train.sh && \ +./tests/test_glow-tts_train.sh && \ ./tests/test_vocoder_gan_train.sh && \ ./tests/test_vocoder_wavernn_train.sh && \ -./tests/test_glow-tts_train.sh && \ +./tests/test_vocoder_wavegrad_train.sh && \ # linter check cardboardlinter --refspec master \ No newline at end of file diff --git a/tests/inputs/test_train_config.json b/tests/inputs/test_train_config.json index ddb71384..2e2d6d46 100644 --- a/tests/inputs/test_train_config.json +++ b/tests/inputs/test_train_config.json @@ -74,6 +74,16 @@ "test_delay_epochs": 0, //Until attention is aligned, testing only wastes computation time. "test_sentences_file": null, // set a file to load sentences to be used for testing. If it is null then we use default english sentences. + // LOSS SETTINGS + "loss_masking": false, // enable / disable loss masking against the sequence padding. + "decoder_loss_alpha": 0.5, // decoder loss weight. If > 0, it is enabled + "postnet_loss_alpha": 0.25, // postnet loss weight. If > 0, it is enabled + "ga_alpha": 10.0, // weight for guided attention loss. If > 0, guided attention is enabled. + "decoder_diff_spec_alpha": 0.25, // differential spectral loss weight. 
If > 0, it is enabled + "postnet_diff_spec_alpha": 0.25, // differential spectral loss weight. If > 0, it is enabled + "decoder_ssim_alpha": 0.5, // differential spectral loss weight. If > 0, it is enabled + "postnet_ssim_alpha": 0.25, // differential spectral loss weight. If > 0, it is enabled + // OPTIMIZER "noam_schedule": false, // use noam warmup and lr schedule. "grad_clip": 1.0, // upper limit for gradients for clipping. diff --git a/tests/test_tts_train.sh b/tests/test_tacotron_train.sh similarity index 100% rename from tests/test_tts_train.sh rename to tests/test_tacotron_train.sh diff --git a/tests/test_vocoder_gan_train.sh b/tests/test_vocoder_gan_train.sh index 75773cc3..474ef9a7 100755 --- a/tests/test_vocoder_gan_train.sh +++ b/tests/test_vocoder_gan_train.sh @@ -5,11 +5,11 @@ echo "$BASEDIR" # create run dir mkdir $BASEDIR/train_outputs # run training -CUDA_VISIBLE_DEVICES="" python TTS/bin/train_gan_vocoder.py --config_path $BASEDIR/inputs/test_vocoder_multiband_melgan_config.json +CUDA_VISIBLE_DEVICES="" python TTS/bin/train_vocoder_gan.py --config_path $BASEDIR/inputs/test_vocoder_multiband_melgan_config.json # find the training folder LATEST_FOLDER=$(ls $BASEDIR/train_outputs/| sort | tail -1) echo $LATEST_FOLDER # continue the previous training -CUDA_VISIBLE_DEVICES="" python TTS/bin/train_gan_vocoder.py --continue_path $BASEDIR/train_outputs/$LATEST_FOLDER +CUDA_VISIBLE_DEVICES="" python TTS/bin/train_vocoder_gan.py --continue_path $BASEDIR/train_outputs/$LATEST_FOLDER # remove all the outputs rm -rf $BASEDIR/train_outputs/$LATEST_FOLDER diff --git a/tests/test_vocoder_wavernn_train.sh b/tests/test_vocoder_wavernn_train.sh index f2e32116..ffa30d40 100755 --- a/tests/test_vocoder_wavernn_train.sh +++ b/tests/test_vocoder_wavernn_train.sh @@ -5,11 +5,11 @@ echo "$BASEDIR" # create run dir mkdir $BASEDIR/train_outputs # run training -CUDA_VISIBLE_DEVICES="" python TTS/bin/train_wavernn_vocoder.py --config_path $BASEDIR/inputs/test_vocoder_wavernn_config.json +CUDA_VISIBLE_DEVICES="" python TTS/bin/train_vocoder_wavernn.py --config_path $BASEDIR/inputs/test_vocoder_wavernn_config.json # find the training folder LATEST_FOLDER=$(ls $BASEDIR/train_outputs/| sort | tail -1) echo $LATEST_FOLDER # continue the previous training -CUDA_VISIBLE_DEVICES="" python TTS/bin/train_wavernn_vocoder.py --continue_path $BASEDIR/train_outputs/$LATEST_FOLDER +CUDA_VISIBLE_DEVICES="" python TTS/bin/train_vocoder_wavernn.py --continue_path $BASEDIR/train_outputs/$LATEST_FOLDER # remove all the outputs rm -rf $BASEDIR/train_outputs/$LATEST_FOLDER \ No newline at end of file From 750a38f54555ce410c85418aebdd738b0ce9b3dc Mon Sep 17 00:00:00 2001 From: erogol Date: Thu, 29 Oct 2020 23:51:34 +0100 Subject: [PATCH 64/98] readme update --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 7488103c..472b504b 100644 --- a/README.md +++ b/README.md @@ -49,6 +49,7 @@ Vocoders: - MultiBandMelGAN: [paper](https://arxiv.org/abs/2005.05106) - GAN-TTS discriminators: [paper](https://arxiv.org/abs/1909.11646) - WaveRNN: [origin][https://github.com/fatchord/WaveRNN/] +- WaveGrad: [paper][https://arxiv.org/abs/2009.00713] You can also help us implement more models. Some TTS related work can be found [here](https://github.com/erogol/TTS-papers). 
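
For reference, the WaveGrad commits above define the diffusion noise schedule (`compute_noise_level`), the training-time corruption of clean audio (`compute_y_n`), and the constants of the reverse denoising update. A standalone sketch of that math, using the 1000-step schedule from 1e-6 to 1e-2 that the tests use and a random placeholder batch of audio, is:

```python
import numpy as np
import torch

# schedule parameters (values taken from the tests in the patches above)
num_steps, min_val, max_val = 1000, 1e-6, 1e-2
beta = np.linspace(min_val, max_val, num_steps)            # per-step noise variance
alpha = 1.0 - beta
alpha_hat = np.cumprod(alpha)                              # cumulative product of alpha
noise_level = np.concatenate([[1.0], alpha_hat ** 0.5])    # sqrt(alpha_hat), prepended with 1.0

# training-time corruption of clean audio y_0 (what compute_y_n does)
y_0 = torch.randn(4, 22050)                                # placeholder batch of clean audio
s = np.random.randint(1, num_steps + 1, size=y_0.shape[0])
l_a, l_b = noise_level[s - 1], noise_level[s]
noise_scale = torch.from_numpy(
    l_a + np.random.rand(y_0.shape[0]) * (l_b - l_a)).float().unsqueeze(1)
noise = torch.randn_like(y_0)
noisy_audio = noise_scale * y_0 + (1.0 - noise_scale ** 2) ** 0.5 * noise

# inference-time constants of the reverse (denoising) update
c1 = 1.0 / alpha ** 0.5
c2 = (1.0 - alpha) / (1.0 - alpha_hat) ** 0.5
sigma = ((1.0 - alpha_hat[:-1]) / (1.0 - alpha_hat[1:]) * beta[1:]) ** 0.5
# one reverse step: y = c1[n] * (y - c2[n] * model(y, mel, noise_level[n])) + sigma[n - 1] * z
```

Because `noise_level` has `num_steps + 1` entries, sampling `s` in `[1, num_steps]` and interpolating between `noise_level[s - 1]` and `noise_level[s]` yields a continuous noise level, which is what the network is conditioned on at train time instead of a discrete step index.
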
From a44ef58aea7820aa3ac9ae064e8b45d15653d6a5 Mon Sep 17 00:00:00 2001 From: erogol Date: Fri, 30 Oct 2020 13:23:24 +0100 Subject: [PATCH 65/98] wavegrad weight norm refactoring --- TTS/vocoder/configs/wavegrad_libritts.json | 1 + TTS/vocoder/layers/wavegrad.py | 133 +++++++++++++-------- TTS/vocoder/models/wavegrad.py | 68 +++++++++-- TTS/vocoder/utils/generic_utils.py | 1 + tests/test_wavegrad_layers.py | 12 ++ 5 files changed, 156 insertions(+), 59 deletions(-) diff --git a/TTS/vocoder/configs/wavegrad_libritts.json b/TTS/vocoder/configs/wavegrad_libritts.json index 5720a482..57c26709 100644 --- a/TTS/vocoder/configs/wavegrad_libritts.json +++ b/TTS/vocoder/configs/wavegrad_libritts.json @@ -45,6 +45,7 @@ // MODEL PARAMETERS "generator_model": "wavegrad", "model_params":{ + "use_weight_norm": true, "y_conv_channels":32, "x_conv_channels":768, "ublock_out_channels": [512, 512, 256, 128, 128], diff --git a/TTS/vocoder/layers/wavegrad.py b/TTS/vocoder/layers/wavegrad.py index 2c781fd6..d09b4950 100644 --- a/TTS/vocoder/layers/wavegrad.py +++ b/TTS/vocoder/layers/wavegrad.py @@ -39,8 +39,8 @@ class FiLM(nn.Module): def __init__(self, input_size, output_size): super().__init__() self.encoding = PositionalEncoding(input_size) - self.input_conv = weight_norm(nn.Conv1d(input_size, input_size, 3, padding=1)) - self.output_conv = weight_norm(nn.Conv1d(input_size, output_size * 2, 3, padding=1)) + self.input_conv = nn.Conv1d(input_size, input_size, 3, padding=1) + self.output_conv = nn.Conv1d(input_size, output_size * 2, 3, padding=1) nn.init.xavier_uniform_(self.input_conv.weight) nn.init.xavier_uniform_(self.output_conv.weight) @@ -48,12 +48,20 @@ class FiLM(nn.Module): nn.init.zeros_(self.output_conv.bias) def forward(self, x, noise_scale): - x = self.input_conv(x) - x = F.leaky_relu(x, 0.2) - x = self.encoding(x, noise_scale) - shift, scale = torch.chunk(self.output_conv(x), 2, dim=1) + o = self.input_conv(x) + o = F.leaky_relu(o, 0.2) + o = self.encoding(o, noise_scale) + shift, scale = torch.chunk(self.output_conv(o), 2, dim=1) return shift, scale + def remove_weight_norm(self): + nn.utils.remove_weight_norm(self.input_conv) + nn.utils.remove_weight_norm(self.output_conv) + + def apply_weight_norm(self): + self.input_conv = weight_norm(self.input_conv) + self.output_conv = weight_norm(self.output_conv) + @torch.jit.script def shif_and_scale(x, scale, shift): @@ -68,79 +76,100 @@ class UBlock(nn.Module): assert len(dilation) == 4 self.factor = factor - self.block1 = weight_norm(Conv1d(input_size, hidden_size, 1)) - self.block2 = nn.ModuleList([ - weight_norm(Conv1d(input_size, + self.res_block = Conv1d(input_size, hidden_size, 1) + self.main_block = nn.ModuleList([ + Conv1d(input_size, hidden_size, 3, dilation=dilation[0], - padding=dilation[0])), - weight_norm(Conv1d(hidden_size, + padding=dilation[0]), + Conv1d(hidden_size, hidden_size, 3, dilation=dilation[1], - padding=dilation[1])) + padding=dilation[1]) ]) - self.block3 = nn.ModuleList([ - weight_norm(Conv1d(hidden_size, + self.out_block = nn.ModuleList([ + Conv1d(hidden_size, hidden_size, 3, dilation=dilation[2], - padding=dilation[2])), - weight_norm(Conv1d(hidden_size, + padding=dilation[2]), + Conv1d(hidden_size, hidden_size, 3, dilation=dilation[3], - padding=dilation[3])) + padding=dilation[3]) ]) def forward(self, x, shift, scale): - o1 = F.interpolate(x, size=x.shape[-1] * self.factor) - o1 = self.block1(o1) - - o2 = F.leaky_relu(x, 0.2) - o2 = F.interpolate(o2, size=x.shape[-1] * self.factor) - o2 = self.block2[0](o2) - o2 = 
shif_and_scale(o2, scale, shift) - o2 = F.leaky_relu(o2, 0.2) - o2 = self.block2[1](o2) - - x = o1 + o2 - - o3 = shif_and_scale(x, scale, shift) - o3 = F.leaky_relu(o3, 0.2) - o3 = self.block3[0](o3) - - o3 = shif_and_scale(o3, scale, shift) - o3 = F.leaky_relu(o3, 0.2) - o3 = self.block3[1](o3) - - o = x + o3 + x_inter = F.interpolate(x, size=x.shape[-1] * self.factor) + res = self.res_block(x_inter) + o = F.leaky_relu(x_inter, 0.2) + o = F.interpolate(o, size=x.shape[-1] * self.factor) + o = self.main_block[0](o) + o = shif_and_scale(o, scale, shift) + o = F.leaky_relu(o, 0.2) + o = self.main_block[1](o) + res2 = res + o + o = shif_and_scale(res2, scale, shift) + o = F.leaky_relu(o, 0.2) + o = self.out_block[0](o) + o = shif_and_scale(o, scale, shift) + o = F.leaky_relu(o, 0.2) + o = self.out_block[1](o) + o = o + res2 return o + def remove_weight_norm(self): + nn.utils.remove_weight_norm(self.res_block) + for _, layer in enumerate(self.main_block): + if len(layer.state_dict()) != 0: + nn.utils.remove_weight_norm(layer) + for _, layer in enumerate(self.out_block): + if len(layer.state_dict()) != 0: + nn.utils.remove_weight_norm(layer) + + def apply_weight_norm(self): + self.res_block = weight_norm(self.res_block) + for idx, layer in enumerate(self.main_block): + if len(layer.state_dict()) != 0: + self.main_block[idx] = weight_norm(layer) + for idx, layer in enumerate(self.out_block): + if len(layer.state_dict()) != 0: + self.out_block[idx] = weight_norm(layer) + class DBlock(nn.Module): def __init__(self, input_size, hidden_size, factor): super().__init__() self.factor = factor - self.residual_dense = weight_norm(Conv1d(input_size, hidden_size, 1)) - self.conv = nn.ModuleList([ - weight_norm(Conv1d(input_size, hidden_size, 3, dilation=1, padding=1)), - weight_norm(Conv1d(hidden_size, hidden_size, 3, dilation=2, padding=2)), - weight_norm(Conv1d(hidden_size, hidden_size, 3, dilation=4, padding=4)), + self.res_block = Conv1d(input_size, hidden_size, 1) + self.main_block = nn.ModuleList([ + Conv1d(input_size, hidden_size, 3, dilation=1, padding=1), + Conv1d(hidden_size, hidden_size, 3, dilation=2, padding=2), + Conv1d(hidden_size, hidden_size, 3, dilation=4, padding=4), ]) def forward(self, x): size = x.shape[-1] // self.factor + res = self.res_block(x) + res = F.interpolate(res, size=size) + o = F.interpolate(x, size=size) + for layer in self.main_block: + o = F.leaky_relu(o, 0.2) + o = layer(o) + return o + res - residual = self.residual_dense(x) - residual = F.interpolate(residual, size=size) - - x = F.interpolate(x, size=size) - for layer in self.conv: - x = F.leaky_relu(x, 0.2) - x = layer(x) - - return x + residual + def remove_weight_norm(self): + nn.utils.remove_weight_norm(self.res_block) + for _, layer in enumerate(self.main_block): + if len(layer.state_dict()) != 0: + nn.utils.remove_weight_norm(layer) + def apply_weight_norm(self): + self.res_block = weight_norm(self.res_block) + for idx, layer in enumerate(self.main_block): + if len(layer.state_dict()) != 0: + self.main_block[idx] = weight_norm(layer) diff --git a/TTS/vocoder/models/wavegrad.py b/TTS/vocoder/models/wavegrad.py index 9dc2193c..1130eb47 100644 --- a/TTS/vocoder/models/wavegrad.py +++ b/TTS/vocoder/models/wavegrad.py @@ -1,6 +1,7 @@ import numpy as np import torch from torch import nn +from torch.nn.utils import weight_norm from ..layers.wavegrad import DBlock, FiLM, UBlock, Conv1d @@ -10,6 +11,7 @@ class Wavegrad(nn.Module): def __init__(self, in_channels=80, out_channels=1, + use_weight_norm=False, 
y_conv_channels=32, x_conv_channels=768, dblock_out_channels=[128, 128, 256, 512], @@ -19,6 +21,7 @@ class Wavegrad(nn.Module): [1, 2, 4, 8], [1, 2, 4, 8]]): super().__init__() + self.use_weight_norm = use_weight_norm self.hop_len = np.prod(upsample_factors) self.noise_level = None self.num_steps = None @@ -31,9 +34,8 @@ class Wavegrad(nn.Module): self.sigma = None # dblocks - self.dblocks = nn.ModuleList([ - Conv1d(1, y_conv_channels, 5, padding=2), - ]) + self.y_conv = Conv1d(1, y_conv_channels, 5, padding=2) + self.dblocks = nn.ModuleList([]) ic = y_conv_channels for oc, df in zip(dblock_out_channels, reversed(upsample_factors)): self.dblocks.append(DBlock(ic, oc, df)) @@ -56,15 +58,22 @@ class Wavegrad(nn.Module): self.x_conv = Conv1d(in_channels, x_conv_channels, 3, padding=1) self.out_conv = Conv1d(oc, out_channels, 3, padding=1) + if use_weight_norm: + self.apply_weight_norm() + def forward(self, x, spectrogram, noise_scale): - downsampled = [] - for film, layer in zip(self.film, self.dblocks): + shift_and_scale = [] + + x = self.y_conv(x) + shift_and_scale.append(self.film[0](x, noise_scale)) + + for film, layer in zip(self.film[1:], self.dblocks): x = layer(x) - downsampled.append(film(x, noise_scale)) + shift_and_scale.append(film(x, noise_scale)) x = self.x_conv(spectrogram) for layer, (film_shift, film_scale) in zip(self.ublocks, - reversed(downsampled)): + reversed(shift_and_scale)): x = layer(x, film_shift, film_scale) x = self.out_conv(x) return x @@ -113,3 +122,48 @@ class Wavegrad(nn.Module): self.c1 = 1 / self.alpha**0.5 self.c2 = (1 - self.alpha) / (1 - self.alpha_hat)**0.5 self.sigma = ((1.0 - self.alpha_hat[:-1]) / (1.0 - self.alpha_hat[1:]) * self.beta[1:])**0.5 + + def remove_weight_norm(self): + for _, layer in enumerate(self.dblocks): + if len(layer.state_dict()) != 0: + try: + nn.utils.remove_weight_norm(layer) + except ValueError: + layer.remove_weight_norm() + + for _, layer in enumerate(self.film): + if len(layer.state_dict()) != 0: + try: + nn.utils.remove_weight_norm(layer) + except ValueError: + layer.remove_weight_norm() + + + for _, layer in enumerate(self.ublocks): + if len(layer.state_dict()) != 0: + try: + nn.utils.remove_weight_norm(layer) + except ValueError: + layer.remove_weight_norm() + + nn.utils.remove_weight_norm(self.x_conv) + nn.utils.remove_weight_norm(self.out_conv) + nn.utils.remove_weight_norm(self.y_conv) + + def apply_weight_norm(self): + for _, layer in enumerate(self.dblocks): + if len(layer.state_dict()) != 0: + layer.apply_weight_norm() + + for _, layer in enumerate(self.film): + if len(layer.state_dict()) != 0: + layer.apply_weight_norm() + + + for _, layer in enumerate(self.ublocks): + if len(layer.state_dict()) != 0: + layer.apply_weight_norm() + + self.x_conv = weight_norm(self.x_conv) + self.out_conv = weight_norm(self.out_conv) + self.y_conv = weight_norm(self.y_conv) diff --git a/TTS/vocoder/utils/generic_utils.py b/TTS/vocoder/utils/generic_utils.py index 761b14d7..d6e2e13b 100644 --- a/TTS/vocoder/utils/generic_utils.py +++ b/TTS/vocoder/utils/generic_utils.py @@ -118,6 +118,7 @@ def setup_generator(c): model = MyModel( in_channels=c['audio']['num_mels'], out_channels=1, + use_weight_norm=c['model_params']['use_weight_norm'], x_conv_channels=c['model_params']['x_conv_channels'], y_conv_channels=c['model_params']['y_conv_channels'], dblock_out_channels=c['model_params']['dblock_out_channels'], diff --git a/tests/test_wavegrad_layers.py b/tests/test_wavegrad_layers.py index a1c6a7e5..d81ae47d 100644 --- 
a/tests/test_wavegrad_layers.py +++ b/tests/test_wavegrad_layers.py @@ -32,6 +32,9 @@ def test_film(): assert scale.shape[2] == 100 assert isinstance(scale, torch.FloatTensor) + layer.apply_weight_norm() + layer.remove_weight_norm() + def test_ublock(): inp1 = torch.rand(32, 50, 100) @@ -49,6 +52,9 @@ def test_ublock(): assert o.shape[2] == 100 assert isinstance(o, torch.FloatTensor) + layer.apply_weight_norm() + layer.remove_weight_norm() + def test_dblock(): inp = torch.rand(32, 50, 130) @@ -60,6 +66,9 @@ def test_dblock(): assert o.shape[2] == 65 assert isinstance(o, torch.FloatTensor) + layer.apply_weight_norm() + layer.remove_weight_norm() + def test_wavegrad_forward(): x = torch.rand(32, 1, 20 * 300) @@ -78,3 +87,6 @@ def test_wavegrad_forward(): assert o.shape[1] == 1 assert o.shape[2] == 20 * 300 assert isinstance(o, torch.FloatTensor) + + model.apply_weight_norm() + model.remove_weight_norm() From ef04d7fae7331897db6ac38069de3095538714e3 Mon Sep 17 00:00:00 2001 From: erogol Date: Fri, 30 Oct 2020 14:08:41 +0100 Subject: [PATCH 66/98] bug fix for wavernn training --- TTS/bin/train_vocoder_wavernn.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/TTS/bin/train_vocoder_wavernn.py b/TTS/bin/train_vocoder_wavernn.py index acc4b703..8d563217 100644 --- a/TTS/bin/train_vocoder_wavernn.py +++ b/TTS/bin/train_vocoder_wavernn.py @@ -509,7 +509,7 @@ if __name__ == "__main__": new_fields["restore_path"] = args.restore_path new_fields["github_branch"] = get_git_branch() copy_config_file( - args.config_path, os.path.join(OUT_PATH, "c.json"), new_fields + args.config_path, os.path.join(OUT_PATH, "config.json"), new_fields ) os.chmod(AUDIO_PATH, 0o775) os.chmod(OUT_PATH, 0o775) From b8ac9aba9dafa59a0755c9e1e2d9ae540fe27726 Mon Sep 17 00:00:00 2001 From: erogol Date: Mon, 2 Nov 2020 12:44:41 +0100 Subject: [PATCH 67/98] check against NaN loss in tacotron_loss --- .compute | 6 +++--- TTS/tts/layers/losses.py | 7 ++++++- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/.compute b/.compute index 02588f5b..cda787d2 100644 --- a/.compute +++ b/.compute @@ -1,14 +1,14 @@ #!/bin/bash yes | apt-get install sox yes | apt-get install ffmpeg -yes | apt-get install espeak +yes | apt-get install espeak yes | apt-get install tmux yes | apt-get install zsh sh -c "$(curl -fsSL https://raw.githubusercontent.com/robbyrussell/oh-my-zsh/master/tools/install.sh)" pip3 install https://download.pytorch.org/whl/cu100/torch-1.3.0%2Bcu100-cp36-cp36m-linux_x86_64.whl sudo sh install.sh -pip install pytorch==1.3.0+cu100 -python3 setup.py develop +# pip install pytorch==1.7.0+cu100 +# python3 setup.py develop # python3 distribute.py --config_path config.json --data_path /data/ro/shared/data/keithito/LJSpeech-1.1/ # cp -R ${USER_DIR}/Mozilla_22050 ../tmp/ # python3 distribute.py --config_path config_tacotron_gst.json --data_path ../tmp/Mozilla_22050/ diff --git a/TTS/tts/layers/losses.py b/TTS/tts/layers/losses.py index f26cb884..c1bc85b5 100644 --- a/TTS/tts/layers/losses.py +++ b/TTS/tts/layers/losses.py @@ -228,7 +228,7 @@ class GuidedAttentionLoss(torch.nn.Module): @staticmethod def _make_ga_mask(ilen, olen, sigma): - grid_x, grid_y = torch.meshgrid(torch.arange(olen), torch.arange(ilen)) + grid_x, grid_y = torch.meshgrid(torch.arange(olen).to(olen), torch.arange(ilen).to(ilen)) grid_x, grid_y = grid_x.float(), grid_y.float() return 1.0 - torch.exp(-(grid_y / ilen - grid_x / olen)**2 / (2 * (sigma**2))) @@ -373,6 +373,11 @@ class TacotronLoss(torch.nn.Module): 
return_dict['postnet_ssim_loss'] = postnet_ssim_loss return_dict['loss'] = loss + + # check if any loss is NaN + for key, loss in return_dict.items(): + if torch.isnan(loss): + raise RuntimeError(f" [!] NaN loss with {key}.") return return_dict From a108d0ee812ef9458276afb5cb868b8a2be9a87e Mon Sep 17 00:00:00 2001 From: erogol Date: Mon, 2 Nov 2020 13:12:19 +0100 Subject: [PATCH 68/98] check nan loss in glow-tts loss --- TTS/tts/layers/losses.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/TTS/tts/layers/losses.py b/TTS/tts/layers/losses.py index c1bc85b5..50809de5 100644 --- a/TTS/tts/layers/losses.py +++ b/TTS/tts/layers/losses.py @@ -402,4 +402,9 @@ class GlowTTSLoss(torch.nn.Module): return_dict['loss'] = log_mle + loss_dur return_dict['log_mle'] = log_mle return_dict['loss_dur'] = loss_dur + + # check if any loss is NaN + for key, loss in return_dict.items(): + if torch.isnan(loss): + raise RuntimeError(f" [!] NaN loss with {key}.") return return_dict \ No newline at end of file From d94782a076662f906026572ea2744549e549c7bb Mon Sep 17 00:00:00 2001 From: erogol Date: Mon, 2 Nov 2020 13:18:56 +0100 Subject: [PATCH 69/98] reset the way ga_loss is stored in return_dict --- TTS/tts/layers/losses.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/TTS/tts/layers/losses.py b/TTS/tts/layers/losses.py index 50809de5..bafa9440 100644 --- a/TTS/tts/layers/losses.py +++ b/TTS/tts/layers/losses.py @@ -346,7 +346,7 @@ class TacotronLoss(torch.nn.Module): if self.config.ga_alpha > 0: ga_loss = self.criterion_ga(alignments, input_lens, alignment_lens) loss += ga_loss * self.ga_alpha - return_dict['ga_loss'] = ga_loss * self.ga_alpha + return_dict['ga_loss'] = ga_loss From c80225544e2fb43abbccd94148cc2045d95f8f63 Mon Sep 17 00:00:00 2001 From: erogol Date: Fri, 6 Nov 2020 13:04:46 +0100 Subject: [PATCH 70/98] tune wavegrad to find the best noise schedule for inference --- TTS/bin/tune_wavegrad.py | 89 ++++++++++++++++++++++++ TTS/utils/audio.py | 2 +- TTS/vocoder/datasets/wavegrad_dataset.py | 31 +++++++-- TTS/vocoder/models/wavegrad.py | 20 ++++-- 4 files changed, 130 insertions(+), 12 deletions(-) create mode 100644 TTS/bin/tune_wavegrad.py diff --git a/TTS/bin/tune_wavegrad.py b/TTS/bin/tune_wavegrad.py new file mode 100644 index 00000000..ef971dfa --- /dev/null +++ b/TTS/bin/tune_wavegrad.py @@ -0,0 +1,89 @@ +"""Search a good noise schedule for WaveGrad for a given number of inference iterations""" +import argparse +from itertools import product as cartesian_product + +import numpy as np +import torch +from torch.utils.data import DataLoader +from tqdm import tqdm +from TTS.utils.audio import AudioProcessor +from TTS.utils.io import load_config +from TTS.vocoder.datasets.preprocess import load_wav_data +from TTS.vocoder.datasets.wavegrad_dataset import WaveGradDataset +from TTS.vocoder.models.wavegrad import Wavegrad +from TTS.vocoder.utils.generic_utils import setup_generator + +parser = argparse.ArgumentParser() +parser.add_argument('--model_path', type=str, help='Path to model checkpoint.') +parser.add_argument('--config_path', type=str, help='Path to model config file.') +parser.add_argument('--data_path', type=str, help='Path to data directory.') +parser.add_argument('--output_path', type=str, help='path for output file including file name and extension.') +parser.add_argument('--num_iter', type=int, help='Number of model inference iterations that you like to optimize noise schedule 
for.') +parser.add_argument('--use_cuda', type=bool, help='enable/disable CUDA.') +parser.add_argument('--num_samples', type=int, default=1, help='Number of datasamples used for inference.') +parser.add_argument('--search_depth', type=int, default=3, help='Search granularity. Increasing this increases the run-time exponentially.') + +# load config +args = parser.parse_args() +config = load_config(args.config_path) + +# setup audio processor +ap = AudioProcessor(**config.audio) + +# load dataset +_, train_data = load_wav_data(args.data_path, 0) +train_data = train_data[:args.num_samples] +dataset = WaveGradDataset(ap=ap, + items=train_data, + seq_len=ap.hop_length * 100, + hop_len=ap.hop_length, + pad_short=config.pad_short, + conv_pad=config.conv_pad, + is_training=True, + return_segments=False, + use_noise_augment=False, + use_cache=False, + verbose=True) +loader = DataLoader( + dataset, + batch_size=1, + shuffle=False, + collate_fn=dataset.collate_full_clips, + drop_last=False, + num_workers=config.num_loader_workers, + pin_memory=False) + +# setup the model +model = setup_generator(config) +if args.use_cuda: + model.cuda() + +# setup optimization parameters +base_values = sorted(np.random.uniform(high=10, size=args.search_depth)) +best_error = float('inf') +best_schedule = None +total_search_iter = len(base_values)**args.num_iter +for base in tqdm(cartesian_product(base_values, repeat=args.num_iter), total=total_search_iter): + model.compute_noise_level(num_steps=args.num_iter, min_val=1e-6, max_val=1e-1, base_vals=base) + for data in loader: + mel, audio = data + y_hat = model.inference(mel.cuda() if args.use_cuda else mel) + + if args.use_cuda: + y_hat = y_hat.cpu() + y_hat = y_hat.numpy() + + mel_hat = [] + for i in range(y_hat.shape[0]): + m = ap.melspectrogram(y_hat[i, 0])[:, :-1] + mel_hat.append(torch.from_numpy(m)) + + mel_hat = torch.stack(mel_hat) + mse = torch.sum((mel - mel_hat) ** 2) + if mse.item() < best_error: + best_error = mse.item() + best_schedule = {'num_steps': args.num_iter, 'min_val':1e-6, 'max_val':1e-1, 'base_vals':base} + print(" > Found a better schedule.") + np.save(args.output_path, best_schedule) + + diff --git a/TTS/utils/audio.py b/TTS/utils/audio.py index aaa14dfd..26b65bed 100644 --- a/TTS/utils/audio.py +++ b/TTS/utils/audio.py @@ -174,7 +174,7 @@ class AudioProcessor(object): for key in stats_config.keys(): if key in skip_parameters: continue - if key != 'sample_rate': + if key not in ['sample_rate', 'trim_db']: assert stats_config[key] == self.__dict__[key],\ f" [!] Audio param {key} does not match the value used for computing mean-var stats. 
{stats_config[key]} vs {self.__dict__[key]}" return mel_mean, mel_std, linear_mean, linear_std, stats_config diff --git a/TTS/vocoder/datasets/wavegrad_dataset.py b/TTS/vocoder/datasets/wavegrad_dataset.py index c7b07b0d..83244c89 100644 --- a/TTS/vocoder/datasets/wavegrad_dataset.py +++ b/TTS/vocoder/datasets/wavegrad_dataset.py @@ -81,11 +81,12 @@ class WaveGradDataset(Dataset): else: audio = self.ap.load_wav(wavpath) - # correct audio length wrt segment length - if audio.shape[-1] < self.seq_len + self.pad_short: - audio = np.pad(audio, (0, self.seq_len + self.pad_short - len(audio)), \ - mode='constant', constant_values=0.0) - assert audio.shape[-1] >= self.seq_len + self.pad_short, f"{audio.shape[-1]} vs {self.seq_len + self.pad_short}" + if self.return_segments: + # correct audio length wrt segment length + if audio.shape[-1] < self.seq_len + self.pad_short: + audio = np.pad(audio, (0, self.seq_len + self.pad_short - len(audio)), \ + mode='constant', constant_values=0.0) + assert audio.shape[-1] >= self.seq_len + self.pad_short, f"{audio.shape[-1]} vs {self.seq_len + self.pad_short}" # correct the audio length wrt hop length p = (audio.shape[-1] // self.hop_len + 1) * self.hop_len - audio.shape[-1] @@ -104,8 +105,26 @@ class WaveGradDataset(Dataset): audio = audio + (1 / 32768) * torch.randn_like(audio) mel = self.ap.melspectrogram(audio) - mel = mel[..., :-1] + mel = mel[..., :-1] # ignore the padding audio = torch.from_numpy(audio).float() mel = torch.from_numpy(mel).float().squeeze(0) return (mel, audio) + + + def collate_full_clips(self, batch): + """This is used in tune_wavegrad.py. + It pads sequences to the max length.""" + max_mel_length = max([b[0].shape[1] for b in batch]) if len(batch) > 1 else batch[0][0].shape[1] + max_audio_length = max([b[1].shape[0] for b in batch]) if len(batch) > 1 else batch[0][1].shape[0] + + mels = torch.zeros([len(batch), batch[0][0].shape[0], max_mel_length]) + audios = torch.zeros([len(batch), max_audio_length]) + + for idx, b in enumerate(batch): + mel = b[0] + audio = b[1] + mels[idx, :, :mel.shape[1]] = mel + audios[idx, :audio.shape[0]] = audio + + return mels, audios diff --git a/TTS/vocoder/models/wavegrad.py b/TTS/vocoder/models/wavegrad.py index 1130eb47..f6087395 100644 --- a/TTS/vocoder/models/wavegrad.py +++ b/TTS/vocoder/models/wavegrad.py @@ -78,13 +78,21 @@ class Wavegrad(nn.Module): x = self.out_conv(x) return x + def load_noise_schedule(self, path): + sched = np.load(path, allow_pickle=True).item() + self.compute_noise_level(**sched) + @torch.no_grad() - def inference(self, x): - y_n = torch.randn(x.shape[0], 1, self.hop_len * x.shape[-1], dtype=torch.float32).to(x) - sqrt_alpha_hat = self.noise_level.unsqueeze(1).to(x) + def inference(self, x, y_n=None): + """ x: B x D X T """ + if y_n is None: + y_n = torch.randn(x.shape[0], 1, self.hop_len * x.shape[-1], dtype=torch.float32).to(x) + else: + y_n = torch.FloatTensor(y_n).unsqueeze(0).unsqueeze(0).to(x) + sqrt_alpha_hat = self.noise_level.to(x) for n in range(len(self.alpha) - 1, -1, -1): y_n = self.c1[n] * (y_n - - self.c2[n] * self.forward(y_n, x, sqrt_alpha_hat[n]).squeeze(1)) + self.c2[n] * self.forward(y_n, x, sqrt_alpha_hat[n].repeat(x.shape[0]))) if n > 0: z = torch.randn_like(y_n) y_n += self.sigma[n - 1] * z @@ -105,9 +113,11 @@ class Wavegrad(nn.Module): noisy_audio = noise_scale * y_0 + (1.0 - noise_scale**2)**0.5 * noise return noise.unsqueeze(1), noisy_audio.unsqueeze(1), noise_scale[:, 0] - def compute_noise_level(self, num_steps, min_val, max_val): + def 
compute_noise_level(self, num_steps, min_val, max_val, base_vals=None): """Compute noise schedule parameters""" beta = np.linspace(min_val, max_val, num_steps) + if base_vals is not None: + beta *= base_vals alpha = 1 - beta alpha_hat = np.cumprod(alpha) noise_level = np.concatenate([[1.0], alpha_hat ** 0.5], axis=0) From ea976b0543c7fa97628c41c4a936e3113896d18a Mon Sep 17 00:00:00 2001 From: erogol Date: Fri, 6 Nov 2020 13:34:11 +0100 Subject: [PATCH 71/98] python compat update for contextlib --- TTS/utils/generic_utils.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/TTS/utils/generic_utils.py b/TTS/utils/generic_utils.py index 686a3453..9ed2e20e 100644 --- a/TTS/utils/generic_utils.py +++ b/TTS/utils/generic_utils.py @@ -3,7 +3,7 @@ import glob import os import shutil import subprocess -from contextlib import nullcontext +import contextlib import torch @@ -12,7 +12,10 @@ def set_amp_context(mixed_precision): if mixed_precision: cm = torch.cuda.amp.autocast() else: - cm = nullcontext() + if os.python.version<=3.6: + cm = contextlib.suppress() + else: + cm = nullcontext() return cm From c76a6170726c3314f9930160096b54b99d190cf8 Mon Sep 17 00:00:00 2001 From: erogol Date: Mon, 9 Nov 2020 13:18:35 +0100 Subject: [PATCH 72/98] linter updates --- TTS/bin/tune_wavegrad.py | 1 - TTS/speaker_encoder/model.py | 1 + TTS/vocoder/datasets/wavegrad_dataset.py | 4 ++-- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/TTS/bin/tune_wavegrad.py b/TTS/bin/tune_wavegrad.py index ef971dfa..375e1f1c 100644 --- a/TTS/bin/tune_wavegrad.py +++ b/TTS/bin/tune_wavegrad.py @@ -10,7 +10,6 @@ from TTS.utils.audio import AudioProcessor from TTS.utils.io import load_config from TTS.vocoder.datasets.preprocess import load_wav_data from TTS.vocoder.datasets.wavegrad_dataset import WaveGradDataset -from TTS.vocoder.models.wavegrad import Wavegrad from TTS.vocoder.utils.generic_utils import setup_generator parser = argparse.ArgumentParser() diff --git a/TTS/speaker_encoder/model.py b/TTS/speaker_encoder/model.py index df0527bc..322ee42f 100644 --- a/TTS/speaker_encoder/model.py +++ b/TTS/speaker_encoder/model.py @@ -61,6 +61,7 @@ class SpeakerEncoder(nn.Module): d = torch.nn.functional.normalize(d, p=2, dim=1) return d + @torch.no_grad() def inference(self, x): d = self.layers.forward(x) if self.use_lstm_with_projection: diff --git a/TTS/vocoder/datasets/wavegrad_dataset.py b/TTS/vocoder/datasets/wavegrad_dataset.py index 83244c89..30cf9cb3 100644 --- a/TTS/vocoder/datasets/wavegrad_dataset.py +++ b/TTS/vocoder/datasets/wavegrad_dataset.py @@ -111,8 +111,8 @@ class WaveGradDataset(Dataset): mel = torch.from_numpy(mel).float().squeeze(0) return (mel, audio) - - def collate_full_clips(self, batch): + @staticmethod + def collate_full_clips(batch): """This is used in tune_wavegrad.py. 
It pads sequences to the max length.""" max_mel_length = max([b[0].shape[1] for b in batch]) if len(batch) > 1 else batch[0][0].shape[1] From 116e2299b05486248fb2b7285b0e25cba10fd0be Mon Sep 17 00:00:00 2001 From: erogol Date: Mon, 9 Nov 2020 13:30:42 +0100 Subject: [PATCH 73/98] adding more tests and refactoring --- notebooks/PlotUmapLibriTTS.ipynb | 325 ++++++++++++++++++++++++ run_tests.sh | 2 +- tests/inputs/test_glow_tts.json | 134 ++++++++++ tests/inputs/test_vocoder_wavegrad.json | 113 ++++++++ tests/test_glow-tts_train.sh | 13 + tests/test_tts_train.sh | 13 + tests/test_vocoder_wavegrad_train.sh | 15 ++ 7 files changed, 614 insertions(+), 1 deletion(-) create mode 100644 notebooks/PlotUmapLibriTTS.ipynb create mode 100644 tests/inputs/test_glow_tts.json create mode 100644 tests/inputs/test_vocoder_wavegrad.json create mode 100755 tests/test_glow-tts_train.sh create mode 100755 tests/test_tts_train.sh create mode 100755 tests/test_vocoder_wavegrad_train.sh diff --git a/notebooks/PlotUmapLibriTTS.ipynb b/notebooks/PlotUmapLibriTTS.ipynb new file mode 100644 index 00000000..46e1ac67 --- /dev/null +++ b/notebooks/PlotUmapLibriTTS.ipynb @@ -0,0 +1,325 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Overview\n", + "\n", + "This notebook can be used with both a single or multi- speaker corpus and allows the interactive plotting of speaker embeddings linked to underlying audio (see instructions in the repo's speaker_embedding directory)\n", + "\n", + "Depending on the directory structure used for your corpus, you may need to adjust handling of **speaker_to_utter** and **locations**." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import glob\n", + "import random\n", + "import numpy as np\n", + "import torch\n", + "import umap\n", + "\n", + "from TTS.speaker_encoder.model import SpeakerEncoder\n", + "from TTS.tts.utils.audio import AudioProcessor\n", + "from TTS.tts.utils.generic_utils import load_config\n", + "\n", + "from bokeh.io import output_notebook, show\n", + "from bokeh.plotting import figure\n", + "from bokeh.models import HoverTool, ColumnDataSource, BoxZoomTool, ResetTool, OpenURL, TapTool\n", + "from bokeh.transform import factor_cmap, factor_mark\n", + "from bokeh.palettes import Category10" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For larger sets of speakers, you can use **Category20**, but you need to change it in the **pal** variable too\n", + "\n", + "List of Bokeh palettes here: http://docs.bokeh.org/en/1.4.0/docs/reference/palettes.html\n", + "\n", + "**NB:** if you have problems with other palettes, first see https://stackoverflow.com/questions/48333820/why-do-some-bokeh-palettes-raise-a-valueerror-when-used-in-factor-cmap" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "output_notebook()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You should also adjust all the path constants to point at the relevant locations for you locally" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "MODEL_RUN_PATH = \"/media/erogol/data_ssd/Models/libri_tts/speaker_encoder/libritts_360-half-October-31-2019_04+54PM-19d2f5f/\"\n", + "MODEL_PATH = MODEL_RUN_PATH + \"best_model.pth.tar\"\n", + "CONFIG_PATH = MODEL_RUN_PATH + \"config.json\"\n", + "\n", + "# My single speaker 
locations\n", + "#EMBED_PATH = \"/home/neil/main/Projects/TTS3/embeddings/neil14/\"\n", + "#AUDIO_PATH = \"/home/neil/data/Projects/NeilTTS/neil14/wavs/\"\n", + "\n", + "# My multi speaker locations\n", + "EMBED_PATH = \"/home/erogol/Data/Libri-TTS/train-clean-360-embed_128/\"\n", + "AUDIO_PATH = \"/home/erogol/Data/Libri-TTS/train-clean-360/\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!ls -1 $MODEL_RUN_PATH" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "CONFIG = load_config(CONFIG_PATH)\n", + "ap = AudioProcessor(**CONFIG['audio'])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Bring in the embeddings created by **compute_embeddings.py**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "embed_files = glob.glob(EMBED_PATH+\"/**/*.npy\", recursive=True)\n", + "print(f'Embeddings found: {len(embed_files)}')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Check that we did indeed find an embedding" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "embed_files[0]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Process the speakers\n", + "\n", + "Assumes count of **speaker_paths** corresponds to number of speakers (so a corpus in just one directory would be treated like a single speaker and the multiple directories of LibriTTS are treated as distinct speakers)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "speaker_paths = list(set([os.path.dirname(os.path.dirname(embed_file)) for embed_file in embed_files]))\n", + "speaker_to_utter = {}\n", + "for embed_file in embed_files:\n", + " speaker_path = os.path.dirname(os.path.dirname(embed_file))\n", + " try:\n", + " speaker_to_utter[speaker_path].append(embed_file)\n", + " except:\n", + " speaker_to_utter[speaker_path]=[embed_file]\n", + "print(f'Speaker count: {len(speaker_paths)}')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Set up the embeddings\n", + "\n", + "Adjust the number of speakers to select and the number of utterances from each speaker and they will be randomly sampled from the corpus" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "embeds = []\n", + "labels = []\n", + "locations = []\n", + "\n", + "# single speaker \n", + "#num_speakers = 1\n", + "#num_utters = 1000\n", + "\n", + "# multi speaker\n", + "num_speakers = 10\n", + "num_utters = 20\n", + "\n", + "\n", + "speaker_idxs = np.random.choice(range(len(speaker_paths)), num_speakers, replace=False )\n", + "\n", + "for speaker_num, speaker_idx in enumerate(speaker_idxs):\n", + " speaker_path = speaker_paths[speaker_idx]\n", + " speakers_utter = speaker_to_utter[speaker_path]\n", + " utter_idxs = np.random.randint(0, len(speakers_utter) , num_utters)\n", + " for utter_idx in utter_idxs:\n", + " embed_path = speaker_to_utter[speaker_path][utter_idx]\n", + " embed = np.load(embed_path)\n", + " embeds.append(embed)\n", + " labels.append(str(speaker_num))\n", + " locations.append(embed_path.replace(EMBED_PATH, '').replace('.npy','.wav'))\n", + "embeds = np.concatenate(embeds)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Load 
embeddings with UMAP" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model = umap.UMAP()\n", + "projection = model.fit_transform(embeds)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Interactively charting the data in Bokeh\n", + "\n", + "Set up various details for Bokeh to plot the data\n", + "\n", + "You can use the regular Bokeh [tools](http://docs.bokeh.org/en/1.4.0/docs/user_guide/tools.html?highlight=tools) to explore the data, with reset setting it back to normal\n", + "\n", + "Once you have started the local server (see cell below) you can then click on plotted points which will open a tab to play the audio for that point, enabling easy exploration of your corpus\n", + "\n", + "File location in the tooltip is given relative to **AUDIO_PATH**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "source_wav_stems = ColumnDataSource(\n", + " data=dict(\n", + " x = projection.T[0].tolist(),\n", + " y = projection.T[1].tolist(),\n", + " desc=locations,\n", + " label=labels\n", + " )\n", + " )\n", + "\n", + "hover = HoverTool(\n", + " tooltips=[\n", + " (\"file\", \"@desc\"),\n", + " (\"speaker\", \"@label\"),\n", + " ]\n", + " )\n", + "\n", + "# optionally consider adding these to the tooltips if you want additional detail\n", + "# for the coordinates: (\"(x,y)\", \"($x, $y)\"),\n", + "# for the index of the embedding / wav file: (\"index\", \"$index\"),\n", + "\n", + "factors = list(set(labels))\n", + "pal_size = max(len(factors), 3)\n", + "pal = Category10[pal_size]\n", + "\n", + "p = figure(plot_width=600, plot_height=400, tools=[hover,BoxZoomTool(), ResetTool(), TapTool()])\n", + "\n", + "\n", + "p.circle('x', 'y', source=source_wav_stems, color=factor_cmap('label', palette=pal, factors=factors),)\n", + "\n", + "url = \"http://localhost:8000/@desc\"\n", + "taptool = p.select(type=TapTool)\n", + "taptool.callback = OpenURL(url=url)\n", + "\n", + "show(p)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Local server to serve wav files from corpus\n", + "\n", + "This is required so that when you click on a data point the hyperlink associated with it will be served the file locally.\n", + "\n", + "There are other ways to serve this if you prefer and you can also run the commands manually on the command line\n", + "\n", + "The server will continue to run until stopped. 
To stop it simply interupt the kernel (ie square button or under Kernel menu)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%cd $AUDIO_PATH\n", + "%pwd\n", + "!python -m http.server" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.4" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/run_tests.sh b/run_tests.sh index 46f18f01..5cd89564 100755 --- a/run_tests.sh +++ b/run_tests.sh @@ -1,7 +1,7 @@ TF_CPP_MIN_LOG_LEVEL=3 # tests -# nosetests tests -x &&\ +nosetests tests -x &&\ # runtime tests ./tests/test_server_package.sh && \ diff --git a/tests/inputs/test_glow_tts.json b/tests/inputs/test_glow_tts.json new file mode 100644 index 00000000..c1bc33fd --- /dev/null +++ b/tests/inputs/test_glow_tts.json @@ -0,0 +1,134 @@ +{ + "model": "glow_tts", + "run_name": "glow-tts-gatedconv", + "run_description": "glow-tts model training with gated conv.", + + // AUDIO PARAMETERS + "audio":{ + "fft_size": 1024, // number of stft frequency levels. Size of the linear spectogram frame. + "win_length": 1024, // stft window length in ms. + "hop_length": 256, // stft window hop-lengh in ms. + "frame_length_ms": null, // stft window length in ms.If null, 'win_length' is used. + "frame_shift_ms": null, // stft window hop-lengh in ms. If null, 'hop_length' is used. + + // Audio processing parameters + "sample_rate": 22050, // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled. + "preemphasis": 0.0, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis. + "ref_level_db": 0, // reference level db, theoretically 20db is the sound of air. + + // Griffin-Lim + "power": 1.1, // value to sharpen wav signals after GL algorithm. + "griffin_lim_iters": 60,// #griffin-lim iterations. 30-60 is a good range. Larger the value, slower the generation. + + // Silence trimming + "do_trim_silence": true,// enable trimming of slience of audio as you load it. LJspeech (false), TWEB (false), Nancy (true) + "trim_db": 60, // threshold for timming silence. Set this according to your dataset. + + // MelSpectrogram parameters + "num_mels": 80, // size of the mel spec frame. + "mel_fmin": 50.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!! + "mel_fmax": 7600.0, // maximum freq level for mel-spec. Tune for dataset!! + "spec_gain": 1.0, // scaler value appplied after log transform of spectrogram. + + // Normalization parameters + "signal_norm": true, // normalize spec values. Mean-Var normalization if 'stats_path' is defined otherwise range normalization defined by the other params. + "min_level_db": -100, // lower bound for normalization + "symmetric_norm": true, // move normalization to range [-1, 1] + "max_norm": 1.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm] + "clip_norm": true, // clip normalized values into the range. + "stats_path": null // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. 
If it is defined, mean-std based notmalization is used and other normalization params are ignored + }, + + // VOCABULARY PARAMETERS + // if custom character set is not defined, + // default set in symbols.py is used + // "characters":{ + // "pad": "_", + // "eos": "~", + // "bos": "^", + // "characters": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!'(),-.:;? ", + // "punctuations":"!'(),-.:;? ", + // "phonemes":"iyɨʉɯuɪʏʊeøɘəɵɤoɛœɜɞʌɔæɐaɶɑɒᵻʘɓǀɗǃʄǂɠǁʛpbtdʈɖcɟkɡqɢʔɴŋɲɳnɱmʙrʀⱱɾɽɸβfvθðszʃʒʂʐçʝxɣχʁħʕhɦɬɮʋɹɻjɰlɭʎʟˈˌːˑʍwɥʜʢʡɕʑɺɧɚ˞ɫ" + // }, + + "add_blank": false, // if true add a new token after each token of the sentence. This increases the size of the input sequence, but has considerably improved the prosody of the GlowTTS model. + + // DISTRIBUTED TRAINING + "mixed_precision": false, + "distributed":{ + "backend": "nccl", + "url": "tcp:\/\/localhost:54323" + }, + + "reinit_layers": [], // give a list of layer names to restore from the given checkpoint. If not defined, it reloads all heuristically matching layers. + + // MODEL PARAMETERS + "use_mas": false, // use Monotonic Alignment Search if true. Otherwise use pre-computed attention alignments. + + // TRAINING + "batch_size": 2, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'. + "eval_batch_size":1, + "r": 1, // Number of decoder frames to predict per iteration. Set the initial values if gradual training is enabled. + "loss_masking": true, // enable / disable loss masking against the sequence padding. + + // VALIDATION + "run_eval": true, + "test_delay_epochs": 0, //Until attention is aligned, testing only wastes computation time. + "test_sentences_file": null, // set a file to load sentences to be used for testing. If it is null then we use default english sentences. + + // OPTIMIZER + "noam_schedule": true, // use noam warmup and lr schedule. + "grad_clip": 5.0, // upper limit for gradients for clipping. + "epochs": 1, // total number of epochs to train. + "lr": 1e-3, // Initial learning rate. If Noam decay is active, maximum learning rate. + "wd": 0.000001, // Weight decay weight. + "warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr" + "seq_len_norm": false, // Normalize eash sample loss with its length to alleviate imbalanced datasets. Use it if your dataset is small or has skewed distribution of sequence lengths. + + "encoder_type": "gatedconv", + + // TENSORBOARD and LOGGING + "print_step": 25, // Number of steps to log training on console. + "tb_plot_step": 100, // Number of steps to plot TB training figures. + "print_eval": false, // If True, it prints intermediate loss values in evalulation. + "save_step": 5000, // Number of training steps expected to save traninpg stats and checkpoints. + "checkpoint": true, // If true, it saves checkpoints per "save_step" + "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging. + "apex_amp_level": null, + + // DATA LOADING + "text_cleaner": "phoneme_cleaners", + "enable_eos_bos_chars": false, // enable/disable beginning of sentence and end of sentence chars. + "num_loader_workers": 4, // number of training data loader processes. Don't set it too big. 4-8 are good values. + "num_val_loader_workers": 4, // number of evaluation data loader processes. + "batch_group_size": 0, //Number of batches to shuffle after bucketing. 
+ "min_seq_len": 3, // DATASET-RELATED: minimum text length to use in training + "max_seq_len": 500, // DATASET-RELATED: maximum text length + "compute_f0": false, // compute f0 values in data-loader + + // PATHS + "output_path": "tests/train_outputs/", + + // PHONEMES + "phoneme_cache_path": "tests/outputs/phoneme_cache/", // phoneme computation is slow, therefore, it caches results in the given folder. + "use_phonemes": true, // use phonemes instead of raw characters. It is suggested for better pronounciation. + "phoneme_language": "en-us", // depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages + + // MULTI-SPEAKER and GST + "use_external_speaker_embedding_file": false, + "external_speaker_embedding_file": null, + "use_speaker_embedding": false, // use speaker embedding to enable multi-speaker learning. + + // DATASETS + "datasets": // List of datasets. They all merged and they get different speaker_ids. + [ + { + "name": "ljspeech", + "path": "tests/data/ljspeech/", + "meta_file_train": "metadata.csv", + "meta_file_val": "metadata.csv" + } + ] +} + + diff --git a/tests/inputs/test_vocoder_wavegrad.json b/tests/inputs/test_vocoder_wavegrad.json new file mode 100644 index 00000000..f7da5980 --- /dev/null +++ b/tests/inputs/test_vocoder_wavegrad.json @@ -0,0 +1,113 @@ +{ + "run_name": "wavegrad-ljspeech", + "run_description": "wavegrad ljspeech", + + "audio":{ + "fft_size": 1024, // number of stft frequency levels. Size of the linear spectogram frame. + "win_length": 1024, // stft window length in ms. + "hop_length": 256, // stft window hop-lengh in ms. + "frame_length_ms": null, // stft window length in ms.If null, 'win_length' is used. + "frame_shift_ms": null, // stft window hop-lengh in ms. If null, 'hop_length' is used. + + // Audio processing parameters + "sample_rate": 22050, // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled. + "preemphasis": 0.0, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis. + "ref_level_db": 0, // reference level db, theoretically 20db is the sound of air. + + // Silence trimming + "do_trim_silence": true,// enable trimming of slience of audio as you load it. LJspeech (false), TWEB (false), Nancy (true) + "trim_db": 60, // threshold for timming silence. Set this according to your dataset. + + // MelSpectrogram parameters + "num_mels": 80, // size of the mel spec frame. + "mel_fmin": 50.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!! + "mel_fmax": 7600.0, // maximum freq level for mel-spec. Tune for dataset!! + "spec_gain": 1.0, // scaler value appplied after log transform of spectrogram. + + // Normalization parameters + "signal_norm": true, // normalize spec values. Mean-Var normalization if 'stats_path' is defined otherwise range normalization defined by the other params. + "min_level_db": -100, // lower bound for normalization + "symmetric_norm": true, // move normalization to range [-1, 1] + "max_norm": 4.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm] + "clip_norm": true, // clip normalized values into the range. + "stats_path": null // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. 
If it is defined, mean-std based notmalization is used and other normalization params are ignored + }, + + // DISTRIBUTED TRAINING + "mixed_precision": false, + "distributed":{ + "backend": "nccl", + "url": "tcp:\/\/localhost:54322" + }, + + "target_loss": "avg_wavegrad_loss", // loss value to pick the best model to save after each epoch + + // MODEL PARAMETERS + "generator_model": "wavegrad", + "model_params":{ + "y_conv_channels":32, + "x_conv_channels":768, + "ublock_out_channels": [512, 512, 256, 128, 128], + "dblock_out_channels": [128, 128, 256, 512], + "upsample_factors": [4, 4, 4, 2, 2], + "upsample_dilations": [ + [1, 2, 1, 2], + [1, 2, 1, 2], + [1, 2, 4, 8], + [1, 2, 4, 8], + [1, 2, 4, 8]] + }, + + // DATASET + "data_path": "tests/data/ljspeech/wavs/", // root data path. It finds all wav files recursively from there. + "feature_path": null, // if you use precomputed features + "seq_len": 6144, // 24 * hop_length + "pad_short": 0, // additional padding for short wavs + "conv_pad": 0, // additional padding against convolutions applied to spectrograms + "use_noise_augment": false, // add noise to the audio signal for augmentation + "use_cache": true, // use in memory cache to keep the computed features. This might cause OOM. + + "reinit_layers": [], // give a list of layer names to restore from the given checkpoint. If not defined, it reloads all heuristically matching layers. + + // TRAINING + "batch_size": 1, // Batch size for training. + "train_noise_schedule":{ + "min_val": 1e-6, + "max_val": 1e-2, + "num_steps": 1000 + }, + "test_noise_schedule":{ + "min_val": 1e-6, + "max_val": 1e-2, + "num_steps": 2 + }, + + // VALIDATION + "run_eval": true, // enable/disable evaluation run + + // OPTIMIZER + "epochs": 1, // total number of epochs to train. + "clip_grad": 1.0, // Generator gradient clipping threshold. Apply gradient clipping if > 0 + "lr_scheduler": "MultiStepLR", // one of the schedulers from https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate + "lr_scheduler_params": { + "gamma": 0.5, + "milestones": [100000, 200000, 300000, 400000, 500000, 600000] + }, + "lr": 1e-4, // Initial learning rate. If Noam decay is active, maximum learning rate. + + // TENSORBOARD and LOGGING + "print_step": 250, // Number of steps to log traning on console. + "print_eval": false, // If True, it prints loss values for each step in eval run. + "save_step": 10000, // Number of training steps expected to plot training stats on TB and save model checkpoints. + "checkpoint": true, // If true, it saves checkpoints per "save_step" + "tb_model_param_stats": true, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging. + + // DATA LOADING + "num_loader_workers": 4, // number of training data loader processes. Don't set it too big. 4-8 are good values. + "num_val_loader_workers": 4, // number of evaluation data loader processes. 
+ "eval_split_size": 4, + + // PATHS + "output_path": "tests/train_outputs/" +} + diff --git a/tests/test_glow-tts_train.sh b/tests/test_glow-tts_train.sh new file mode 100755 index 00000000..c8dd3e22 --- /dev/null +++ b/tests/test_glow-tts_train.sh @@ -0,0 +1,13 @@ +#!/usr/bin/env bash + +BASEDIR=$(dirname "$0") +echo "$BASEDIR" +# run training +CUDA_VISIBLE_DEVICES="" python TTS/bin/train_glow_tts.py --config_path $BASEDIR/inputs/test_glow_tts.json +# find the training folder +LATEST_FOLDER=$(ls $BASEDIR/train_outputs/| sort | tail -1) +echo $LATEST_FOLDER +# continue the previous training +CUDA_VISIBLE_DEVICES="" python TTS/bin/train_glow_tts.py --continue_path $BASEDIR/train_outputs/$LATEST_FOLDER +# remove all the outputs +rm -rf $BASEDIR/train_outputs/ diff --git a/tests/test_tts_train.sh b/tests/test_tts_train.sh new file mode 100755 index 00000000..ed0871eb --- /dev/null +++ b/tests/test_tts_train.sh @@ -0,0 +1,13 @@ +#!/usr/bin/env bash + +BASEDIR=$(dirname "$0") +echo "$BASEDIR" +# run training +CUDA_VISIBLE_DEVICES="" python TTS/bin/train_tacotron.py --config_path $BASEDIR/inputs/test_train_config.json +# find the training folder +LATEST_FOLDER=$(ls $BASEDIR/train_outputs/| sort | tail -1) +echo $LATEST_FOLDER +# continue the previous training +CUDA_VISIBLE_DEVICES="" python TTS/bin/train_tacotron.py --continue_path $BASEDIR/train_outputs/$LATEST_FOLDER +# remove all the outputs +rm -rf $BASEDIR/train_outputs/ diff --git a/tests/test_vocoder_wavegrad_train.sh b/tests/test_vocoder_wavegrad_train.sh new file mode 100755 index 00000000..b5e6e451 --- /dev/null +++ b/tests/test_vocoder_wavegrad_train.sh @@ -0,0 +1,15 @@ +#!/usr/bin/env bash + +BASEDIR=$(dirname "$0") +echo "$BASEDIR" +# create run dir +mkdir $BASEDIR/train_outputs +# run training +CUDA_VISIBLE_DEVICES="" python TTS/bin/train_vocoder_wavegrad.py --config_path $BASEDIR/inputs/test_vocoder_wavegrad.json +# find the training folder +LATEST_FOLDER=$(ls $BASEDIR/train_outputs/| sort | tail -1) +echo $LATEST_FOLDER +# continue the previous training +CUDA_VISIBLE_DEVICES="" python TTS/bin/train_vocoder_wavegrad.py --continue_path $BASEDIR/train_outputs/$LATEST_FOLDER +# remove all the outputs +rm -rf $BASEDIR/train_outputs/$LATEST_FOLDER \ No newline at end of file From 0605411c2ec17adb4a231c8925fedd4900eb2312 Mon Sep 17 00:00:00 2001 From: erogol Date: Mon, 9 Nov 2020 17:57:33 +0100 Subject: [PATCH 74/98] update readme add latest model updates --- README.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 472b504b..e3c24d3b 100644 --- a/README.md +++ b/README.md @@ -47,6 +47,7 @@ Speaker Encoder: Vocoders: - MelGAN: [paper](https://arxiv.org/abs/1710.10467) - MultiBandMelGAN: [paper](https://arxiv.org/abs/2005.05106) +- ParallelWaveGAN: [paper](https://arxiv.org/abs/1910.11480) - GAN-TTS discriminators: [paper](https://arxiv.org/abs/1909.11646) - WaveRNN: [origin][https://github.com/fatchord/WaveRNN/] - WaveGrad: [paper][https://arxiv.org/abs/2009.00713] @@ -203,7 +204,7 @@ If you like to use TTS to try a new idea and like to share your experiments with - [x] Train TTS with r=1 successfully. - [x] Enable process based distributed training. Similar to (https://github.com/fastai/imagenet-fast/). - [x] Adapting Neural Vocoder. TTS works with WaveRNN and ParallelWaveGAN (https://github.com/erogol/WaveRNN and https://github.com/erogol/ParallelWaveGAN) -- [ ] Multi-speaker embedding. +- [x] Multi-speaker embedding. - [x] Model optimization (model export, model pruning etc.) 
1\u001b[0;31m \u001b[0mspec\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mAP\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mspectrogram\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mwav\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Max:\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mspec\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmax\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Min:\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mspec\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Mean:\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mspec\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmean\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0mplot_spectrogram\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mspec\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mT\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mAP\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m;\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/Projects/TTS/tts/utils/audio.py\u001b[0m in \u001b[0;36mspectrogram\u001b[0;34m(self, y)\u001b[0m\n\u001b[1;32m 218\u001b[0m \u001b[0mD\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_stft\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0my\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 219\u001b[0m \u001b[0mS\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_amp_to_db\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mabs\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mD\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 220\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_normalize\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mS\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 221\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 222\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mmelspectrogram\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/Projects/TTS/tts/utils/audio.py\u001b[0m in \u001b[0;36m_normalize\u001b[0;34m(self, S)\u001b[0m\n\u001b[1;32m 117\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlinear_scaler\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtransform\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mS\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mT\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mT\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 118\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 119\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mRuntimeError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m' [!] 
Mean-Var stats does not match the given feature dimensions.'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 120\u001b[0m \u001b[0;31m# range normalization\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 121\u001b[0m \u001b[0mS\u001b[0m \u001b[0;34m-=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mref_level_db\u001b[0m \u001b[0;31m# discard certain range of DB assuming it is air noise\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mRuntimeError\u001b[0m: [!] Mean-Var stats does not match the given feature dimensions." + ] + } + ], + "source": [ + "spec = AP.spectrogram(wav)\n", + "print(\"Max:\", spec.max())\n", + "print(\"Min:\", spec.min())\n", + "print(\"Mean:\", spec.mean())\n", + "plot_spectrogram(spec.T, AP);\n", + "\n", + "wav_gen = AP.inv_spectrogram(spec)\n", + "ipd.Audio(wav_gen, rate=AP.sample_rate)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "Collapsed": "false" + }, + "source": [ + "### Compare values for a certain parameter\n", + "\n", + "Optimize your parameters by comparing different values per parameter at a time." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "Collapsed": "false" + }, + "outputs": [], + "source": [ + "audio={\n", + " 'audio_processor': 'audio',\n", + " 'num_mels': 80, # In general, you don'tneed to change it \n", + " 'num_freq': 1025, # In general, you don'tneed to change it \n", + " 'sample_rate': 22050, # It depends to the sample rate of the dataset.\n", + " 'frame_length_ms': 50, # In general, you don'tneed to change it \n", + " 'frame_shift_ms': 12.5, # In general, you don'tneed to change it \n", + " 'preemphasis': 0.98, # In general, 0 gives better voice recovery but makes traning harder. If your model does not train, try 0.97 - 0.99.\n", + " 'min_level_db': -100,\n", + " 'ref_level_db': 20, # It is the base DB, higher until you remove the background noise in the spectrogram and then lower until you hear a better speech below.\n", + " 'power': 1.5, # Change this value and listen the synthesized voice. 1.2 - 1.5 are some resonable values.\n", + " 'griffin_lim_iters': 60, # It does not give any imporvement for values > 60\n", + " 'signal_norm': True, # This is more about your model. It does not give any change for the synthsis performance.\n", + " 'symmetric_norm': False, # Same as above\n", + " 'max_norm': 1, # Same as above\n", + " 'clip_norm': True, # Same as above\n", + " 'mel_fmin': 0.0, # You can play with this and check mel-spectrogram based voice synthesis below.\n", + " 'mel_fmax': 8000.0, # You can play with this and check mel-spectrogram based voice synthesis below.\n", + " 'do_trim_silence': True} # If you dataset has some silience at the beginning or end, this trims it. 
Check the AP.load_wav() below,if it causes any difference for the loaded audio file.\n", + "\n", + "AP = AudioProcessor(**audio);" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "Collapsed": "false" + }, + "outputs": [], + "source": [ + "from librosa import display\n", + "from matplotlib import pylab as plt\n", + "import IPython\n", + "plt.rcParams['figure.figsize'] = (20.0, 16.0)\n", + "\n", + "def compare_values(attribute, values, file):\n", + " \"\"\"\n", + " attributes (str): the names of the attribute you like to test.\n", + " values (list): list of values to compare.\n", + " file (str): file name to perform the tests.\n", + " \"\"\"\n", + " wavs = []\n", + " for idx, val in enumerate(values):\n", + " set_val_cmd = \"AP.{}={}\".format(attribute, val)\n", + " exec(set_val_cmd)\n", + " wav = AP.load_wav(file)\n", + " spec = AP.spectrogram(wav)\n", + " spec_norm = AP._denormalize(spec.T)\n", + " plt.subplot(len(values), 2, 2*idx + 1)\n", + " plt.imshow(spec_norm.T, aspect=\"auto\", origin=\"lower\")\n", + " # plt.colorbar()\n", + " plt.tight_layout()\n", + " wav_gen = AP.inv_spectrogram(spec)\n", + " wavs.append(wav_gen)\n", + " plt.subplot(len(values), 2, 2*idx + 2)\n", + " display.waveplot(wav, alpha=0.5)\n", + " display.waveplot(wav_gen, alpha=0.25)\n", + " plt.title(\"{}={}\".format(attribute, val))\n", + " plt.tight_layout()\n", + " \n", + " wav = AP.load_wav(file)\n", + " print(\" > Ground-truth\")\n", + " IPython.display.display(IPython.display.Audio(wav, rate=AP.sample_rate))\n", + " \n", + " for idx, wav_gen in enumerate(wavs):\n", + " val = values[idx]\n", + " print(\" > {} = {}\".format(attribute, val))\n", + " IPython.display.display(IPython.display.Audio(wav_gen, rate=AP.sample_rate))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "Collapsed": "false" + }, + "outputs": [], + "source": [ + "compare_values(\"preemphasis\", [0, 0.5, 0.97, 0.98, 0.99], file_paths[10])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "Collapsed": "false" + }, + "outputs": [], + "source": [ + "compare_values(\"ref_level_db\", [10, 15, 20, 25, 30, 35, 40], file_paths[10])" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.7" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} From aa2b31a1b0f327e3431ed5eabf5c857356d5c1e1 Mon Sep 17 00:00:00 2001 From: erogol Date: Tue, 17 Nov 2020 14:22:01 +0100 Subject: [PATCH 90/98] use 'enabled' argument to control autocast --- TTS/bin/train_glow_tts.py | 2 +- TTS/bin/train_tacotron.py | 2 +- TTS/bin/train_vocoder_wavegrad.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/TTS/bin/train_glow_tts.py b/TTS/bin/train_glow_tts.py index 9358deb2..261267df 100644 --- a/TTS/bin/train_glow_tts.py +++ b/TTS/bin/train_glow_tts.py @@ -182,7 +182,7 @@ def train(model, criterion, optimizer, scheduler, optimizer.zero_grad() # forward pass model - with set_amp_context(c.mixed_precision): + with torch.cuda.amp.autocast(enabled=c.mixed_precision): z, logdet, y_mean, y_log_scale, alignments, o_dur_log, o_total_dur = model.forward( text_input, text_lengths, mel_input, mel_lengths, attn_mask, g=speaker_ids) diff --git a/TTS/bin/train_tacotron.py 
b/TTS/bin/train_tacotron.py index 09d40285..d999ddbe 100644 --- a/TTS/bin/train_tacotron.py +++ b/TTS/bin/train_tacotron.py @@ -158,7 +158,7 @@ def train(model, criterion, optimizer, optimizer_st, scheduler, if optimizer_st: optimizer_st.zero_grad() - with set_amp_context(c.mixed_precision): + with torch.cuda.amp.autocast(enabled=c.mixed_precision): # forward pass model if c.bidirectional_decoder or c.double_decoder_consistency: decoder_output, postnet_output, alignments, stop_tokens, decoder_backward_output, alignments_backward = model( diff --git a/TTS/bin/train_vocoder_wavegrad.py b/TTS/bin/train_vocoder_wavegrad.py index 261be3fa..97eb0435 100644 --- a/TTS/bin/train_vocoder_wavegrad.py +++ b/TTS/bin/train_vocoder_wavegrad.py @@ -109,7 +109,7 @@ def train(model, criterion, optimizer, global_step += 1 - with set_amp_context(c.mixed_precision): + with torch.cuda.amp.autocast(enabled=c.mixed_precision): # compute noisy input if hasattr(model, 'module'): noise, x_noisy, noise_scale = model.module.compute_y_n(x) From 8a820930c6fd37005722527d8d415ffaf5d2b68e Mon Sep 17 00:00:00 2001 From: erogol Date: Wed, 25 Nov 2020 14:46:08 +0100 Subject: [PATCH 91/98] compute_embedding update --- TTS/bin/compute_embeddings.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/TTS/bin/compute_embeddings.py b/TTS/bin/compute_embeddings.py index 689f1c58..64edd140 100644 --- a/TTS/bin/compute_embeddings.py +++ b/TTS/bin/compute_embeddings.py @@ -9,7 +9,7 @@ import torch from TTS.speaker_encoder.model import SpeakerEncoder from TTS.utils.audio import AudioProcessor from TTS.utils.io import load_config -from TTS.utils.io import save_speaker_mapping +from TTS.tts.utils.speakers import save_speaker_mapping from TTS.tts.datasets.preprocess import load_meta_data parser = argparse.ArgumentParser( @@ -108,21 +108,23 @@ for idx, wav_file in enumerate(tqdm(wav_files)): if isinstance(wav_file, list): speaker_name = wav_file[2] wav_file = wav_file[1] + mel_spec = ap.melspectrogram(ap.load_wav(wav_file, sr=ap.sample_rate)).T mel_spec = torch.FloatTensor(mel_spec[None, :, :]) if args.use_cuda: mel_spec = mel_spec.cuda() embedd = model.compute_embedding(mel_spec) - np.save(output_files[idx], embedd.detach().cpu().numpy()) + embedd = embedd.detach().cpu().numpy() + np.save(output_files[idx], embedd) if args.target_dataset != '': # create speaker_mapping if target dataset is defined wav_file_name = os.path.basename(wav_file) speaker_mapping[wav_file_name] = {} speaker_mapping[wav_file_name]['name'] = speaker_name - speaker_mapping[wav_file_name]['embedding'] = embedd.detach().cpu().numpy() + speaker_mapping[wav_file_name]['embedding'] = embedd.flatten().tolist() if args.target_dataset != '': # save speaker_mapping if target dataset is defined mapping_file_path = os.path.join(args.output_path, 'speakers.json') - save_speaker_mapping(mapping_file_path, speaker_mapping) + save_speaker_mapping(args.output_path, speaker_mapping) From 1229554c427567f1b09318ddd3732ad463efebbe Mon Sep 17 00:00:00 2001 From: erogol Date: Wed, 25 Nov 2020 14:48:54 +0100 Subject: [PATCH 92/98] use native amp --- TTS/bin/train_glow_tts.py | 3 +-- TTS/bin/train_tacotron.py | 3 +-- TTS/bin/train_vocoder_wavegrad.py | 3 +-- TTS/tts/configs/config.json | 2 +- TTS/utils/generic_utils.py | 12 ------------ 5 files changed, 4 insertions(+), 19 deletions(-) diff --git a/TTS/bin/train_glow_tts.py b/TTS/bin/train_glow_tts.py index 261267df..fcf6c4cd 100644 --- a/TTS/bin/train_glow_tts.py +++ b/TTS/bin/train_glow_tts.py @@ -26,8 +26,7 @@ 
from TTS.utils.audio import AudioProcessor from TTS.utils.console_logger import ConsoleLogger from TTS.utils.generic_utils import (KeepAverage, count_parameters, create_experiment_folder, get_git_branch, - remove_experiment_folder, set_init_dict, - set_amp_context) + remove_experiment_folder, set_init_dict) from TTS.utils.io import copy_config_file, load_config from TTS.utils.radam import RAdam from TTS.utils.tensorboard_logger import TensorboardLogger diff --git a/TTS/bin/train_tacotron.py b/TTS/bin/train_tacotron.py index d999ddbe..6c12e54b 100644 --- a/TTS/bin/train_tacotron.py +++ b/TTS/bin/train_tacotron.py @@ -28,8 +28,7 @@ from TTS.utils.distribute import (DistributedSampler, apply_gradient_allreduce, init_distributed, reduce_tensor) from TTS.utils.generic_utils import (KeepAverage, count_parameters, create_experiment_folder, get_git_branch, - remove_experiment_folder, set_init_dict, - set_amp_context) + remove_experiment_folder, set_init_dict) from TTS.utils.io import copy_config_file, load_config from TTS.utils.radam import RAdam from TTS.utils.tensorboard_logger import TensorboardLogger diff --git a/TTS/bin/train_vocoder_wavegrad.py b/TTS/bin/train_vocoder_wavegrad.py index 97eb0435..9730d1c6 100644 --- a/TTS/bin/train_vocoder_wavegrad.py +++ b/TTS/bin/train_vocoder_wavegrad.py @@ -17,8 +17,7 @@ from TTS.utils.console_logger import ConsoleLogger from TTS.utils.distribute import init_distributed from TTS.utils.generic_utils import (KeepAverage, count_parameters, create_experiment_folder, get_git_branch, - remove_experiment_folder, set_init_dict, - set_amp_context) + remove_experiment_folder, set_init_dict) from TTS.utils.io import copy_config_file, load_config from TTS.utils.tensorboard_logger import TensorboardLogger from TTS.utils.training import setup_torch_training_env diff --git a/TTS/tts/configs/config.json b/TTS/tts/configs/config.json index 2cad69c3..7274fd9d 100644 --- a/TTS/tts/configs/config.json +++ b/TTS/tts/configs/config.json @@ -65,7 +65,7 @@ "eval_batch_size":16, "r": 7, // Number of decoder frames to predict per iteration. Set the initial values if gradual training is enabled. "gradual_training": [[0, 7, 64], [1, 5, 64], [50000, 3, 32], [130000, 2, 32], [290000, 1, 32]], //set gradual training steps [first_step, r, batch_size]. If it is null, gradual training is disabled. For Tacotron, you might need to reduce the 'batch_size' as you proceeed. - "apex_amp_level": null, // level of optimization with NVIDIA's apex feature for automatic mixed FP16/FP32 precision (AMP), NOTE: currently only O1 is supported, and use "O1" to activate. + "mixed_precision": true, // level of optimization with NVIDIA's apex feature for automatic mixed FP16/FP32 precision (AMP), NOTE: currently only O1 is supported, and use "O1" to activate. // LOSS SETTINGS "loss_masking": true, // enable / disable loss masking against the sequence padding. 
diff --git a/TTS/utils/generic_utils.py b/TTS/utils/generic_utils.py index bd99ecc2..20e11f41 100644 --- a/TTS/utils/generic_utils.py +++ b/TTS/utils/generic_utils.py @@ -4,22 +4,10 @@ import os import shutil import subprocess import contextlib -import platform import torch -def set_amp_context(mixed_precision): - if mixed_precision: - cm = torch.cuda.amp.autocast() - else: - # if platform.python_version() <= "3.6.0": - cm = contextlib.suppress() - # else: - # cm = contextlib.nullcontext() - return cm - - def get_git_branch(): try: out = subprocess.check_output(["git", "branch"]).decode("utf8") From d8c1b5b73d51504a60bb32953e22794756ac8c57 Mon Sep 17 00:00:00 2001 From: erogol Date: Wed, 25 Nov 2020 14:49:07 +0100 Subject: [PATCH 93/98] print max lengths in tacotron training --- TTS/bin/train_tacotron.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/TTS/bin/train_tacotron.py b/TTS/bin/train_tacotron.py index 6c12e54b..1263a616 100644 --- a/TTS/bin/train_tacotron.py +++ b/TTS/bin/train_tacotron.py @@ -86,8 +86,8 @@ def format_data(data, speaker_mapping=None): mel_input = data[4] mel_lengths = data[5] stop_targets = data[6] - avg_text_length = torch.mean(text_lengths.float()) - avg_spec_length = torch.mean(mel_lengths.float()) + max_text_length = torch.max(text_lengths.float()) + max_spec_length = torch.max(mel_lengths.float()) if c.use_speaker_embedding: if c.use_external_speaker_embedding_file: @@ -123,7 +123,7 @@ def format_data(data, speaker_mapping=None): if speaker_embeddings is not None: speaker_embeddings = speaker_embeddings.cuda(non_blocking=True) - return text_input, text_lengths, mel_input, mel_lengths, linear_input, stop_targets, speaker_ids, speaker_embeddings, avg_text_length, avg_spec_length + return text_input, text_lengths, mel_input, mel_lengths, linear_input, stop_targets, speaker_ids, speaker_embeddings, max_text_length, max_spec_length def train(model, criterion, optimizer, optimizer_st, scheduler, @@ -144,7 +144,7 @@ def train(model, criterion, optimizer, optimizer_st, scheduler, start_time = time.time() # format data - text_input, text_lengths, mel_input, mel_lengths, linear_input, stop_targets, speaker_ids, speaker_embeddings, avg_text_length, avg_spec_length = format_data(data, speaker_mapping) + text_input, text_lengths, mel_input, mel_lengths, linear_input, stop_targets, speaker_ids, speaker_embeddings, max_text_length, max_spec_length = format_data(data, speaker_mapping) loader_time = time.time() - end_time global_step += 1 @@ -255,8 +255,8 @@ def train(model, criterion, optimizer, optimizer_st, scheduler, # print training progress if global_step % c.print_step == 0: log_dict = { - "avg_spec_length": [avg_spec_length, 1], # value, precision - "avg_text_length": [avg_text_length, 1], + "max_spec_length": [max_spec_length, 1], # value, precision + "max_text_length": [max_text_length, 1], "step_time": [step_time, 4], "loader_time": [loader_time, 2], "current_lr": current_lr, From 4b92ac0f927ab696080492347268a72f35a654c4 Mon Sep 17 00:00:00 2001 From: erogol Date: Wed, 25 Nov 2020 14:49:48 +0100 Subject: [PATCH 94/98] tune_wavegrad update --- TTS/bin/tune_wavegrad.py | 7 ++++--- TTS/vocoder/models/wavegrad.py | 1 + 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/TTS/bin/tune_wavegrad.py b/TTS/bin/tune_wavegrad.py index fde521c5..7461282d 100644 --- a/TTS/bin/tune_wavegrad.py +++ b/TTS/bin/tune_wavegrad.py @@ -34,7 +34,7 @@ _, train_data = load_wav_data(args.data_path, 0) train_data = train_data[:args.num_samples] dataset 
= WaveGradDataset(ap=ap, items=train_data, - seq_len=ap.hop_length * 100, + seq_len=-1, hop_len=ap.hop_length, pad_short=config.pad_short, conv_pad=config.conv_pad, @@ -58,8 +58,9 @@ if args.use_cuda: model.cuda() # setup optimization parameters -base_values = sorted(np.random.uniform(high=10, size=args.search_depth)) -exponents = 10 ** np.linspace(-6, -2, num=args.num_iter) +base_values = sorted(10 * np.random.uniform(size=args.search_depth)) +print(base_values) +exponents = 10 ** np.linspace(-6, -1, num=args.num_iter) best_error = float('inf') best_schedule = None total_search_iter = len(base_values)**args.num_iter diff --git a/TTS/vocoder/models/wavegrad.py b/TTS/vocoder/models/wavegrad.py index f9bcdb85..18562d10 100644 --- a/TTS/vocoder/models/wavegrad.py +++ b/TTS/vocoder/models/wavegrad.py @@ -119,6 +119,7 @@ class Wavegrad(nn.Module): alpha = 1 - beta alpha_hat = np.cumprod(alpha) noise_level = np.concatenate([[1.0], alpha_hat ** 0.5], axis=0) + noise_level = alpha_hat ** 0.5 # pylint: disable=not-callable self.beta = torch.tensor(beta.astype(np.float32)) From 7541d2ecaaa7d159c23cd05b9097c8b2bb7c2769 Mon Sep 17 00:00:00 2001 From: erogol Date: Wed, 25 Nov 2020 14:50:09 +0100 Subject: [PATCH 95/98] return eval split optional --- TTS/tts/datasets/preprocess.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/TTS/tts/datasets/preprocess.py b/TTS/tts/datasets/preprocess.py index 31d4b2b5..01a56adb 100644 --- a/TTS/tts/datasets/preprocess.py +++ b/TTS/tts/datasets/preprocess.py @@ -20,13 +20,13 @@ def load_meta_data(datasets, eval_split=True): preprocessor = get_preprocessor_by_name(name) meta_data_train = preprocessor(root_path, meta_file_train) print(f" | > Found {len(meta_data_train)} files in {Path(root_path).resolve()}") - if meta_file_val is None: - meta_data_eval, meta_data_train = split_dataset(meta_data_train) - else: - meta_data_eval = preprocessor(root_path, meta_file_val) - meta_data_train_all += meta_data_train - if meta_data_eval_all is not None: + if eval_split: + if meta_file_val is None: + meta_data_eval, meta_data_train = split_dataset(meta_data_train) + else: + meta_data_eval = preprocessor(root_path, meta_file_val) meta_data_eval_all += meta_data_eval + meta_data_train_all += meta_data_train return meta_data_train_all, meta_data_eval_all From a1e4ee18f9a47664cbc80e29021d5ed3cbac8a8b Mon Sep 17 00:00:00 2001 From: erogol Date: Wed, 25 Nov 2020 14:50:28 +0100 Subject: [PATCH 96/98] convert float16 to float32 for plotting spectrograms --- TTS/tts/utils/visual.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/TTS/tts/utils/visual.py b/TTS/tts/utils/visual.py index 033a5191..6eb3abdf 100644 --- a/TTS/tts/utils/visual.py +++ b/TTS/tts/utils/visual.py @@ -1,6 +1,8 @@ -import torch import librosa import matplotlib +import numpy as np +import torch + matplotlib.use('Agg') import matplotlib.pyplot as plt from TTS.tts.utils.text import phoneme_to_sequence, sequence_to_phoneme @@ -43,6 +45,8 @@ def plot_spectrogram(spectrogram, spectrogram_ = spectrogram.detach().cpu().numpy().squeeze().T else: spectrogram_ = spectrogram.T + spectrogram_ = spectrogram_.astype( + np.float32) if spectrogram_.dtype == np.float16 else spectrogram_ if ap is not None: spectrogram_ = ap._denormalize(spectrogram_) # pylint: disable=protected-access fig = plt.figure(figsize=fig_size) From e3eda159d160796efcb4eb21ab2d77cd0e62c707 Mon Sep 17 00:00:00 2001 From: erogol Date: Wed, 25 Nov 2020 14:50:50 +0100 Subject: [PATCH 97/98] wavegrad_dataset update 
--- TTS/vocoder/datasets/wavegrad_dataset.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/TTS/vocoder/datasets/wavegrad_dataset.py b/TTS/vocoder/datasets/wavegrad_dataset.py index 30cf9cb3..22f2af32 100644 --- a/TTS/vocoder/datasets/wavegrad_dataset.py +++ b/TTS/vocoder/datasets/wavegrad_dataset.py @@ -28,7 +28,7 @@ class WaveGradDataset(Dataset): self.ap = ap self.item_list = items - self.seq_len = seq_len + self.seq_len = seq_len if return_segments else None self.hop_len = hop_len self.pad_short = pad_short self.conv_pad = conv_pad @@ -38,7 +38,8 @@ class WaveGradDataset(Dataset): self.use_noise_augment = use_noise_augment self.verbose = verbose - assert seq_len % hop_len == 0, " [!] seq_len has to be a multiple of hop_len." + if return_segments: + assert seq_len % hop_len == 0, " [!] seq_len has to be a multiple of hop_len." self.feat_frame_len = seq_len // hop_len + (2 * conv_pad) # cache acoustic features From e3b7157146eccf92da7519d434d02b74cec90067 Mon Sep 17 00:00:00 2001 From: erogol Date: Wed, 25 Nov 2020 15:22:01 +0100 Subject: [PATCH 98/98] remove contextlib --- TTS/utils/generic_utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/TTS/utils/generic_utils.py b/TTS/utils/generic_utils.py index 20e11f41..7d7911b0 100644 --- a/TTS/utils/generic_utils.py +++ b/TTS/utils/generic_utils.py @@ -3,7 +3,6 @@ import glob import os import shutil import subprocess -import contextlib import torch
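
Note on the native AMP migration in the patches above: the training scripts now wrap the forward pass in torch.cuda.amp.autocast(enabled=c.mixed_precision), the old set_amp_context helper and its contextlib import are removed, and the config flag is renamed to "mixed_precision". The diffs only show the context-manager swap, so the sketch below is a minimal, self-contained illustration of the usual native AMP training step (autocast plus GradScaler). It is not code from this repository; model, criterion, optimizer, batch and target are placeholders.

    # Minimal sketch of the native torch.cuda.amp pattern these commits move towards.
    import torch

    mixed_precision = True  # corresponds to the new "mixed_precision" config flag
    scaler = torch.cuda.amp.GradScaler(enabled=mixed_precision)

    def train_step(model, criterion, optimizer, batch, target):
        optimizer.zero_grad()
        # forward pass runs in float16 where safe when mixed_precision is enabled
        with torch.cuda.amp.autocast(enabled=mixed_precision):
            output = model(batch)
            loss = criterion(output, target)
        # scale the loss so float16 gradients do not underflow, then step and
        # update the scale factor
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        return loss.item()

With mixed_precision set to false, both autocast and GradScaler act as no-ops, so the same step runs unchanged for full-precision training.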
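
Note on the WaveGrad schedule changes: tune_wavegrad.py now draws base values uniformly in [0, 10) and searches exponents from 1e-6 to 1e-1, and the wavegrad.py hunk sets the per-step noise level to the square root of the cumulative product of (1 - beta). The snippet below only illustrates that derivation for a made-up candidate schedule; the actual candidate construction and search loop live elsewhere in tune_wavegrad.py.

    # Illustrative only: noise-level derivation for an example beta schedule.
    import numpy as np

    num_iter = 6
    # an example candidate schedule: per-step magnitudes scaled by a base value
    beta = 3.2 * 10 ** np.linspace(-6, -1, num=num_iter)

    alpha = 1 - beta                # per-step signal retention
    alpha_hat = np.cumprod(alpha)   # cumulative product over diffusion steps
    noise_level = alpha_hat ** 0.5  # sqrt(alpha_hat), as in the wavegrad.py hunk
    print(noise_level)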