From 6378fa2b075bb9d37220350b7aca2cfb2f74d3b0 Mon Sep 17 00:00:00 2001 From: Alex K Date: Thu, 15 Oct 2020 19:14:50 +0200 Subject: [PATCH] add initial wavernn support --- TTS/bin/compute_statistics.py | 53 +- ...{train_vocoder.py => train_gan_vocoder.py} | 344 ++++++------ TTS/bin/train_wavernn_vocoder.py | 493 ++++++++++++++++++ TTS/vocoder/configs/wavernn_config.json | 95 ++++ TTS/vocoder/datasets/preprocess.py | 8 +- TTS/vocoder/utils/generic_utils.py | 20 + 6 files changed, 838 insertions(+), 175 deletions(-) rename TTS/bin/{train_vocoder.py => train_gan_vocoder.py} (68%) create mode 100644 TTS/bin/train_wavernn_vocoder.py create mode 100644 TTS/vocoder/configs/wavernn_config.json diff --git a/TTS/bin/compute_statistics.py b/TTS/bin/compute_statistics.py index 1c6ef94d..9177c75b 100755 --- a/TTS/bin/compute_statistics.py +++ b/TTS/bin/compute_statistics.py @@ -11,20 +11,27 @@ from TTS.tts.datasets.preprocess import load_meta_data from TTS.utils.io import load_config from TTS.utils.audio import AudioProcessor + def main(): """Run preprocessing process.""" parser = argparse.ArgumentParser( - description="Compute mean and variance of spectrogtram features.") - parser.add_argument("--config_path", type=str, required=True, - help="TTS config file path to define audio processin parameters.") - parser.add_argument("--out_path", default=None, type=str, - help="directory to save the output file.") + description="Compute mean and variance of spectrogram features." + ) + parser.add_argument( + "--config_path", + type=str, + required=True, + help="TTS config file path to define audio processing parameters.", + ) + parser.add_argument( + "--out_path", default=None, type=str, help="directory to save the output file." + ) args = parser.parse_args() # load config CONFIG = load_config(args.config_path) - CONFIG.audio['signal_norm'] = False # do not apply earlier normalization - CONFIG.audio['stats_path'] = None # discard pre-defined stats + CONFIG.audio["signal_norm"] = False # do not apply earlier normalization + CONFIG.audio["stats_path"] = None # discard pre-defined stats # load audio processor ap = AudioProcessor(**CONFIG.audio) @@ -58,27 +65,27 @@ def main(): output_file_path = os.path.join(args.out_path, "scale_stats.npy") stats = {} - stats['mel_mean'] = mel_mean - stats['mel_std'] = mel_scale - stats['linear_mean'] = linear_mean - stats['linear_std'] = linear_scale + stats["mel_mean"] = mel_mean + stats["mel_std"] = mel_scale + stats["linear_mean"] = linear_mean + stats["linear_std"] = linear_scale - print(f' > Avg mel spec mean: {mel_mean.mean()}') - print(f' > Avg mel spec scale: {mel_scale.mean()}') - print(f' > Avg linear spec mean: {linear_mean.mean()}') - print(f' > Avg lienar spec scale: {linear_scale.mean()}') + print(f" > Avg mel spec mean: {mel_mean.mean()}") + print(f" > Avg mel spec scale: {mel_scale.mean()}") + print(f" > Avg linear spec mean: {linear_mean.mean()}") + print(f" > Avg linear spec scale: {linear_scale.mean()}") # set default config values for mean-var scaling - CONFIG.audio['stats_path'] = output_file_path - CONFIG.audio['signal_norm'] = True + CONFIG.audio["stats_path"] = output_file_path + CONFIG.audio["signal_norm"] = True # remove redundant values - del CONFIG.audio['max_norm'] - del CONFIG.audio['min_level_db'] - del CONFIG.audio['symmetric_norm'] - del CONFIG.audio['clip_norm'] - stats['audio_config'] = CONFIG.audio + del CONFIG.audio["max_norm"] + del CONFIG.audio["min_level_db"] + del CONFIG.audio["symmetric_norm"] + del CONFIG.audio["clip_norm"] + stats["audio_config"] = CONFIG.audio np.save(output_file_path, stats, allow_pickle=True) - print(f' > scale_stats.npy is saved to {output_file_path}') + print(f" > scale_stats.npy is saved to {output_file_path}") if __name__ == "__main__":
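[Reviewer note] scale_stats.npy stores a pickled dict, so consumers have to unpickle it explicitly. A minimal sketch of reading it back and applying the mean-variance scaling it enables (key names follow the stats dict built above; treating the stats as per-bin vectors is an assumption, and mel is a hypothetical (num_mels, T) spectrogram):

    import numpy as np

    # the file holds a dict, hence allow_pickle=True and .item()
    stats = np.load("scale_stats.npy", allow_pickle=True).item()
    mel_mean = stats["mel_mean"]  # assumed shape: (num_mels,)
    mel_std = stats["mel_std"]    # assumed shape: (num_mels,)

    def normalize_mel(mel):
        # mean-variance normalization, active when signal_norm is True
        # and audio["stats_path"] points at this file
        return (mel - mel_mean[:, None]) / mel_std[:, None]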
stats["audio_config"] = CONFIG.audio np.save(output_file_path, stats, allow_pickle=True) - print(f' > scale_stats.npy is saved to {output_file_path}') + print(f" > scale_stats.npy is saved to {output_file_path}") if __name__ == "__main__": diff --git a/TTS/bin/train_vocoder.py b/TTS/bin/train_gan_vocoder.py similarity index 68% rename from TTS/bin/train_vocoder.py rename to TTS/bin/train_gan_vocoder.py index b51a55a3..7689c930 100644 --- a/TTS/bin/train_vocoder.py +++ b/TTS/bin/train_gan_vocoder.py @@ -10,20 +10,29 @@ import torch from torch.utils.data import DataLoader from TTS.utils.audio import AudioProcessor from TTS.utils.console_logger import ConsoleLogger -from TTS.utils.generic_utils import (KeepAverage, count_parameters, - create_experiment_folder, get_git_branch, - remove_experiment_folder, set_init_dict) +from TTS.utils.generic_utils import ( + KeepAverage, + count_parameters, + create_experiment_folder, + get_git_branch, + remove_experiment_folder, + set_init_dict, +) from TTS.utils.io import copy_config_file, load_config from TTS.utils.radam import RAdam from TTS.utils.tensorboard_logger import TensorboardLogger from TTS.utils.training import setup_torch_training_env from TTS.vocoder.datasets.gan_dataset import GANDataset from TTS.vocoder.datasets.preprocess import load_wav_data, load_wav_feat_data + # from distribute import (DistributedSampler, apply_gradient_allreduce, # init_distributed, reduce_tensor) from TTS.vocoder.layers.losses import DiscriminatorLoss, GeneratorLoss -from TTS.vocoder.utils.generic_utils import (plot_results, setup_discriminator, - setup_generator) +from TTS.vocoder.utils.generic_utils import ( + plot_results, + setup_discriminator, + setup_generator, +) from TTS.vocoder.utils.io import save_best_model, save_checkpoint use_cuda, num_gpus = setup_torch_training_env(True, True) @@ -33,27 +42,30 @@ def setup_loader(ap, is_val=False, verbose=False): if is_val and not c.run_eval: loader = None else: - dataset = GANDataset(ap=ap, - items=eval_data if is_val else train_data, - seq_len=c.seq_len, - hop_len=ap.hop_length, - pad_short=c.pad_short, - conv_pad=c.conv_pad, - is_training=not is_val, - return_segments=not is_val, - use_noise_augment=c.use_noise_augment, - use_cache=c.use_cache, - verbose=verbose) + dataset = GANDataset( + ap=ap, + items=eval_data if is_val else train_data, + seq_len=c.seq_len, + hop_len=ap.hop_length, + pad_short=c.pad_short, + conv_pad=c.conv_pad, + is_training=not is_val, + return_segments=not is_val, + use_noise_augment=c.use_noise_augment, + use_cache=c.use_cache, + verbose=verbose, + ) dataset.shuffle_mapping() # sampler = DistributedSampler(dataset) if num_gpus > 1 else None - loader = DataLoader(dataset, - batch_size=1 if is_val else c.batch_size, - shuffle=True, - drop_last=False, - sampler=None, - num_workers=c.num_val_loader_workers - if is_val else c.num_loader_workers, - pin_memory=False) + loader = DataLoader( + dataset, + batch_size=1 if is_val else c.batch_size, + shuffle=True, + drop_last=False, + sampler=None, + num_workers=c.num_val_loader_workers if is_val else c.num_loader_workers, + pin_memory=False, + ) return loader @@ -80,16 +92,26 @@ def format_data(data): return co, x, None, None -def train(model_G, criterion_G, optimizer_G, model_D, criterion_D, optimizer_D, - scheduler_G, scheduler_D, ap, global_step, epoch): +def train( + model_G, + criterion_G, + optimizer_G, + model_D, + criterion_D, + optimizer_D, + scheduler_G, + scheduler_D, + ap, + global_step, + epoch, +): data_loader = setup_loader(ap, 
is_val=False, verbose=(epoch == 0)) model_G.train() model_D.train() epoch_time = 0 keep_avg = KeepAverage() if use_cuda: - batch_n_iter = int( - len(data_loader.dataset) / (c.batch_size * num_gpus)) + batch_n_iter = int(len(data_loader.dataset) / (c.batch_size * num_gpus)) else: batch_n_iter = int(len(data_loader.dataset) / c.batch_size) end_time = time.time() @@ -145,16 +167,16 @@ def train(model_G, criterion_G, optimizer_G, model_D, criterion_D, optimizer_D, scores_fake = D_out_fake # compute losses - loss_G_dict = criterion_G(y_hat, y_G, scores_fake, feats_fake, - feats_real, y_hat_sub, y_G_sub) - loss_G = loss_G_dict['G_loss'] + loss_G_dict = criterion_G( + y_hat, y_G, scores_fake, feats_fake, feats_real, y_hat_sub, y_G_sub + ) + loss_G = loss_G_dict["G_loss"] # optimizer generator optimizer_G.zero_grad() loss_G.backward() if c.gen_clip_grad > 0: - torch.nn.utils.clip_grad_norm_(model_G.parameters(), - c.gen_clip_grad) + torch.nn.utils.clip_grad_norm_(model_G.parameters(), c.gen_clip_grad) optimizer_G.step() if scheduler_G is not None: scheduler_G.step() @@ -199,14 +221,13 @@ def train(model_G, criterion_G, optimizer_G, model_D, criterion_D, optimizer_D, # compute losses loss_D_dict = criterion_D(scores_fake, scores_real) - loss_D = loss_D_dict['D_loss'] + loss_D = loss_D_dict["D_loss"] # optimizer discriminator optimizer_D.zero_grad() loss_D.backward() if c.disc_clip_grad > 0: - torch.nn.utils.clip_grad_norm_(model_D.parameters(), - c.disc_clip_grad) + torch.nn.utils.clip_grad_norm_(model_D.parameters(), c.disc_clip_grad) optimizer_D.step() if scheduler_D is not None: scheduler_D.step() @@ -221,34 +242,40 @@ def train(model_G, criterion_G, optimizer_G, model_D, criterion_D, optimizer_D, epoch_time += step_time # get current learning rates - current_lr_G = list(optimizer_G.param_groups)[0]['lr'] - current_lr_D = list(optimizer_D.param_groups)[0]['lr'] + current_lr_G = list(optimizer_G.param_groups)[0]["lr"] + current_lr_D = list(optimizer_D.param_groups)[0]["lr"] # update avg stats update_train_values = dict() for key, value in loss_dict.items(): - update_train_values['avg_' + key] = value - update_train_values['avg_loader_time'] = loader_time - update_train_values['avg_step_time'] = step_time + update_train_values["avg_" + key] = value + update_train_values["avg_loader_time"] = loader_time + update_train_values["avg_step_time"] = step_time keep_avg.update_values(update_train_values) # print training stats if global_step % c.print_step == 0: log_dict = { - 'step_time': [step_time, 2], - 'loader_time': [loader_time, 4], + "step_time": [step_time, 2], + "loader_time": [loader_time, 4], "current_lr_G": current_lr_G, - "current_lr_D": current_lr_D + "current_lr_D": current_lr_D, } - c_logger.print_train_step(batch_n_iter, num_iter, global_step, - log_dict, loss_dict, keep_avg.avg_values) + c_logger.print_train_step( + batch_n_iter, + num_iter, + global_step, + log_dict, + loss_dict, + keep_avg.avg_values, + ) # plot step stats if global_step % 10 == 0: iter_stats = { "lr_G": current_lr_G, "lr_D": current_lr_D, - "step_time": step_time + "step_time": step_time, } iter_stats.update(loss_dict) tb_logger.tb_train_iter_stats(global_step, iter_stats) @@ -257,27 +284,28 @@ def train(model_G, criterion_G, optimizer_G, model_D, criterion_D, optimizer_D, if global_step % c.save_step == 0: if c.checkpoint: # save model - save_checkpoint(model_G, - optimizer_G, - scheduler_G, - model_D, - optimizer_D, - scheduler_D, - global_step, - epoch, - OUT_PATH, - model_losses=loss_dict) + save_checkpoint( + 
model_G, + optimizer_G, + scheduler_G, + model_D, + optimizer_D, + scheduler_D, + global_step, + epoch, + OUT_PATH, + model_losses=loss_dict, + ) # compute spectrograms - figures = plot_results(y_hat_vis, y_G, ap, global_step, - 'train') + figures = plot_results(y_hat_vis, y_G, ap, global_step, "train") tb_logger.tb_train_figures(global_step, figures) # Sample audio sample_voice = y_hat_vis[0].squeeze(0).detach().cpu().numpy() - tb_logger.tb_train_audios(global_step, - {'train/audio': sample_voice}, - c.audio["sample_rate"]) + tb_logger.tb_train_audios( + global_step, {"train/audio": sample_voice}, c.audio["sample_rate"] + ) end_time = time.time() # print epoch stats @@ -326,7 +354,6 @@ def evaluate(model_G, criterion_G, model_D, criterion_D, ap, global_step, epoch) y_hat = model_G.pqmf_synthesis(y_hat) y_G_sub = model_G.pqmf_analysis(y_G) - scores_fake, feats_fake, feats_real = None, None, None if global_step > c.steps_to_start_discriminator: @@ -352,8 +379,9 @@ def evaluate(model_G, criterion_G, model_D, criterion_D, ap, global_step, epoch) feats_fake, feats_real = None, None # compute losses - loss_G_dict = criterion_G(y_hat, y_G, scores_fake, feats_fake, - feats_real, y_hat_sub, y_G_sub) + loss_G_dict = criterion_G( + y_hat, y_G, scores_fake, feats_fake, feats_real, y_hat_sub, y_G_sub + ) loss_dict = dict() for key, value in loss_G_dict.items(): @@ -403,16 +431,15 @@ def evaluate(model_G, criterion_G, model_D, criterion_D, ap, global_step, epoch) else: loss_dict[key] = value.item() - step_time = time.time() - start_time epoch_time += step_time # update avg stats update_eval_values = dict() for key, value in loss_dict.items(): - update_eval_values['avg_' + key] = value - update_eval_values['avg_loader_time'] = loader_time - update_eval_values['avg_step_time'] = step_time + update_eval_values["avg_" + key] = value + update_eval_values["avg_loader_time"] = loader_time + update_eval_values["avg_step_time"] = step_time keep_avg.update_values(update_eval_values) # print eval stats @@ -420,13 +447,14 @@ def evaluate(model_G, criterion_G, model_D, criterion_D, ap, global_step, epoch) c_logger.print_eval_step(num_iter, loss_dict, keep_avg.avg_values) # compute spectrograms - figures = plot_results(y_hat, y_G, ap, global_step, 'eval') + figures = plot_results(y_hat, y_G, ap, global_step, "eval") tb_logger.tb_eval_figures(global_step, figures) # Sample audio sample_voice = y_hat[0].squeeze(0).detach().cpu().numpy() - tb_logger.tb_eval_audios(global_step, {'eval/audio': sample_voice}, - c.audio["sample_rate"]) + tb_logger.tb_eval_audios( + global_step, {"eval/audio": sample_voice}, c.audio["sample_rate"] + ) # synthesize a full voice data_loader.return_segments = False @@ -443,7 +471,9 @@ def main(args): # pylint: disable=redefined-outer-name print(f" > Loading wavs from: {c.data_path}") if c.feature_path is not None: print(f" > Loading features from: {c.feature_path}") - eval_data, train_data = load_wav_feat_data(c.data_path, c.feature_path, c.eval_split_size) + eval_data, train_data = load_wav_feat_data( + c.data_path, c.feature_path, c.eval_split_size + ) else: eval_data, train_data = load_wav_data(c.data_path, c.eval_split_size) @@ -461,17 +491,15 @@ def main(args): # pylint: disable=redefined-outer-name # setup optimizers optimizer_gen = RAdam(model_gen.parameters(), lr=c.lr_gen, weight_decay=0) - optimizer_disc = RAdam(model_disc.parameters(), - lr=c.lr_disc, - weight_decay=0) + optimizer_disc = RAdam(model_disc.parameters(), lr=c.lr_disc, weight_decay=0) # schedulers scheduler_gen = None 
scheduler_disc = None - if 'lr_scheduler_gen' in c: + if "lr_scheduler_gen" in c: scheduler_gen = getattr(torch.optim.lr_scheduler, c.lr_scheduler_gen) scheduler_gen = scheduler_gen(optimizer_gen, **c.lr_scheduler_gen_params) - if 'lr_scheduler_disc' in c: + if "lr_scheduler_disc" in c: scheduler_disc = getattr(torch.optim.lr_scheduler, c.lr_scheduler_disc) scheduler_disc = scheduler_disc(optimizer_disc, **c.lr_scheduler_disc_params) @@ -480,47 +508,46 @@ def main(args): # pylint: disable=redefined-outer-name criterion_disc = DiscriminatorLoss(c) if args.restore_path: - checkpoint = torch.load(args.restore_path, map_location='cpu') + checkpoint = torch.load(args.restore_path, map_location="cpu") try: print(" > Restoring Generator Model...") - model_gen.load_state_dict(checkpoint['model']) + model_gen.load_state_dict(checkpoint["model"]) print(" > Restoring Generator Optimizer...") - optimizer_gen.load_state_dict(checkpoint['optimizer']) + optimizer_gen.load_state_dict(checkpoint["optimizer"]) print(" > Restoring Discriminator Model...") - model_disc.load_state_dict(checkpoint['model_disc']) + model_disc.load_state_dict(checkpoint["model_disc"]) print(" > Restoring Discriminator Optimizer...") - optimizer_disc.load_state_dict(checkpoint['optimizer_disc']) - if 'scheduler' in checkpoint: + optimizer_disc.load_state_dict(checkpoint["optimizer_disc"]) + if "scheduler" in checkpoint: print(" > Restoring Generator LR Scheduler...") - scheduler_gen.load_state_dict(checkpoint['scheduler']) + scheduler_gen.load_state_dict(checkpoint["scheduler"]) # NOTE: Not sure if necessary scheduler_gen.optimizer = optimizer_gen - if 'scheduler_disc' in checkpoint: + if "scheduler_disc" in checkpoint: print(" > Restoring Discriminator LR Scheduler...") - scheduler_disc.load_state_dict(checkpoint['scheduler_disc']) + scheduler_disc.load_state_dict(checkpoint["scheduler_disc"]) scheduler_disc.optimizer = optimizer_disc except RuntimeError: # restore only matching layers. print(" > Partial model initialization...") model_dict = model_gen.state_dict() - model_dict = set_init_dict(model_dict, checkpoint['model'], c) + model_dict = set_init_dict(model_dict, checkpoint["model"], c) model_gen.load_state_dict(model_dict) model_dict = model_disc.state_dict() - model_dict = set_init_dict(model_dict, checkpoint['model_disc'], c) + model_dict = set_init_dict(model_dict, checkpoint["model_disc"], c) model_disc.load_state_dict(model_dict) del model_dict # reset lr if not continuing training. 
for group in optimizer_gen.param_groups: - group['lr'] = c.lr_gen + group["lr"] = c.lr_gen for group in optimizer_disc.param_groups: - group['lr'] = c.lr_disc + group["lr"] = c.lr_disc - print(" > Model restored from step %d" % checkpoint['step'], - flush=True) - args.restore_step = checkpoint['step'] + print(" > Model restored from step %d" % checkpoint["step"], flush=True) + args.restore_step = checkpoint["step"] else: args.restore_step = 0 @@ -539,75 +566,92 @@ def main(args): # pylint: disable=redefined-outer-name num_params = count_parameters(model_disc) print(" > Discriminator has {} parameters".format(num_params), flush=True) - if 'best_loss' not in locals(): - best_loss = float('inf') + if "best_loss" not in locals(): + best_loss = float("inf") global_step = args.restore_step for epoch in range(0, c.epochs): c_logger.print_epoch_start(epoch, c.epochs) - _, global_step = train(model_gen, criterion_gen, optimizer_gen, - model_disc, criterion_disc, optimizer_disc, - scheduler_gen, scheduler_disc, ap, global_step, - epoch) - eval_avg_loss_dict = evaluate(model_gen, criterion_gen, model_disc, criterion_disc, ap, - global_step, epoch) + _, global_step = train( + model_gen, + criterion_gen, + optimizer_gen, + model_disc, + criterion_disc, + optimizer_disc, + scheduler_gen, + scheduler_disc, + ap, + global_step, + epoch, + ) + eval_avg_loss_dict = evaluate( + model_gen, criterion_gen, model_disc, criterion_disc, ap, global_step, epoch ) c_logger.print_epoch_end(epoch, eval_avg_loss_dict) target_loss = eval_avg_loss_dict[c.target_loss] - best_loss = save_best_model(target_loss, - best_loss, - model_gen, - optimizer_gen, - scheduler_gen, - model_disc, - optimizer_disc, - scheduler_disc, - global_step, - epoch, - OUT_PATH, - model_losses=eval_avg_loss_dict) + best_loss = save_best_model( + target_loss, + best_loss, + model_gen, + optimizer_gen, + scheduler_gen, + model_disc, + optimizer_disc, + scheduler_disc, + global_step, + epoch, + OUT_PATH, + model_losses=eval_avg_loss_dict, + ) -if __name__ == '__main__': +if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument( - '--continue_path', + "--continue_path", type=str, - help= - 'Training output folder to continue training. Use to continue a training. If it is used, "config_path" is ignored.', - default='', - required='--config_path' not in sys.argv) + help='Training output folder to continue a previous training run. If it is used, "config_path" is ignored.', + default="", + required="--config_path" not in sys.argv, + ) parser.add_argument( - '--restore_path', + "--restore_path", type=str, - help='Model file to be restored. Use to finetune a model.', - default='') - parser.add_argument('--config_path', - type=str, - help='Path to config file for training.', - required='--continue_path' not in sys.argv) - parser.add_argument('--debug', - type=bool, - default=False, - help='Do not verify commit integrity to run training.') + help="Model file to be restored. 
Use to fine-tune a model.", + default="", + ) + parser.add_argument( + "--config_path", + type=str, + help="Path to config file for training.", + required="--continue_path" not in sys.argv, + ) + parser.add_argument( + "--debug", + type=bool, + default=False, + help="Do not verify commit integrity to run training.", + ) # DISTRIBUTED parser.add_argument( - '--rank', + "--rank", type=int, default=0, - help='DISTRIBUTED: process rank for distributed training.') - parser.add_argument('--group_id', - type=str, - default="", - help='DISTRIBUTED: process group id.') + help="DISTRIBUTED: process rank for distributed training.", + ) + parser.add_argument( + "--group_id", type=str, default="", help="DISTRIBUTED: process group id." + ) args = parser.parse_args() - if args.continue_path != '': + if args.continue_path != "": args.output_path = args.continue_path - args.config_path = os.path.join(args.continue_path, 'config.json') + args.config_path = os.path.join(args.continue_path, "config.json") list_of_files = glob.glob( - args.continue_path + - "/*.pth.tar") # * means all if need specific format then *.csv + args.continue_path + "/*.pth.tar" + ) # collect all checkpoint files; the newest one is picked below latest_model_file = max(list_of_files, key=os.path.getctime) args.restore_path = latest_model_file print(f" > Training continues from {args.restore_path}") @@ -618,11 +662,10 @@ if __name__ == '__main__': _ = os.path.dirname(os.path.realpath(__file__)) OUT_PATH = args.continue_path - if args.continue_path == '': - OUT_PATH = create_experiment_folder(c.output_path, c.run_name, - args.debug) + if args.continue_path == "": + OUT_PATH = create_experiment_folder(c.output_path, c.run_name, args.debug) - AUDIO_PATH = os.path.join(OUT_PATH, 'test_audios') + AUDIO_PATH = os.path.join(OUT_PATH, "test_audios") c_logger = ConsoleLogger() @@ -632,16 +675,17 @@ if __name__ == '__main__': if args.restore_path: new_fields["restore_path"] = args.restore_path new_fields["github_branch"] = get_git_branch() - copy_config_file(args.config_path, - os.path.join(OUT_PATH, 'config.json'), new_fields) + copy_config_file( + args.config_path, os.path.join(OUT_PATH, "config.json"), new_fields + ) os.chmod(AUDIO_PATH, 0o775) os.chmod(OUT_PATH, 0o775) LOG_DIR = OUT_PATH - tb_logger = TensorboardLogger(LOG_DIR, model_name='VOCODER') + tb_logger = TensorboardLogger(LOG_DIR, model_name="VOCODER") # write model desc to tensorboard - tb_logger.tb_add_text('model-description', c['run_description'], 0) + tb_logger.tb_add_text("model-description", c["run_description"], 0) try: main(args) @@ -654,4 +698,4 @@ if __name__ == '__main__': except Exception: # pylint: disable=broad-except remove_experiment_folder(OUT_PATH) traceback.print_exc() - sys.exit(1) + sys.exit(1) \ No newline at end of file
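[Reviewer note] Both trainers round-trip a single checkpoint dict, so the key layout is worth pinning down for anyone writing tooling around it. A minimal sketch of inspecting a GAN checkpoint, with key names taken from the save/restore code above (the file name is hypothetical):

    import torch

    checkpoint = torch.load("checkpoint_400000.pth.tar", map_location="cpu")
    print(checkpoint["step"])              # global step at save time
    gen_state = checkpoint["model"]        # generator weights
    disc_state = checkpoint["model_disc"]  # discriminator weights
    # "optimizer" / "optimizer_disc" hold optimizer state; "scheduler" /
    # "scheduler_disc" are present only when LR schedulers were configured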
diff --git a/TTS/bin/train_wavernn_vocoder.py b/TTS/bin/train_wavernn_vocoder.py new file mode 100644 index 00000000..2f77ab57 --- /dev/null +++ b/TTS/bin/train_wavernn_vocoder.py @@ -0,0 +1,493 @@ +import argparse +import math +import os +import pickle +import shutil +import sys +import traceback +import time +import glob +import random + +import torch +from torch import optim +from torch.utils.data import DataLoader +from torch.utils.data.distributed import DistributedSampler + + +from TTS.utils.audio import AudioProcessor +from TTS.tts.utils.visual import plot_spectrogram +from TTS.utils.io import copy_config_file, load_config +from TTS.vocoder.datasets.wavernn_dataset import WaveRNNDataset +from TTS.utils.tensorboard_logger import TensorboardLogger +from TTS.vocoder.datasets.preprocess import load_wav_data, load_wav_feat_data +from TTS.vocoder.utils.distribution import discretized_mix_logistic_loss, gaussian_loss +from TTS.vocoder.utils.generic_utils import setup_wavernn +from TTS.utils.training import setup_torch_training_env +from TTS.utils.console_logger import ConsoleLogger +from TTS.utils.generic_utils import ( + KeepAverage, + count_parameters, + create_experiment_folder, + get_git_branch, + remove_experiment_folder, + set_init_dict, +) +from TTS.vocoder.utils.io import save_best_model, save_checkpoint + + +use_cuda, num_gpus = setup_torch_training_env(True, True) + + +def setup_loader(ap, is_val=False, verbose=False): + if is_val and not CONFIG.run_eval: + loader = None + else: + dataset = WaveRNNDataset( + ap=ap, + items=eval_data if is_val else train_data, + seq_len=CONFIG.seq_len, + hop_len=ap.hop_length, + pad=CONFIG.padding, + mode=CONFIG.mode, + is_training=not is_val, + verbose=verbose, + ) + # sampler = DistributedSampler(dataset) if num_gpus > 1 else None + loader = DataLoader( + dataset, + shuffle=True, + collate_fn=dataset.collate, + batch_size=CONFIG.batch_size, + num_workers=CONFIG.num_val_loader_workers + if is_val + else CONFIG.num_loader_workers, + pin_memory=True, + ) + return loader + + +def format_data(data): + # setup input data + x = data[0] + m = data[1] + y = data[2] + + # dispatch data to GPU + if use_cuda: + x = x.cuda(non_blocking=True) + m = m.cuda(non_blocking=True) + y = y.cuda(non_blocking=True) + + return x, m, y + + +def train(model, optimizer, criterion, scheduler, ap, global_step, epoch): + # create train loader + data_loader = setup_loader(ap, is_val=False, verbose=(epoch == 0)) + model.train() + epoch_time = 0 + keep_avg = KeepAverage() + if use_cuda: + batch_n_iter = int(len(data_loader.dataset) / (CONFIG.batch_size * num_gpus)) + else: + batch_n_iter = int(len(data_loader.dataset) / CONFIG.batch_size) + end_time = time.time() + c_logger.print_train_start() + # train loop + print(" > Training", flush=True) + for num_iter, data in enumerate(data_loader): + start_time = time.time() + x, m, y = format_data(data) + loader_time = time.time() - end_time + global_step += 1 + + ################## + # MODEL TRAINING # + ################## + y_hat = model(x, m) + y_hat_vis = y_hat # for visualization + + # y_hat = y_hat.transpose(1, 2) + if isinstance(model.mode, int): + y_hat = y_hat.transpose(1, 2).unsqueeze(-1) + else: + y = y.float() + y = y.unsqueeze(-1) + # m_scaled, _ = model.upsample(m) + + # compute losses + loss = criterion(y_hat, y) + if math.isnan(loss.item()): + raise RuntimeError(" [!] NaN loss. 
Exiting ...") + optimizer.zero_grad() + loss.backward() + if CONFIG.grad_clip > 0: + torch.nn.utils.clip_grad_norm_(model.parameters(), CONFIG.grad_clip) + + optimizer.step() + if scheduler is not None: + scheduler.step() + + # get the current learning rate + cur_lr = list(optimizer.param_groups)[0]["lr"] + + step_time = time.time() - start_time + epoch_time += step_time + + update_train_values = dict() + loss_dict = dict() + loss_dict["model_loss"] = loss.item() + for key, value in loss_dict.items(): + update_train_values["avg_" + key] = value + update_train_values["avg_loader_time"] = loader_time + update_train_values["avg_step_time"] = step_time + keep_avg.update_values(update_train_values) + + # print training stats + if global_step % CONFIG.print_step == 0: + log_dict = { + "step_time": [step_time, 2], + "loader_time": [loader_time, 4], + "current_lr": cur_lr, + } + c_logger.print_train_step( + batch_n_iter, + num_iter, + global_step, + log_dict, + loss_dict, + keep_avg.avg_values, + ) + + # plot step stats + if global_step % 10 == 0: + iter_stats = {"lr": cur_lr, "step_time": step_time} + iter_stats.update(loss_dict) + tb_logger.tb_train_iter_stats(global_step, iter_stats) + + # save checkpoint + if global_step % CONFIG.save_step == 0: + if CONFIG.checkpoint: + # save model + save_checkpoint( + model, + optimizer, + scheduler, + None, + None, + None, + global_step, + epoch, + OUT_PATH, + model_losses=loss_dict, + ) + + # synthesize a full voice + wav_path = train_data[random.randrange(0, len(train_data))][0] + wav = ap.load_wav(wav_path) + ground_mel = ap.melspectrogram(wav) + sample_wav = model.generate( + ground_mel, + CONFIG.batched, + CONFIG.target_samples, + CONFIG.overlap_samples, + ) + predict_mel = ap.melspectrogram(sample_wav) + + # Sample audio + tb_logger.tb_train_audios( + global_step, {"eval/audio": sample_wav}, CONFIG.audio["sample_rate"] + ) + # compute spectrograms + figures = { + "prediction": plot_spectrogram(predict_mel.T, ap, output_fig=False), + "ground_truth": plot_spectrogram(ground_mel.T, ap, output_fig=False), + } + tb_logger.tb_train_figures(global_step, figures) + end_time = time.time() + + # print epoch stats + c_logger.print_train_epoch_end(global_step, epoch, epoch_time, keep_avg) + + # Plot Training Epoch Stats + epoch_stats = {"epoch_time": epoch_time} + epoch_stats.update(keep_avg.avg_values) + tb_logger.tb_train_epoch_stats(global_step, epoch_stats) + # TODO: plot model stats + # if c.tb_model_param_stats: + # tb_logger.tb_model_weights(model, global_step) + return keep_avg.avg_values, global_step + + +@torch.no_grad() +def evaluate(model, criterion, ap, global_step, epoch): + # create train loader + data_loader = setup_loader(ap, is_val=True, verbose=(epoch == 0)) + model.eval() + epoch_time = 0 + keep_avg = KeepAverage() + end_time = time.time() + c_logger.print_eval_start() + with torch.no_grad(): + for num_iter, data in enumerate(data_loader): + start_time = time.time() + # format data + x, m, y = format_data(data) + loader_time = time.time() - end_time + global_step += 1 + + y_hat = model(x, m) + if isinstance(model.mode, int): + y_hat = y_hat.transpose(1, 2).unsqueeze(-1) + else: + y = y.float() + y = y.unsqueeze(-1) + loss = criterion(y_hat, y) + # Compute avg loss + # if num_gpus > 1: + # loss = reduce_tensor(loss.data, num_gpus) + loss_dict = dict() + loss_dict["model_loss"] = loss.item() + + step_time = time.time() - start_time + epoch_time += step_time + + # update avg stats + update_eval_values = dict() + for key, value in 
loss_dict.items(): + update_eval_values["avg_" + key] = value + update_eval_values["avg_loader_time"] = loader_time + update_eval_values["avg_step_time"] = step_time + keep_avg.update_values(update_eval_values) + + # print eval stats + if CONFIG.print_eval: + c_logger.print_eval_step(num_iter, loss_dict, keep_avg.avg_values) + + if epoch > CONFIG.test_delay_epochs: + # synthesize a full voice + wav_path = eval_data[random.randrange(0, len(eval_data))][0] + wav = ap.load_wav(wav_path) + ground_mel = ap.melspectrogram(wav) + sample_wav = model.generate( + ground_mel, + CONFIG.batched, + CONFIG.target_samples, + CONFIG.overlap_samples, + ) + predict_mel = ap.melspectrogram(sample_wav) + + # Sample audio + tb_logger.tb_eval_audios( + global_step, {"eval/audio": sample_wav}, CONFIG.audio["sample_rate"] + ) + # compute spectrograms + figures = { + "prediction": plot_spectrogram(predict_mel.T, ap, output_fig=False), + "ground_truth": plot_spectrogram(ground_mel.T, ap, output_fig=False), + } + tb_logger.tb_eval_figures(global_step, figures) + + tb_logger.tb_eval_stats(global_step, keep_avg.avg_values) + return keep_avg.avg_values + + +# FIXME: move args definition/parsing inside of main? +def main(args): # pylint: disable=redefined-outer-name + # pylint: disable=global-variable-undefined + global train_data, eval_data + + print(f" > Loading wavs from: {CONFIG.data_path}") + if CONFIG.feature_path is not None: + print(f" > Loading features from: {CONFIG.feature_path}") + eval_data, train_data = load_wav_feat_data( + CONFIG.data_path, CONFIG.feature_path, CONFIG.eval_split_size + ) + else: + eval_data, train_data = load_wav_data(CONFIG.data_path, CONFIG.eval_split_size) + + # setup audio processor + ap = AudioProcessor(**CONFIG.audio) + + # setup model + model_wavernn = setup_wavernn(CONFIG) + + # define train functions + if CONFIG.mode == "mold": + criterion = discretized_mix_logistic_loss + elif CONFIG.mode == "gauss": + criterion = gaussian_loss + elif isinstance(CONFIG.mode, int): + criterion = torch.nn.CrossEntropyLoss() + + if use_cuda: + model_wavernn.cuda() + if isinstance(CONFIG.mode, int): + criterion.cuda() + + optimizer = optim.Adam(model_wavernn.parameters(), lr=CONFIG.lr, weight_decay=0) + scheduler = None + if "lr_scheduler" in CONFIG: + scheduler = getattr(torch.optim.lr_scheduler, CONFIG.lr_scheduler) + scheduler = scheduler(optimizer, **CONFIG.lr_scheduler_params) + # slow start for the first 5 epochs + # lr_lambda = lambda epoch: min(epoch / CONFIG.warmup_steps, 1) + # scheduler = optim.lr_scheduler.LambdaLR(optimizer, lr_lambda) + + # restore any checkpoint + if args.restore_path: + checkpoint = torch.load(args.restore_path, map_location="cpu") + try: + print(" > Restoring Model...") + model_wavernn.load_state_dict(checkpoint["model"]) + print(" > Restoring Optimizer...") + optimizer.load_state_dict(checkpoint["optimizer"]) + if "scheduler" in checkpoint: + print(" > Restoring Generator LR Scheduler...") + scheduler.load_state_dict(checkpoint["scheduler"]) + scheduler.optimizer = optimizer + # TODO: fix resetting restored optimizer lr + # optimizer.load_state_dict(checkpoint["optimizer"]) + except RuntimeError: + # restore only matching layers. 
+ print(" > Partial model initialization...") + model_dict = model_wavernn.state_dict() + model_dict = set_init_dict(model_dict, checkpoint["model"], CONFIG) + model_wavernn.load_state_dict(model_dict) + + print(" > Model restored from step %d" % checkpoint["step"], flush=True) + args.restore_step = checkpoint["step"] + else: + args.restore_step = 0 + + # DISTRIBUTED + # if num_gpus > 1: + # model = apply_gradient_allreduce(model) + + num_parameters = count_parameters(model_wavernn) + print(" > Model has {} parameters".format(num_parameters), flush=True) + + if "best_loss" not in locals(): + best_loss = float("inf") + + global_step = args.restore_step + for epoch in range(0, CONFIG.epochs): + c_logger.print_epoch_start(epoch, CONFIG.epochs) + _, global_step = train( + model_wavernn, optimizer, criterion, scheduler, ap, global_step, epoch + ) + eval_avg_loss_dict = evaluate(model_wavernn, criterion, ap, global_step, epoch) + c_logger.print_epoch_end(epoch, eval_avg_loss_dict) + target_loss = eval_avg_loss_dict["avg_model_loss"] + best_loss = save_best_model( + target_loss, + best_loss, + model_wavernn, + optimizer, + scheduler, + None, + None, + None, + global_step, + epoch, + OUT_PATH, + model_losses=eval_avg_loss_dict, + ) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--continue_path", + type=str, + help='Training output folder to continue training. Use to continue a training. If it is used, "config_path" is ignored.', + default="", + required="--config_path" not in sys.argv, + ) + parser.add_argument( + "--restore_path", + type=str, + help="Model file to be restored. Use to finetune a model.", + default="", + ) + parser.add_argument( + "--config_path", + type=str, + help="Path to config file for training.", + required="--continue_path" not in sys.argv, + ) + parser.add_argument( + "--debug", + type=bool, + default=False, + help="Do not verify commit integrity to run training.", + ) + + # DISTRUBUTED + parser.add_argument( + "--rank", + type=int, + default=0, + help="DISTRIBUTED: process rank for distributed training.", + ) + parser.add_argument( + "--group_id", type=str, default="", help="DISTRIBUTED: process group id." 
+    ) + args = parser.parse_args() + + if args.continue_path != "": + args.output_path = args.continue_path + args.config_path = os.path.join(args.continue_path, "config.json") + list_of_files = glob.glob( + args.continue_path + "/*.pth.tar" + ) # collect all checkpoint files; the newest one is picked below + latest_model_file = max(list_of_files, key=os.path.getctime) + args.restore_path = latest_model_file + print(f" > Training continues from {args.restore_path}") + + # setup output paths and read configs + CONFIG = load_config(args.config_path) + # check_config(c) + _ = os.path.dirname(os.path.realpath(__file__)) + + OUT_PATH = args.continue_path + if args.continue_path == "": + OUT_PATH = create_experiment_folder( + CONFIG.output_path, CONFIG.run_name, args.debug + ) + + AUDIO_PATH = os.path.join(OUT_PATH, "test_audios") + + c_logger = ConsoleLogger() + + if args.rank == 0: + os.makedirs(AUDIO_PATH, exist_ok=True) + new_fields = {} + if args.restore_path: + new_fields["restore_path"] = args.restore_path + new_fields["github_branch"] = get_git_branch() + copy_config_file( + args.config_path, os.path.join(OUT_PATH, "config.json"), new_fields + ) + os.chmod(AUDIO_PATH, 0o775) + os.chmod(OUT_PATH, 0o775) + + LOG_DIR = OUT_PATH + tb_logger = TensorboardLogger(LOG_DIR, model_name="VOCODER") + + # write model desc to tensorboard + tb_logger.tb_add_text("model-description", CONFIG["run_description"], 0) + + try: + main(args) + except KeyboardInterrupt: + remove_experiment_folder(OUT_PATH) + try: + sys.exit(0) + except SystemExit: + os._exit(0) # pylint: disable=protected-access + except Exception: # pylint: disable=broad-except + remove_experiment_folder(OUT_PATH) + traceback.print_exc() + sys.exit(1)
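[Reviewer note] The output layer and the loss have to agree, and the script encodes that only implicitly in main(). A compact restatement of the mapping (pick_criterion is a hypothetical helper; the branches mirror main() above, plus an explicit error for unknown modes, which main() currently lacks):

    import torch
    from TTS.vocoder.utils.distribution import discretized_mix_logistic_loss, gaussian_loss

    def pick_criterion(mode):
        # "mold" -> mixture of logistics, "gauss" -> single Gaussian,
        # an int bit depth -> categorical output trained with cross entropy
        if mode == "mold":
            return discretized_mix_logistic_loss
        if mode == "gauss":
            return gaussian_loss
        if isinstance(mode, int):
            return torch.nn.CrossEntropyLoss()
        raise ValueError(f" [!] Unknown mode: {mode}")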
diff --git a/TTS/vocoder/configs/wavernn_config.json b/TTS/vocoder/configs/wavernn_config.json new file mode 100644 index 00000000..f7e5d99f --- /dev/null +++ b/TTS/vocoder/configs/wavernn_config.json @@ -0,0 +1,95 @@ +{ + "model": "wavernn", + "run_name": "wavernn_test", + "run_description": "wavernn_test training", + + // AUDIO PARAMETERS + "audio":{ + "fft_size": 1024, // number of stft frequency levels. Size of the linear spectrogram frame. + "win_length": 1024, // stft window length in samples. + "hop_length": 256, // stft window hop-length in samples. + "frame_length_ms": null, // stft window length in ms. If null, 'win_length' is used. + "frame_shift_ms": null, // stft window hop-length in ms. If null, 'hop_length' is used. + + // Audio processing parameters + "sample_rate": 22050, // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled. + "preemphasis": 0.98, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no pre-emphasis. + "ref_level_db": 20, // reference level db, theoretically 20db is the sound of air. + + // Silence trimming + "do_trim_silence": false, // enable trimming of silence of audio as you load it. LJspeech (false), TWEB (false), Nancy (true) + "trim_db": 60, // threshold for trimming silence. Set this according to your dataset. + + // MelSpectrogram parameters + "num_mels": 80, // size of the mel spec frame. + "mel_fmin": 40.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!! + "mel_fmax": 8000.0, // maximum freq level for mel-spec. Tune for dataset!! + "spec_gain": 20.0, // scaler value applied after log transform of spectrogram. + + // Normalization parameters + "signal_norm": true, // normalize spec values. Mean-Var normalization if 'stats_path' is defined, otherwise range normalization defined by the other params. + "min_level_db": -100, // lower bound for normalization + "symmetric_norm": true, // move normalization to range [-1, 1] + "max_norm": 4.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm] + "clip_norm": true, // clip normalized values into the range. + "stats_path": null // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based normalization is used and other normalization params are ignored + }, + + // Generating / Synthesizing + "batched": true, + "target_samples": 11000, // target number of samples to be generated in each batch entry + "overlap_samples": 550, // number of samples for crossfading between batches + + // DISTRIBUTED TRAINING + // "distributed":{ + // "backend": "nccl", + // "url": "tcp:\/\/localhost:54321" + // }, + + // MODEL PARAMETERS + "use_aux_net": true, + "use_upsample_net": true, + "upsample_factors": [4, 8, 8], // this needs to correctly factorise hop_length + "seq_len": 1280, // has to be divisible by hop_length + "mode": "mold", // mold [string], gauss [string], bits [int] + "mulaw": false, // apply mulaw if mode is bits + "padding": 2, // pad the input for resnet to see wider input length + + // DATASET + "data_path": "/media/alexander/LinuxFS/SpeechData/GothicSpeech/NPC_Speech/", // path containing training wav files + "feature_path": "/media/alexander/LinuxFS/SpeechData/GothicSpeech/NPC_Speech_Computed/mel/", // path containing extracted features .npy (mels / quant) + + // TRAINING + "batch_size": 32, // Batch size for training. + "epochs": 10000, // total number of epochs to train. + "warmup_steps": 10, + + // VALIDATION + "run_eval": true, + "test_delay_epochs": 10, // early testing only wastes computation time. + + // OPTIMIZER + "grad_clip": 4, // apply gradient clipping if > 0 + "lr_scheduler": "MultiStepLR", // one of the schedulers from https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate + "lr_scheduler_params": { + "gamma": 0.5, + "milestones": [200000, 400000, 600000] + }, + "lr": 1e-4, // initial learning rate + + // TENSORBOARD and LOGGING + "print_step": 25, // Number of steps to log training on console. + "print_eval": false, // If True, it prints loss values for each step in eval run. + "save_step": 25000, // Number of training steps between saving model checkpoints and plotting training stats on TB. + "checkpoint": true, // If true, it saves checkpoints per "save_step" + "tb_model_param_stats": false, // if true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging. + + // DATA LOADING + "num_loader_workers": 4, // number of training data loader processes. Don't set it too big. 4-8 are good values. + "num_val_loader_workers": 4, // number of evaluation data loader processes. + "eval_split_size": 50, // number of samples for testing + + // PATHS + "output_path": "/media/alexander/LinuxFS/Projects/wavernn/Trainings/" +}
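[Reviewer note] Two constraints above are stated only in comments: upsample_factors must multiply to hop_length, and seq_len must be divisible by hop_length. A quick sanity check, assuming the config is saved at the path used in this PR:

    from TTS.utils.io import load_config

    CONFIG = load_config("TTS/vocoder/configs/wavernn_config.json")

    product = 1
    for factor in CONFIG.upsample_factors:
        product *= factor
    assert product == CONFIG.audio["hop_length"]             # 4 * 8 * 8 == 256
    assert CONFIG.seq_len % CONFIG.audio["hop_length"] == 0  # 1280 % 256 == 0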
+ "eval_split_size": 50, // number of samples for testing + + // PATHS + "output_path": "/media/alexander/LinuxFS/Projects/wavernn/Trainings/" +} + diff --git a/TTS/vocoder/datasets/preprocess.py b/TTS/vocoder/datasets/preprocess.py index be60c13a..a5365686 100644 --- a/TTS/vocoder/datasets/preprocess.py +++ b/TTS/vocoder/datasets/preprocess.py @@ -23,8 +23,12 @@ def load_wav_data(data_path, eval_split_size): def load_wav_feat_data(data_path, feat_path, eval_split_size): - wav_paths = sorted(find_wav_files(data_path)) - feat_paths = sorted(find_feat_files(feat_path)) + wav_paths = find_wav_files(data_path) + feat_paths = find_feat_files(feat_path) + + wav_paths.sort(key=lambda x: Path(x).stem) + feat_paths.sort(key=lambda x: Path(x).stem) + assert len(wav_paths) == len(feat_paths) for wav, feat in zip(wav_paths, feat_paths): wav_name = Path(wav).stem diff --git a/TTS/vocoder/utils/generic_utils.py b/TTS/vocoder/utils/generic_utils.py index 89dc68fb..365d0e11 100644 --- a/TTS/vocoder/utils/generic_utils.py +++ b/TTS/vocoder/utils/generic_utils.py @@ -41,6 +41,26 @@ def to_camel(text): text = text.capitalize() return re.sub(r'(?!^)_([a-zA-Z])', lambda m: m.group(1).upper(), text) +def setup_wavernn(c): + print(" > Model: {}".format(c.model)) + MyModel = importlib.import_module('TTS.vocoder.models.wavernn') + MyModel = getattr(MyModel, "WaveRNN") + model = MyModel( + rnn_dims=512, + fc_dims=512, + mode=c.mode, + mulaw=c.mulaw, + pad=c.padding, + use_aux_net=c.use_aux_net, + use_upsample_net=c.use_upsample_net, + upsample_factors=c.upsample_factors, + feat_dims=80, + compute_dims=128, + res_out_dims=128, + res_blocks=10, + hop_length=c.audio['hop_length'], + sample_rate=c.audio['sample_rate']) + return model def setup_generator(c): print(" > Generator Model: {}".format(c.generator_model))