diff --git a/TTS/bin/train_glow_tts.py b/TTS/bin/train_glow_tts.py index 407616ec..3c211496 100644 --- a/TTS/bin/train_glow_tts.py +++ b/TTS/bin/train_glow_tts.py @@ -116,7 +116,7 @@ def format_data(data): avg_text_length, avg_spec_length, attn_mask, item_idx -def data_depended_init(data_loader, model, ap): +def data_depended_init(data_loader, model): """Data depended initialization for activation normalization.""" if hasattr(model, 'module'): for f in model.module.decoder.flows: @@ -135,7 +135,7 @@ def data_depended_init(data_loader, model, ap): # format data text_input, text_lengths, mel_input, mel_lengths, spekaer_embed,\ - _, _, attn_mask, item_idx = format_data(data) + _, _, attn_mask, _ = format_data(data) # forward pass model _ = model.forward( @@ -174,7 +174,7 @@ def train(data_loader, model, criterion, optimizer, scheduler, # format data text_input, text_lengths, mel_input, mel_lengths, speaker_c,\ - avg_text_length, avg_spec_length, attn_mask, item_idx = format_data(data) + avg_text_length, avg_spec_length, attn_mask, _ = format_data(data) loader_time = time.time() - end_time @@ -188,20 +188,20 @@ def train(data_loader, model, criterion, optimizer, scheduler, # compute loss loss_dict = criterion(z, y_mean, y_log_scale, logdet, mel_lengths, - o_dur_log, o_total_dur, text_lengths) + o_dur_log, o_total_dur, text_lengths) # backward pass with loss scaling if c.mixed_precision: scaler.scale(loss_dict['loss']).backward() scaler.unscale_(optimizer) grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), - c.grad_clip) + c.grad_clip) scaler.step(optimizer) scaler.update() else: loss_dict['loss'].backward() grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), - c.grad_clip) + c.grad_clip) optimizer.step() # setup lr @@ -329,7 +329,7 @@ def evaluate(data_loader, model, criterion, ap, global_step, epoch): # format data text_input, text_lengths, mel_input, mel_lengths, speaker_c,\ - _, _, attn_mask, item_idx = format_data(data) + _, _, attn_mask, _ = format_data(data) # forward pass model z, logdet, y_mean, y_log_scale, alignments, o_dur_log, o_total_dur = model.forward( @@ -546,13 +546,14 @@ def main(args): # pylint: disable=redefined-outer-name eval_loader = setup_loader(ap, 1, is_val=True, verbose=True) global_step = args.restore_step - model = data_depended_init(train_loader, model, ap) + model = data_depended_init(train_loader, model) for epoch in range(0, c.epochs): c_logger.print_epoch_start(epoch, c.epochs) train_avg_loss_dict, global_step = train(train_loader, model, criterion, optimizer, scheduler, ap, global_step, epoch) - eval_avg_loss_dict = evaluate(eval_loader , model, criterion, ap, global_step, epoch) + eval_avg_loss_dict = evaluate(eval_loader , model, criterion, ap, + global_step, epoch) c_logger.print_epoch_end(epoch, eval_avg_loss_dict) target_loss = train_avg_loss_dict['avg_loss'] if c.run_eval: diff --git a/TTS/bin/train_speedy_speech.py b/TTS/bin/train_speedy_speech.py index bee37b05..7d7d834c 100644 --- a/TTS/bin/train_speedy_speech.py +++ b/TTS/bin/train_speedy_speech.py @@ -172,13 +172,13 @@ def train(data_loader, model, criterion, optimizer, scheduler, scaler.scale(loss_dict['loss']).backward() scaler.unscale_(optimizer) grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), - c.grad_clip) + c.grad_clip) scaler.step(optimizer) scaler.update() else: loss_dict['loss'].backward() grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), - c.grad_clip) + c.grad_clip) optimizer.step() # setup lr @@ -515,12 +515,14 @@ def main(args): # pylint: disable=redefined-outer-name train_avg_loss_dict, global_step = train(train_loader, model, criterion, optimizer, scheduler, ap, global_step, epoch) - eval_avg_loss_dict = evaluate(eval_loader , model, criterion, ap, global_step, epoch) + eval_avg_loss_dict = evaluate(eval_loader , model, criterion, ap, + global_step, epoch) c_logger.print_epoch_end(epoch, eval_avg_loss_dict) target_loss = train_avg_loss_dict['avg_loss'] if c.run_eval: target_loss = eval_avg_loss_dict['avg_loss'] - best_loss = save_best_model(target_loss, best_loss, model, optimizer, global_step, epoch, c.r, + best_loss = save_best_model(target_loss, best_loss, model, optimizer, + global_step, epoch, c.r, OUT_PATH) diff --git a/TTS/bin/train_tacotron.py b/TTS/bin/train_tacotron.py index e8b8b8e9..53e028d3 100644 --- a/TTS/bin/train_tacotron.py +++ b/TTS/bin/train_tacotron.py @@ -9,8 +9,8 @@ from random import randrange import numpy as np import torch -from TTS.utils.arguments import parse_arguments, process_args from torch.utils.data import DataLoader +from TTS.utils.arguments import parse_arguments, process_args from TTS.tts.datasets.preprocess import load_meta_data from TTS.tts.datasets.TTSDataset import MyDataset from TTS.tts.layers.losses import TacotronLoss @@ -62,7 +62,7 @@ def setup_loader(ap, r, is_val=False, verbose=False, dataset=None): c.use_external_speaker_embedding_file ) else None ) - ) + ) if c.use_phonemes and c.compute_input_seq_cache: # precompute phonemes to have a better estimate of sequence lengths. @@ -179,10 +179,10 @@ def train(data_loader, model, criterion, optimizer, optimizer_st, scheduler, # compute loss loss_dict = criterion(postnet_output, decoder_output, mel_input, - linear_input, stop_tokens, stop_targets, - mel_lengths, decoder_backward_output, - alignments, alignment_lengths, alignments_backward, - text_lengths) + linear_input, stop_tokens, stop_targets, + mel_lengths, decoder_backward_output, + alignments, alignment_lengths, + alignments_backward, text_lengths) # check nan loss if torch.isnan(loss_dict['loss']).any(): @@ -200,7 +200,7 @@ def train(data_loader, model, criterion, optimizer, optimizer_st, scheduler, # stopnet optimizer step if c.separate_stopnet: - scaler_st.scale( loss_dict['stopnet_loss']).backward() + scaler_st.scale(loss_dict['stopnet_loss']).backward() scaler.unscale_(optimizer_st) optimizer_st, _ = adam_weight_decay(optimizer_st) grad_norm_st, _ = check_update(model.decoder.stopnet, 1.0) @@ -534,8 +534,7 @@ def main(args): # pylint: disable=redefined-outer-name optimizer_st = None # setup criterion - criterion = TacotronLoss(c, stopnet_pos_weight=10.0, ga_sigma=0.4) - + criterion = TacotronLoss(c, stopnet_pos_weight=c.stopnet_pos_weight, ga_sigma=0.4) if args.restore_path: checkpoint = torch.load(args.restore_path, map_location='cpu') try: @@ -637,7 +636,8 @@ def main(args): # pylint: disable=redefined-outer-name epoch, c.r, OUT_PATH, - scaler=scaler.state_dict() if c.mixed_precision else None) + scaler=scaler.state_dict() if c.mixed_precision else None + ) if __name__ == '__main__': diff --git a/TTS/bin/train_vocoder_gan.py b/TTS/bin/train_vocoder_gan.py index 2c1f901a..1f2beb70 100644 --- a/TTS/bin/train_vocoder_gan.py +++ b/TTS/bin/train_vocoder_gan.py @@ -8,8 +8,8 @@ import traceback from inspect import signature import torch -from TTS.utils.arguments import parse_arguments, process_args from torch.utils.data import DataLoader +from TTS.utils.arguments import parse_arguments, process_args from TTS.utils.audio import AudioProcessor from TTS.utils.generic_utils import (KeepAverage, count_parameters, remove_experiment_folder, set_init_dict) @@ -33,9 +33,8 @@ use_cuda, num_gpus = setup_torch_training_env(True, True) def setup_loader(ap, is_val=False, verbose=False): - if is_val and not c.run_eval: - loader = None - else: + loader = None + if not is_val or c.run_eval: dataset = GANDataset(ap=ap, items=eval_data if is_val else train_data, seq_len=c.seq_len, @@ -114,7 +113,7 @@ def train(model_G, criterion_G, optimizer_G, model_D, criterion_D, optimizer_D, y_hat = model_G(c_G) y_hat_sub = None y_G_sub = None - y_hat_vis = y_hat # for visualization # FIXME! .clone().detach() + y_hat_vis = y_hat # for visualization # PQMF formatting if y_hat.shape[1] > 1: @@ -274,14 +273,14 @@ def train(model_G, criterion_G, optimizer_G, model_D, criterion_D, optimizer_D, # compute spectrograms figures = plot_results(y_hat_vis, y_G, ap, global_step, - 'train') + 'train') tb_logger.tb_train_figures(global_step, figures) # Sample audio sample_voice = y_hat_vis[0].squeeze(0).detach().cpu().numpy() tb_logger.tb_train_audios(global_step, - {'train/audio': sample_voice}, - c.audio["sample_rate"]) + {'train/audio': sample_voice}, + c.audio["sample_rate"]) end_time = time.time() # print epoch stats diff --git a/TTS/bin/train_vocoder_wavegrad.py b/TTS/bin/train_vocoder_wavegrad.py index 4ef6769c..d8dc88e1 100644 --- a/TTS/bin/train_vocoder_wavegrad.py +++ b/TTS/bin/train_vocoder_wavegrad.py @@ -8,12 +8,12 @@ import traceback import numpy as np import torch -from TTS.utils.arguments import parse_arguments, process_args # DISTRIBUTED from torch.nn.parallel import DistributedDataParallel as DDP_th from torch.optim import Adam from torch.utils.data import DataLoader from torch.utils.data.distributed import DistributedSampler +from TTS.utils.arguments import parse_arguments, process_args from TTS.utils.audio import AudioProcessor from TTS.utils.distribute import init_distributed from TTS.utils.generic_utils import (KeepAverage, count_parameters, @@ -32,16 +32,16 @@ def setup_loader(ap, is_val=False, verbose=False): loader = None else: dataset = WaveGradDataset(ap=ap, - items=eval_data if is_val else train_data, - seq_len=c.seq_len, - hop_len=ap.hop_length, - pad_short=c.pad_short, - conv_pad=c.conv_pad, - is_training=not is_val, - return_segments=True, - use_noise_augment=False, - use_cache=c.use_cache, - verbose=verbose) + items=eval_data if is_val else train_data, + seq_len=c.seq_len, + hop_len=ap.hop_length, + pad_short=c.pad_short, + conv_pad=c.conv_pad, + is_training=not is_val, + return_segments=True, + use_noise_augment=False, + use_cache=c.use_cache, + verbose=verbose) sampler = DistributedSampler(dataset) if num_gpus > 1 else None loader = DataLoader(dataset, batch_size=c.batch_size, @@ -77,8 +77,8 @@ def format_test_data(data): return m, x -def train(model, criterion, optimizer, - scheduler, scaler, ap, global_step, epoch): +def train(model, criterion, optimizer, scheduler, scaler, ap, global_step, + epoch): data_loader = setup_loader(ap, is_val=False, verbose=(epoch == 0)) model.train() epoch_time = 0 @@ -92,7 +92,8 @@ def train(model, criterion, optimizer, c_logger.print_train_start() # setup noise schedule noise_schedule = c['train_noise_schedule'] - betas = np.linspace(noise_schedule['min_val'], noise_schedule['max_val'], noise_schedule['num_steps']) + betas = np.linspace(noise_schedule['min_val'], noise_schedule['max_val'], + noise_schedule['num_steps']) if hasattr(model, 'module'): model.module.compute_noise_level(betas) else: @@ -118,7 +119,7 @@ def train(model, criterion, optimizer, # compute losses loss = criterion(noise, noise_hat) - loss_wavegrad_dict = {'wavegrad_loss':loss} + loss_wavegrad_dict = {'wavegrad_loss': loss} # check nan loss if torch.isnan(loss).any(): @@ -131,13 +132,13 @@ def train(model, criterion, optimizer, scaler.scale(loss).backward() scaler.unscale_(optimizer) grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), - c.clip_grad) + c.clip_grad) scaler.step(optimizer) scaler.update() else: loss.backward() grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), - c.clip_grad) + c.clip_grad) optimizer.step() # schedule update @@ -193,17 +194,19 @@ def train(model, criterion, optimizer, if global_step % c.save_step == 0: if c.checkpoint: # save model - save_checkpoint(model, - optimizer, - scheduler, - None, - None, - None, - global_step, - epoch, - OUT_PATH, - model_losses=loss_dict, - scaler=scaler.state_dict() if c.mixed_precision else None) + save_checkpoint( + model, + optimizer, + scheduler, + None, + None, + None, + global_step, + epoch, + OUT_PATH, + model_losses=loss_dict, + scaler=scaler.state_dict() if c.mixed_precision else None + ) end_time = time.time() @@ -250,7 +253,7 @@ def evaluate(model, criterion, ap, global_step, epoch): # compute losses loss = criterion(noise, noise_hat) - loss_wavegrad_dict = {'wavegrad_loss':loss} + loss_wavegrad_dict = {'wavegrad_loss': loss} loss_dict = dict() @@ -282,7 +285,9 @@ def evaluate(model, criterion, ap, global_step, epoch): # setup noise schedule and inference noise_schedule = c['test_noise_schedule'] - betas = np.linspace(noise_schedule['min_val'], noise_schedule['max_val'], noise_schedule['num_steps']) + betas = np.linspace(noise_schedule['min_val'], + noise_schedule['max_val'], + noise_schedule['num_steps']) if hasattr(model, 'module'): model.module.compute_noise_level(betas) # compute voice @@ -313,7 +318,8 @@ def main(args): # pylint: disable=redefined-outer-name print(f" > Loading wavs from: {c.data_path}") if c.feature_path is not None: print(f" > Loading features from: {c.feature_path}") - eval_data, train_data = load_wav_feat_data(c.data_path, c.feature_path, c.eval_split_size) + eval_data, train_data = load_wav_feat_data(c.data_path, c.feature_path, + c.eval_split_size) else: eval_data, train_data = load_wav_data(c.data_path, c.eval_split_size) @@ -343,6 +349,10 @@ def main(args): # pylint: disable=redefined-outer-name # setup criterion criterion = torch.nn.L1Loss().cuda() + if use_cuda: + model.cuda() + criterion.cuda() + if args.restore_path: checkpoint = torch.load(args.restore_path, map_location='cpu') try: @@ -376,10 +386,6 @@ def main(args): # pylint: disable=redefined-outer-name else: args.restore_step = 0 - if use_cuda: - model.cuda() - criterion.cuda() - # DISTRUBUTED if num_gpus > 1: model = DDP_th(model, device_ids=[args.rank]) @@ -393,26 +399,26 @@ def main(args): # pylint: disable=redefined-outer-name global_step = args.restore_step for epoch in range(0, c.epochs): c_logger.print_epoch_start(epoch, c.epochs) - _, global_step = train(model, criterion, optimizer, - scheduler, scaler, ap, global_step, - epoch) - eval_avg_loss_dict = evaluate(model, criterion, ap, - global_step, epoch) + _, global_step = train(model, criterion, optimizer, scheduler, scaler, + ap, global_step, epoch) + eval_avg_loss_dict = evaluate(model, criterion, ap, global_step, epoch) c_logger.print_epoch_end(epoch, eval_avg_loss_dict) target_loss = eval_avg_loss_dict[c.target_loss] - best_loss = save_best_model(target_loss, - best_loss, - model, - optimizer, - scheduler, - None, - None, - None, - global_step, - epoch, - OUT_PATH, - model_losses=eval_avg_loss_dict, - scaler=scaler.state_dict() if c.mixed_precision else None) + best_loss = save_best_model( + target_loss, + best_loss, + model, + optimizer, + scheduler, + None, + None, + None, + global_step, + epoch, + OUT_PATH, + model_losses=eval_avg_loss_dict, + scaler=scaler.state_dict() if c.mixed_precision else None + ) if __name__ == '__main__': diff --git a/TTS/bin/train_vocoder_wavernn.py b/TTS/bin/train_vocoder_wavernn.py index e32301fc..b4ffe143 100644 --- a/TTS/bin/train_vocoder_wavernn.py +++ b/TTS/bin/train_vocoder_wavernn.py @@ -178,18 +178,19 @@ def train(model, optimizer, criterion, scheduler, scaler, ap, global_step, epoch if global_step % c.save_step == 0: if c.checkpoint: # save model - save_checkpoint(model, - optimizer, - scheduler, - None, - None, - None, - global_step, - epoch, - OUT_PATH, - model_losses=loss_dict, - scaler=scaler.state_dict() if c.mixed_precision else None - ) + save_checkpoint( + model, + optimizer, + scheduler, + None, + None, + None, + global_step, + epoch, + OUT_PATH, + model_losses=loss_dict, + scaler=scaler.state_dict() if c.mixed_precision else None + ) # synthesize a full voice rand_idx = random.randrange(0, len(train_data)) @@ -204,14 +205,7 @@ def train(model, optimizer, criterion, scheduler, scaler, ap, global_step, epoch c.batched, c.target_samples, c.overlap_samples, - # use_cuda ) - # sample_wav = model.generate(ground_mel, - # c.batched, - # c.target_samples, - # c.overlap_samples, - # use_cuda - # ) predict_mel = ap.melspectrogram(sample_wav) # compute spectrograms @@ -300,7 +294,6 @@ def evaluate(model, criterion, ap, global_step, epoch): c.batched, c.target_samples, c.overlap_samples, - # use_cuda ) predict_mel = ap.melspectrogram(sample_wav) @@ -311,9 +304,10 @@ def evaluate(model, criterion, ap, global_step, epoch): ) # compute spectrograms - figures = {"eval/ground_truth": plot_spectrogram(ground_mel.T), - "eval/prediction": plot_spectrogram(predict_mel.T) - } + figures = { + "eval/ground_truth": plot_spectrogram(ground_mel.T), + "eval/prediction": plot_spectrogram(predict_mel.T) + } tb_logger.tb_eval_figures(global_step, figures) tb_logger.tb_eval_stats(global_step, keep_avg.avg_values)