diff --git a/TTS/bin/train_vocoder_wavegrad.py b/TTS/bin/train_vocoder_wavegrad.py
index 1f039a67..c0fcff51 100644
--- a/TTS/bin/train_vocoder_wavegrad.py
+++ b/TTS/bin/train_vocoder_wavegrad.py
@@ -15,7 +15,7 @@ from torch.optim import Adam
 from torch.utils.data import DataLoader
 from torch.utils.data.distributed import DistributedSampler

-from TTS.utils.arguments import parse_arguments, process_args
+from TTS.utils.arguments import init_training
 from TTS.utils.audio import AudioProcessor
 from TTS.utils.distribute import init_distributed
 from TTS.utils.generic_utils import KeepAverage, count_parameters, remove_experiment_folder, set_init_dict
@@ -131,12 +131,12 @@ def train(model, criterion, optimizer, scheduler, scaler, ap, global_step, epoch
         if c.mixed_precision:
             scaler.scale(loss).backward()
             scaler.unscale_(optimizer)
-            grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), c.clip_grad)
+            grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), c.grad_clip)
             scaler.step(optimizer)
             scaler.update()
         else:
             loss.backward()
-            grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), c.clip_grad)
+            grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), c.grad_clip)
             optimizer.step()

         # schedule update
@@ -311,7 +311,7 @@ def main(args):  # pylint: disable=redefined-outer-name
     eval_data, train_data = load_wav_data(c.data_path, c.eval_split_size)

     # setup audio processor
-    ap = AudioProcessor(**c.audio)
+    ap = AudioProcessor(**c.audio.to_dict())

     # DISTRUBUTED
     if num_gpus > 1:
@@ -416,9 +416,7 @@ def main(args):  # pylint: disable=redefined-outer-name


 if __name__ == "__main__":
-    args = parse_arguments(sys.argv)
-    c, OUT_PATH, AUDIO_PATH, c_logger, tb_logger = process_args(args, model_class="vocoder")
-
+    args, c, OUT_PATH, AUDIO_PATH, c_logger, tb_logger = init_training(sys.argv)
     try:
         main(args)
     except KeyboardInterrupt:
diff --git a/TTS/bin/train_vocoder_wavernn.py b/TTS/bin/train_vocoder_wavernn.py
index 8a2d8d3a..bcad9493 100644
--- a/TTS/bin/train_vocoder_wavernn.py
+++ b/TTS/bin/train_vocoder_wavernn.py
@@ -11,7 +11,7 @@ import torch
 from torch.utils.data import DataLoader

 from TTS.tts.utils.visual import plot_spectrogram
-from TTS.utils.arguments import parse_arguments, process_args
+from TTS.utils.arguments import init_training
 from TTS.utils.audio import AudioProcessor
 from TTS.utils.generic_utils import KeepAverage, count_parameters, remove_experiment_folder, set_init_dict
 from TTS.utils.radam import RAdam
@@ -307,29 +307,7 @@ def main(args):  # pylint: disable=redefined-outer-name
     global train_data, eval_data

     # setup audio processor
-    ap = AudioProcessor(**c.audio)
-
-    # print(f" > Loading wavs from: {c.data_path}")
-    # if c.feature_path is not None:
-    #     print(f" > Loading features from: {c.feature_path}")
-    #     eval_data, train_data = load_wav_feat_data(
-    #         c.data_path, c.feature_path, c.eval_split_size
-    #     )
-    # else:
-    #     mel_feat_path = os.path.join(OUT_PATH, "mel")
-    #     feat_data = find_feat_files(mel_feat_path)
-    #     if feat_data:
-    #         print(f" > Loading features from: {mel_feat_path}")
-    #         eval_data, train_data = load_wav_feat_data(
-    #             c.data_path, mel_feat_path, c.eval_split_size
-    #         )
-    #     else:
-    #         print(" > No feature data found. Preprocessing...")
Preprocessing...") - # # preprocessing feature data from given wav files - # preprocess_wav_files(OUT_PATH, CONFIG, ap) - # eval_data, train_data = load_wav_feat_data( - # c.data_path, mel_feat_path, c.eval_split_size - # ) + ap = AudioProcessor(**c.audio.to_dict()) print(f" > Loading wavs from: {c.data_path}") if c.feature_path is not None: @@ -438,9 +416,7 @@ def main(args): # pylint: disable=redefined-outer-name if __name__ == "__main__": - args = parse_arguments(sys.argv) - c, OUT_PATH, AUDIO_PATH, c_logger, tb_logger = process_args(args, model_class="vocoder") - + args, c, OUT_PATH, AUDIO_PATH, c_logger, tb_logger = init_training(sys.argv) try: main(args) except KeyboardInterrupt: diff --git a/TTS/vocoder/configs/hifigan_ljspeech.json b/TTS/vocoder/configs/hifigan_ljspeech.json deleted file mode 100644 index 23cbf3f8..00000000 --- a/TTS/vocoder/configs/hifigan_ljspeech.json +++ /dev/null @@ -1,164 +0,0 @@ -{ - "run_name": "hifigan", - "run_description": "hifigan mean-var scaling", - - // AUDIO PARAMETERS - "audio":{ - "fft_size": 1024, // number of stft frequency levels. Size of the linear spectogram frame. - "win_length": 1024, // stft window length in ms. - "hop_length": 256, // stft window hop-lengh in ms. - "frame_length_ms": null, // stft window length in ms.If null, 'win_length' is used. - "frame_shift_ms": null, // stft window hop-lengh in ms. If null, 'hop_length' is used. - - // Audio processing parameters - "sample_rate": 22050, // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled. - "preemphasis": 0.0, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis. - "ref_level_db": 20, // reference level db, theoretically 20db is the sound of air. - "log_func": "np.log10", - "do_sound_norm": true, - - // Silence trimming - "do_trim_silence": false,// enable trimming of slience of audio as you load it. LJspeech (false), TWEB (false), Nancy (true) - "trim_db": 60, // threshold for timming silence. Set this according to your dataset. - - // MelSpectrogram parameters - "num_mels": 80, // size of the mel spec frame. - "mel_fmin": 50.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!! - "mel_fmax": 7600.0, // maximum freq level for mel-spec. Tune for dataset!! - "spec_gain": 1.0, // scaler value appplied after log transform of spectrogram. - - // Normalization parameters - "signal_norm": true, // normalize spec values. Mean-Var normalization if 'stats_path' is defined otherwise range normalization defined by the other params. - "min_level_db": -100, // lower bound for normalization - "symmetric_norm": true, // move normalization to range [-1, 1] - "max_norm": 4.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm] - "clip_norm": true, // clip normalized values into the range. 
- "stats_path": "/home/erogol/.local/share/tts/tts_models--en--ljspeech--speedy-speech-wn/scale_stats.npy" - }, - - // DISTRIBUTED TRAINING - "distributed":{ - "backend": "nccl", - "url": "tcp:\/\/localhost:54324" - }, - - // MODEL PARAMETERS - "use_pqmf": false, - - // LOSS PARAMETERS - "use_stft_loss": false, - "use_subband_stft_loss": false, - "use_mse_gan_loss": true, - "use_hinge_gan_loss": false, - "use_feat_match_loss": true, // use only with melgan discriminators - "use_l1_spec_loss": true, - - // loss weights - "stft_loss_weight": 0, - "subband_stft_loss_weight": 0, - "mse_G_loss_weight": 1, - "hinge_G_loss_weight": 0, - "feat_match_loss_weight": 10, - "l1_spec_loss_weight": 45, - - // multiscale stft loss parameters - // "stft_loss_params": { - // "n_ffts": [1024, 2048, 512], - // "hop_lengths": [120, 240, 50], - // "win_lengths": [600, 1200, 240] - // }, - - "l1_spec_loss_params": { - "use_mel": true, - "sample_rate": 22050, - "n_fft": 1024, - "hop_length": 256, - "win_length": 1024, - "n_mels": 80, - "mel_fmin": 0.0, - "mel_fmax": null - }, - - "target_loss": "avg_G_loss", // loss value to pick the best model to save after each epoch - - // DISCRIMINATOR - "discriminator_model": "hifigan_discriminator", - //"discriminator_model_params":{ - // "peroids": [2, 3, 5, 7, 11], - // "base_channels": 16, - // "max_channels":512, - // "downsample_factors":[4, 4, 4] - //}, - "steps_to_start_discriminator": 0, // steps required to start GAN trainining.1 - "diff_samples_for_G_and_D": false, // draw a new sample from the dataset for the D pass. - - // GENERATOR - "generator_model": "hifigan_generator", - "generator_model_params": { - "upsample_factors":[8,8,2,2], - "upsample_kernel_sizes": [16,16,4,4], - "upsample_initial_channel": 512, - "resblock_kernel_sizes": [3,7,11], - "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], - "resblock_type": "1" - }, - - // DATASET - "data_path": "/home/erogol/gdrive/Datasets/LJSpeech-1.1/wavs/", - "feature_path": null, - // "feature_path": "/home/erogol/gdrive/Datasets/non-binary-voice-files/tacotron-DCA/", - "seq_len": 8192, - "pad_short": 2000, - "conv_pad": 0, - "use_noise_augment": false, - "use_cache": true, - "reinit_layers": [], // give a list of layer names to restore from the given checkpoint. If not defined, it reloads all heuristically matching layers. - - // TRAINING - "batch_size": 16, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'. - - // VALIDATION - "run_eval": true, - "test_delay_epochs": 10, //Until attention is aligned, testing only wastes computation time. - "test_sentences_file": null, // set a file to load sentences to be used for testing. If it is null then we use default english sentences. - - // OPTIMIZER - "epochs": 10000, // total number of epochs to train. - "wd": 0.0, // Weight decay weight. - "gen_clip_grad": -1, // Generator gradient clipping threshold. Apply gradient clipping if > 0 - "disc_clip_grad": -1, // Discriminator gradient clipping threshold. - "lr_gen": 0.0002, // Initial learning rate. If Noam decay is active, maximum learning rate. 
- "lr_disc": 0.0002, - "optimizer": "AdamW", - "optimizer_params":{ - "betas": [0.8, 0.99], - "weight_decay": 0.0 - }, - "lr_scheduler_gen": "ExponentialLR", // one of the schedulers from https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate - "lr_scheduler_gen_params": { - "gamma": 0.999, - "last_epoch": -1 - }, - "lr_scheduler_disc": "ExponentialLR", // one of the schedulers from https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate - "lr_scheduler_disc_params": { - "gamma": 0.999, - "last_epoch": -1 - }, - - // TENSORBOARD and LOGGING - "print_step": 25, // Number of steps to log traning on console. - "print_eval": false, // If True, it prints loss values for each step in eval run. - "save_step": 25000, // Number of training steps expected to plot training stats on TB and save model checkpoints. - "checkpoint": true, // If true, it saves checkpoints per "save_step" - "tb_model_param_stats": true, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging. - - // DATA LOADING - "num_loader_workers": 8, // number of training data loader processes. Don't set it too big. 4-8 are good values. - "num_val_loader_workers": 4, // number of evaluation data loader processes. - "eval_split_size": 10, - - // PATHS - "output_path": "/home/erogol/gdrive/Trainings/LJSpeech/" -} - -