From 4a5b1d4ac235793393e6ce60c266d70182d75c83 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Tue, 6 Apr 2021 11:01:21 +0200 Subject: [PATCH] update hifigan config --- TTS/vocoder/configs/modified_hifigan.json | 39 ++++++++++++++++------- 1 file changed, 27 insertions(+), 12 deletions(-) diff --git a/TTS/vocoder/configs/modified_hifigan.json b/TTS/vocoder/configs/modified_hifigan.json index c50a575b..e945635a 100644 --- a/TTS/vocoder/configs/modified_hifigan.json +++ b/TTS/vocoder/configs/modified_hifigan.json @@ -11,12 +11,13 @@ "frame_shift_ms": null, // stft window hop-lengh in ms. If null, 'hop_length' is used. // Audio processing parameters - "sample_rate": 22050, // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled. + "sample_rate": 16000, // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled. "preemphasis": 0.0, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis. - "ref_level_db": 0, // reference level db, theoretically 20db is the sound of air. + "ref_level_db": 20, // reference level db, theoretically 20db is the sound of air. + "log_func": "np.log", // Silence trimming - "do_trim_silence": true,// enable trimming of slience of audio as you load it. LJspeech (false), TWEB (false), Nancy (true) + "do_trim_silence": false,// enable trimming of silence of audio as you load it. LJspeech (false), TWEB (false), Nancy (true) "trim_db": 60, // threshold for timming silence. Set this according to your dataset. // MelSpectrogram parameters @@ -26,7 +27,7 @@ "spec_gain": 1.0, // scaler value appplied after log transform of spectrogram. // Normalization parameters - "signal_norm": true, // normalize spec values. Mean-Var normalization if 'stats_path' is defined otherwise range normalization defined by the other params. + "signal_norm": false, // normalize spec values. 
Mean-Var normalization if 'stats_path' is defined otherwise range normalization defined by the other params. "min_level_db": -100, // lower bound for normalization "symmetric_norm": true, // move normalization to range [-1, 1] "max_norm": 4.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm] @@ -44,24 +45,37 @@ "use_pqmf": false, // LOSS PARAMETERS - "use_stft_loss": true, + "use_stft_loss": false, "use_subband_stft_loss": false, "use_mse_gan_loss": true, "use_hinge_gan_loss": false, "use_feat_match_loss": true, // use only with melgan discriminators + "use_l1_spec_loss": true, // loss weights - "stft_loss_weight": 45, + "stft_loss_weight": 0, "subband_stft_loss_weight": 0, "mse_G_loss_weight": 1, "hinge_G_loss_weight": 0, "feat_match_loss_weight": 10, + "l1_spec_loss_weight": 45, // multiscale stft loss parameters - "stft_loss_params": { - "n_ffts": [1024, 2048, 512], - "hop_lengths": [120, 240, 50], - "win_lengths": [600, 1200, 240] + // "stft_loss_params": { + // "n_ffts": [1024, 2048, 512], + // "hop_lengths": [120, 240, 50], + // "win_lengths": [600, 1200, 240] + // }, + + "l1_spec_loss_params": { + "use_mel": true, + "sample_rate": 16000, + "n_fft": 1024, + "hop_length": 256, + "win_length": 1024, + "n_mels": 80, + "mel_fmin": 0.0, + "mel_fmax": null }, "target_loss": "avg_G_loss", // loss value to pick the best model to save after each epoch @@ -89,8 +103,9 @@ // DATASET "data_path": "/home/erogol/gdrive/Datasets/non-binary-voice-files/vo_voice_quality_transformation/", - "feature_path": "/home/erogol/gdrive/Datasets/non-binary-voice-files/tacotron-DCA/", - "seq_len": 16384, + "feature_path": null, + // "feature_path": "/home/erogol/gdrive/Datasets/non-binary-voice-files/tacotron-DCA/", + "seq_len": 8192, "pad_short": 2000, "conv_pad": 0, "use_noise_augment": false,