From 228ada04b5e551939da8019d7d43a32dbbd7ca36 Mon Sep 17 00:00:00 2001
From: erogol
Date: Tue, 5 Jan 2021 14:21:12 +0100
Subject: [PATCH] update glow-tts ljspeech config

---
 ..._tts_tdsep.json => glow_tts_ljspeech.json} | 20 ++++++++++++++++----
 1 file changed, 16 insertions(+), 4 deletions(-)
 rename TTS/tts/configs/{glow_tts_tdsep.json => glow_tts_ljspeech.json} (90%)

diff --git a/TTS/tts/configs/glow_tts_tdsep.json b/TTS/tts/configs/glow_tts_ljspeech.json
similarity index 90%
rename from TTS/tts/configs/glow_tts_tdsep.json
rename to TTS/tts/configs/glow_tts_ljspeech.json
index 72eb3da7..636d9313 100644
--- a/TTS/tts/configs/glow_tts_tdsep.json
+++ b/TTS/tts/configs/glow_tts_ljspeech.json
@@ -31,12 +31,12 @@
     "spec_gain": 1.0,       // scaler value applied after log transform of spectrogram.
 
     // Normalization parameters
-    "signal_norm": true,    // normalize spec values. Mean-Var normalization if 'stats_path' is defined, otherwise range normalization defined by the other params.
+    "signal_norm": false,   // normalize spec values. Mean-Var normalization if 'stats_path' is defined, otherwise range normalization defined by the other params.
     "min_level_db": -100,   // lower bound for normalization
     "symmetric_norm": true, // move normalization to range [-1, 1]
     "max_norm": 1.0,        // scale normalization to range [-max_norm, max_norm] or [0, max_norm]
     "clip_norm": true,      // clip normalized values into the range.
-    "stats_path": "/home/erogol/Data/LJSpeech-1.1/scale_stats.npy" // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based normalization is used and other normalization params are ignored
+    "stats_path": null // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based normalization is used and other normalization params are ignored
 },
 
 // VOCABULARY PARAMETERS
@@ -63,6 +63,19 @@
 
     // MODEL PARAMETERS //
     "use_mas": false,       // use Monotonic Alignment Search if true. Otherwise use pre-computed attention alignments.
+    "hidden_channels_encoder": 192,
+    "hidden_channels_decoder": 192,
+    "hidden_channels_duration_predictor": 256,
+    "use_encoder_prenet": true,
+    "encoder_type": "rel_pos_transformer",
+    "encoder_params": {
+        "kernel_size": 3,
+        "dropout_p": 0.1,
+        "num_layers": 6,
+        "num_heads": 2,
+        "hidden_channels_ffn": 768,
+        "input_length": null
+    },
 
     // TRAINING
     "batch_size": 32,       // Batch size for training. Values lower than 32 might make attention hard to learn. It is overwritten by 'gradual_training'.
@@ -86,8 +99,6 @@
     "warmup_steps": 4000,   // Noam decay steps to increase the learning rate from 0 to "lr"
     "seq_len_norm": false,  // Normalize each sample loss with its length to alleviate imbalanced datasets. Use it if your dataset is small or has a skewed distribution of sequence lengths.
 
-    "encoder_type": "residual_conv_bn",
-
     // TENSORBOARD and LOGGING
     "print_step": 25,       // Number of steps to log training on console.
     "tb_plot_step": 100,    // Number of steps to plot TB training figures.
@@ -105,6 +116,7 @@
     "min_seq_len": 3,       // DATASET-RELATED: minimum text length to use in training
     "max_seq_len": 500,     // DATASET-RELATED: maximum text length
     "compute_f0": false,    // compute f0 values in data-loader
+    "use_noise_augment": true,  // add random noise to the audio signal for augmentation during training.
     "compute_input_seq_cache": true,
 
     // PATHS
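
Note on the normalization change above: with "signal_norm": false and "stats_path": null, spec normalization is disabled entirely rather than driven by a mean-variance stats file. To restore the previous behavior, regenerate the stats file with the repo's compute_statistics.py (e.g. python TTS/bin/compute_statistics.py --config_path TTS/tts/configs/glow_tts_ljspeech.json --out_path scale_stats.npy; the flag names here are assumptions, check the script's --help), then point "stats_path" at the resulting .npy and set "signal_norm" back to true. As the config comment warns, do not use a stats file with a multi-speaker model.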
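
A quick sanity check that the renamed config still parses and carries the new encoder block — a minimal Python sketch, assuming load_config lives at TTS.utils.io in this revision and strips the // comments before JSON parsing, as it does for the other configs:

    # minimal sketch; helper location and config path assumed from the repo layout at this commit
    from TTS.utils.io import load_config

    c = load_config("TTS/tts/configs/glow_tts_ljspeech.json")
    assert c["encoder_type"] == "rel_pos_transformer"
    assert c["encoder_params"]["num_layers"] == 6
    assert c["audio"]["signal_norm"] is False
    assert c["audio"]["stats_path"] is None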
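
The new "use_noise_augment" flag adds low-amplitude random noise to the waveform during training, which acts as a mild regularizer. Conceptually it amounts to something like the sketch below (a hypothetical stand-in, not the loader's exact code; the amplitude constant is an assumption):

    import numpy as np

    def noise_augment(wav: np.ndarray, max_amp: float = 1.0 / 32768.0) -> np.ndarray:
        # add uniform noise in [-max_amp, max_amp], applied per training sample
        noise = (2.0 * np.random.rand(*wav.shape) - 1.0) * max_amp
        return wav + noise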