config.json update to set model architecture and tacotron2 training parameters

2019-03-06 13:09:21 +01:00 · 2019-03-06 13:09:21 +01:00 · 007bef5c35
parent 5133db82ad
commit 007bef5c35
1 changed files with 18 additions and 16 deletions
--- a/config.json
+++ b/config.json
@@ -1,6 +1,6 @@
 {
-    "model_name": "queue",
-    "model_description": "Queue memory and change lower r incrementatlly",
+    "run_name": "queue",
+    "run_description": "Queue memory and change lower r incrementatlly",

    "audio":{
        // Audio processing parameters
@@ -19,9 +19,9 @@
        "symmetric_norm": false, // move normalization to range [-1, 1]
        "max_norm": 1,          // scale normalization to range [-max_norm, max_norm] or [0, max_norm]
        "clip_norm": true,      // clip normalized values into the range.
-        "mel_fmin": null,         // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!!
-        "mel_fmax": null,        // maximum freq level for mel-spec. Tune for dataset!!
-        "do_trim_silence": true  // enable trimming of slience of audio as you load it. LJspeech (false), TWEB (false), Nancy (true)
+        "mel_fmin": 0.0,         // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!!
+        "mel_fmax": 8000.0,        // maximum freq level for mel-spec. Tune for dataset!!
+        "do_trim_silence": false  // enable trimming of silence from audio as it is loaded. LJspeech (false), TWEB (false), Nancy (true)
    },

    "distributed":{
@@ -29,22 +29,23 @@
        "url": "tcp:\/\/localhost:54321"
    },

-    "text_cleaner": "phoneme_cleaners",
+    "model": "Tacotron",   // one of the models in models/
+    "grad_clip": 0.02,      // upper limit for gradients for clipping.
    "epochs": 1000,         // total number of epochs to train.
    "lr": 0.0001,            // Initial learning rate. If Noam decay is active, maximum learning rate.
    "lr_decay": false,      // if true, Noam learning rate decaying is applied through training.
-    "loss_weight": 0.0,     // loss weight to emphasize lower frequencies. Lower frequencies are in general more important for speech signals.
    "warmup_steps": 4000,   // Noam decay steps to increase the learning rate from 0 to "lr"
    "windowing": false,      // Enables attention windowing. Used only in eval mode.
-    "memory_size": 5,       // memory queue size used to queue network predictions to feed autoregressive connection. Useful if r < 5. 
+    "memory_size": 5,       // TO BE IMPLEMENTED -- memory queue size used to queue network predictions to feed autoregressive connection. Useful if r < 5. 
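+    "batch_group_size": 3,  // NOTE(review): "batch_group_size" is also defined later in this file with value 8; duplicate keys are ambiguous (most parsers keep the last value) -- confirm which one is intended.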

-    "batch_size": 32,       // Batch size for training. Lower values than 32 might cause hard to learn attention.
-    "eval_batch_size":32,   
-    "r": 5,                 // Number of frames to predict for step.
-    "wd": 0.00001,          // Weight decay weight.
+    "batch_size": 16,       // Batch size for training. Lower values than 32 might cause hard to learn attention.
+    "eval_batch_size":16,   
+    "r": 1,                 // Number of frames to predict for step.
+    "wd": 0.000005,         // Weight decay weight.
    "checkpoint": true,     // If true, it saves checkpoints per "save_step"
-    "save_step": 5000,      // Number of training steps expected to save traning stats and checkpoints.
-    "print_step": 50,       // Number of steps to log traning on console.
+    "save_step": 1000,      // Number of training steps between saving training stats and checkpoints.
+    "print_step": 10,       // Number of steps between training log lines on the console.
    "tb_model_param_stats": false,     // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging. 
    "batch_group_size": 8,  //Number of batches to shuffle after bucketing.

@@ -55,11 +56,12 @@
    "meta_file_val": "metadata_val.csv",    // DATASET-RELATED: metafile for evaluation dataloader.
    "dataset": "ljspeech",      // DATASET-RELATED: one of TTS.dataset.preprocessors depending on your target dataset. Use "tts_cache" for pre-computed dataset by extract_features.py
    "min_seq_len": 0,       // DATASET-RELATED: minimum text length to use in training
-    "max_seq_len": 300,     // DATASET-RELATED: maximum text length
+    "max_seq_len": 1000,     // DATASET-RELATED: maximum text length
    "output_path": "/media/erogol/data_ssd/Data/models/ljspeech_models/",      // DATASET-RELATED: output path for all training outputs.
    "num_loader_workers": 8,        // number of training data loader processes. Don't set it too big. 4-8 are good values.
    "num_val_loader_workers": 4,    // number of evaluation data loader processes.
    "phoneme_cache_path": "ljspeech_us_phonemes",  // phoneme computation is slow, therefore, it caches results in the given folder.
    "use_phonemes": true,           // use phonemes instead of raw characters. It is suggested for better pronounciation.
-    "phoneme_language": "en-us"     // depending on your target language, pick one from  https://github.com/bootphon/phonemizer#languages
+    "phoneme_language": "en-us",     // depending on your target language, pick one from  https://github.com/bootphon/phonemizer#languages
+    "text_cleaner": "phoneme_cleaners"
 }