config update for mean-var scaling

2020-03-17 12:44:18 +01:00 · 2020-03-17 12:44:18 +01:00 · 0ee1dd54a3
parent 069c8e4315
commit 0ee1dd54a3
1 changed files with 35 additions and 25 deletions
--- a/config.json
+++ b/config.json
@ -1,45 +1,55 @@
 {
-    "model": "Tacotron2",          // one of the model in models/  
+    "model": "Tacotron2",   
    "run_name": "ljspeech",
    "run_description": "tacotron2 with guided attention and -1 1 normalization and no preemphasis",

    // AUDIO PARAMETERS
    "audio":{
+        // stft parameters
+        "num_freq": 513,         // number of stft frequency levels. Size of the linear spectrogram frame.
+        "win_length": 1024,      // stft window length in samples.
+        "hop_length": 256,       // stft window hop-length in samples.
+        "frame_length_ms": null, // stft window length in ms. If null, 'win_length' is used.
+        "frame_shift_ms": null,  // stft window hop-length in ms. If null, 'hop_length' is used.
+        
        // Audio processing parameters
-        "num_mels": 80,         // size of the mel spec frame. 
-        "num_freq": 513,       // number of stft frequency levels. Size of the linear spectogram frame.
        "sample_rate": 22050,   // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled.
-        "win_length": 1024,     // stft window length in ms.
-        "hop_length": 256,      // stft window hop-lengh in ms.
-        "preemphasis": 0.0,    // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis.
-        "frame_length_ms": null,  // stft window length in ms.If null, 'win_length' is used.
-        "frame_shift_ms": null,   // stft window hop-lengh in ms. If null, 'hop_length' is used.
-        "min_level_db": -100,   // normalization range
+        "preemphasis": 0.0,     // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no pre-emphasis.
        "ref_level_db": 20,     // reference level db, theoretically 20db is the sound of air.
+        
+        // Silence trimming
+        "do_trim_silence": true,// enable trimming of silence from audio as you load it. LJspeech (false), TWEB (false), Nancy (true)
+        "trim_db": 60,          // threshold for trimming silence. Set this according to your dataset.
+
+        // Griffin-Lim
        "power": 1.5,           // value to sharpen wav signals after GL algorithm.
        "griffin_lim_iters": 60,// #griffin-lim iterations. 30-60 is a good range. Larger the value, slower the generation.
+
+        // MelSpectrogram parameters
+        "num_mels": 80,         // size of the mel spec frame. 
+        "mel_fmin": 0.0,        // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!!
+        "mel_fmax": 8000.0,     // maximum freq level for mel-spec. Tune for dataset!!
+
        // Normalization parameters
-        "signal_norm": true,    // normalize the spec values in range [0, 1]
+        "signal_norm": true,    // normalize spec values. Mean-Var normalization if 'stats_path' is defined, otherwise range normalization defined by the other params.
+        "min_level_db": -100,   // lower bound for normalization
        "symmetric_norm": true, // move normalization to range [-1, 1]
-        "max_norm": 1.0,          // scale normalization to range [-max_norm, max_norm] or [0, max_norm]
+        "max_norm": 1.0,        // scale normalization to range [-max_norm, max_norm] or [0, max_norm]
        "clip_norm": true,      // clip normalized values into the range.
-        "mel_fmin": 0.0,         // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!!
-        "mel_fmax": 8000.0,        // maximum freq level for mel-spec. Tune for dataset!!
-        "do_trim_silence": true,  // enable trimming of slience of audio as you load it. LJspeech (false), TWEB (false), Nancy (true)
-        "trim_db": 60          // threshold for timming silence. Set this according to your dataset.
+        "stats_path": "/home/erogol/Data/LJSpeech-1.1/scale_stats.npy"    // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based normalization is used and other normalization params are ignored.
    },

    // VOCABULARY PARAMETERS
    // if custom character set is not defined,
    // default set in symbols.py is used
-    "characters":{
-        "pad": "_",
-        "eos": "~",
-        "bos": "^",
-        "characters": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!'(),-.:;? ",
-        "punctuations":"!'(),-.:;? ",
-        "phonemes":"iyɨʉɯuɪʏʊeøɘəɵɤoɛœɜɞʌɔæɐaɶɑɒᵻʘɓǀɗǃʄǂɠǁʛpbtdʈɖcɟkɡqɢʔɴŋɲɳnɱmʙrʀⱱɾɽɸβfvθðszʃʒʂʐçʝxɣχʁħʕhɦɬɮʋɹɻjɰlɭʎʟˈˌːˑʍwɥʜʢʡɕʑɺɧɚ˞ɫ"
-    },
+    // "characters":{
+    //     "pad": "_",
+    //     "eos": "~",
+    //     "bos": "^",
+    //     "characters": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!'(),-.:;? ",
+    //     "punctuations":"!'(),-.:;? ",
+    //     "phonemes":"iyɨʉɯuɪʏʊeøɘəɵɤoɛœɜɞʌɔæɐaɶɑɒᵻʘɓǀɗǃʄǂɠǁʛpbtdʈɖcɟkɡqɢʔɴŋɲɳnɱmʙrʀⱱɾɽɸβfvθðszʃʒʂʐçʝxɣχʁħʕhɦɬɮʋɹɻjɰlɭʎʟˈˌːˑʍwɥʜʢʡɕʑɺɧɚ˞ɫ"
+    // },
    
    // DISTRIBUTED TRAINING
    "distributed":{
@ -107,7 +117,7 @@
    "max_seq_len": 153,     // DATASET-RELATED: maximum text length

    // PATHS
-    "output_path": "/data4/rw/home/Trainings/",
+    "output_path": "/home/erogol/Models/LJSpeech/",
 
    // PHONEMES
    "phoneme_cache_path": "mozilla_us_phonemes_3",  // phoneme computation is slow, therefore, it caches results in the given folder.
@ -124,7 +134,7 @@
        [
            {
                "name": "ljspeech",
-                "path": "/root/LJSpeech-1.1/",
+                "path": "/home/erogol/Data/LJSpeech-1.1/",
                "meta_file_train": "metadata.csv",
                "meta_file_val": null
            }