Merge branch 'dev-bos' into dev-tacotron2

2019-03-07 11:45:42 +01:00 · 2019-03-07 11:45:42 +01:00 · cf11b6c23c
parent 112fe0dc4d 772c859dff
commit cf11b6c23c
7 changed files with 42 additions and 36 deletions
--- a/.compute
+++ b/.compute
@ -3,5 +3,5 @@ ls ${SHARED_DIR}/data/keithito
 pip3 install https://download.pytorch.org/whl/cu100/torch-1.0.1.post2-cp36-cp36m-linux_x86_64.whl
 yes | apt-get install espeak 
 python3 setup.py develop
-# python3 distribute.py --config_path config_cluster.json  --data_path ${SHARED_DIR}/data/keithito/LJSpeech-1.1/  --restore_path ${USER_DIR}/best_model.pth.tar
+python3 distribute.py --config_path config_cluster.json  --data_path ${SHARED_DIR}/data/keithito/LJSpeech-1.1/  --restore_path ${USER_DIR}/best_model.pth.tar
-python3 distribute.py --config_path config_cluster.json  --data_path ${SHARED_DIR}/data/keithito/LJSpeech-1.1/
+# python3 distribute.py --config_path config_cluster.json  --data_path ${SHARED_DIR}/data/keithito/LJSpeech-1.1/
--- a/config.json
+++ b/config.json
@ -1,6 +1,6 @@
 {
-    "run_name": "queue",
+    "run_name": "bos",
-    "run_description": "Queue memory and change lower r incrementatlly",
+    "run_description": "bos character added to get away with the first char miss",
    "audio":{
        // Audio processing parameters
@ -29,7 +29,7 @@
        "url": "tcp:\/\/localhost:54321"
    },
-    "model": "Tacotron",    // one of the model in models/. For now "Tacotron" or "Tacotron2" are available models.  
+    "model": "Tacotron2",   // one of the model in models/    
    "grad_clip": 0.02,      // upper limit for gradients for clipping.
    "epochs": 1000,         // total number of epochs to train.
    "lr": 0.0001,            // Initial learning rate. If Noam decay is active, maximum learning rate.
@ -37,12 +37,11 @@
    "warmup_steps": 4000,   // Noam decay steps to increase the learning rate from 0 to "lr"
    "windowing": false,      // Enables attention windowing. Used only in eval mode.
    "memory_size": 5,       // TO BE IMPLEMENTED -- memory queue size used to queue network predictions to feed autoregressive connection. Useful if r < 5. 
    "batch_group_size": 3,
    "batch_size": 16,       // Batch size for training. Lower values than 32 might cause hard to learn attention.
    "eval_batch_size":16,   
    "r": 1,                 // Number of frames to predict for step.
-    "wd": 0.000005,         // Weight decay weight.
+    "wd": 0.000002,         // Weight decay weight.
    "checkpoint": true,     // If true, it saves checkpoints per "save_step"
    "save_step": 1000,      // Number of training steps expected to save training stats and checkpoints.
    "print_step": 10,       // Number of steps to log training on console.
--- a/config_cluster.json
+++ b/config_cluster.json
@ -1,9 +1,8 @@
 {
-    "model_name": "tts-master",
+    "run_name": "bos",
-    "model_description": "tts master cluster test",
+    "run_description": "bos character added to get away with the first char miss",
    "audio":{
        "audio_processor": "audio",     // to use dictate different audio processors, if available.
        // Audio processing parameters
        "num_mels": 80,         // size of the mel spec frame. 
        "num_freq": 1025,       // number of stft frequency levels. Size of the linear spectrogram frame.
@ -20,41 +19,48 @@
        "symmetric_norm": false, // move normalization to range [-1, 1]
        "max_norm": 1,          // scale normalization to range [-max_norm, max_norm] or [0, max_norm]
        "clip_norm": true,      // clip normalized values into the range.
-        "mel_fmin": null,         // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!!
+        "mel_fmin": 0.0,         // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!!
-        "mel_fmax": null,        // maximum freq level for mel-spec. Tune for dataset!!
+        "mel_fmax": 8000.0,        // maximum freq level for mel-spec. Tune for dataset!!
-        "do_trim_silence": true  // enable trimming of slience of audio as you load it. LJspeech (false), TWEB (false), Nancy (true)
+        "do_trim_silence": false  // enable trimming of silence of audio as you load it. LJspeech (false), TWEB (false), Nancy (true)
    },
-    "embedding_size": 256,  // Character embedding vector length. You don't need to change it in general.
+    "distributed":{
-    "text_cleaner": "phoneme_cleaners",
+        "backend": "nccl",
        "url": "tcp:\/\/localhost:54321"
    },
    "model": "Tacotron2",   // one of the model in models/    
    "grad_clip": 0.05,      // upper limit for gradients for clipping.
    "epochs": 1000,         // total number of epochs to train.
    "lr": 0.0001,            // Initial learning rate. If Noam decay is active, maximum learning rate.
    "lr_decay": false,      // if true, Noam learning rate decaying is applied through training.
    "loss_weight": 0.0,     // loss weight to emphasize lower frequencies. Lower frequencies are in general more important for speech signals.
    "warmup_steps": 4000,   // Noam decay steps to increase the learning rate from 0 to "lr"
    "windowing": false,      // Enables attention windowing. Used only in eval mode.
-    "memory_size": 5,       // memory queue size used to queue network predictions to feed autoregressive connection. Useful if r < 5. 
+    "memory_size": 5,       // TO BE IMPLEMENTED -- memory queue size used to queue network predictions to feed autoregressive connection. Useful if r < 5. 
-    "batch_size": 32,       // Batch size for training. Lower values than 32 might cause hard to learn attention.
+    "batch_size": 16,       // Batch size for training. Lower values than 32 might cause hard to learn attention.
-    "eval_batch_size":32,   
+    "eval_batch_size":16,   
-    "r": 5,                 // Number of frames to predict for step.
+    "r": 1,                 // Number of frames to predict for step.
-    "wd": 0.00001,         // Weight decay weight.
+    "wd": 0.000002,         // Weight decay weight.
    "checkpoint": true,     // If true, it saves checkpoints per "save_step"
-    "save_step": 5000,      // Number of training steps expected to save traning stats and checkpoints.
+    "save_step": 1000,      // Number of training steps expected to save training stats and checkpoints.
-    "print_step": 50,       // Number of steps to log traning on console.
+    "print_step": 10,       // Number of steps to log training on console.
    "tb_model_param_stats": false,     // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging. 
    "batch_group_size": 8,  //Number of batches to shuffle after bucketing.
    "run_eval": true,
    "test_delay_epochs": 100,  //Until attention is aligned, testing only wastes computation time.
    "data_path": "/media/erogol/data_ssd/Data/LJSpeech-1.1",  // DATASET-RELATED: can be overwritten from command argument
-    "meta_file_train": "prompts_train.data",      // DATASET-RELATED: metafile for training dataloader.
+    "meta_file_train": "metadata_train.csv",      // DATASET-RELATED: metafile for training dataloader.
-    "meta_file_val": "prompts_val.data",    // DATASET-RELATED: metafile for evaluation dataloader.
+    "meta_file_val": "metadata_val.csv",    // DATASET-RELATED: metafile for evaluation dataloader.
-    "dataset": "nancy",      // DATASET-RELATED: one of TTS.dataset.preprocessors depending on your target dataset. Use "tts_cache" for pre-computed dataset by extract_features.py
+    "dataset": "ljspeech",      // DATASET-RELATED: one of TTS.dataset.preprocessors depending on your target dataset. Use "tts_cache" for pre-computed dataset by extract_features.py
    "min_seq_len": 0,       // DATASET-RELATED: minimum text length to use in training
-    "max_seq_len": 300,     // DATASET-RELATED: maximum text length
+    "max_seq_len": 1000,     // DATASET-RELATED: maximum text length
-    "output_path": "models/",      // DATASET-RELATED: output path for all training outputs.
+    "output_path": "../keep/",      // DATASET-RELATED: output path for all training outputs.
    "num_loader_workers": 8,        // number of training data loader processes. Don't set it too big. 4-8 are good values.
    "num_val_loader_workers": 4,    // number of evaluation data loader processes.
-    "phoneme_cache_path": "phonemes_cache",  // phoneme computation is slow, therefore, it caches results in the given folder.
+    "phoneme_cache_path": "ljspeech_us_phonemes",  // phoneme computation is slow, therefore, it caches results in the given folder.
    "use_phonemes": true,           // use phonemes instead of raw characters. It is suggested for better pronunciation.
-    "phoneme_language": "en-us"     // depending on your target language, pick one from  https://github.com/bootphon/phonemizer#languages
+    "phoneme_language": "en-us",     // depending on your target language, pick one from  https://github.com/bootphon/phonemizer#languages
    "text_cleaner": "phoneme_cleaners"
 }
--- a/distribute.py
+++ b/distribute.py
@ -131,7 +131,7 @@ def main(args):
    Call train.py as a new process and pass command arguments
    """
    CONFIG = load_config(args.config_path)
-    OUT_PATH = create_experiment_folder(CONFIG.output_path, CONFIG.model_name,
+    OUT_PATH = create_experiment_folder(CONFIG.output_path, CONFIG.run_name,
                                        True)
    stdout_path = os.path.join(OUT_PATH, "process_stdout/")
--- a/train.py
+++ b/train.py
@ -425,7 +425,7 @@ def main(args):
        print(
            " > Model restored from step %d" % checkpoint['step'], flush=True)
        start_epoch = checkpoint['epoch']
-        best_loss = checkpoint['postnet_loss']
+        # best_loss = checkpoint['postnet_loss']
        args.restore_step = checkpoint['step']
    else:
        args.restore_step = 0
--- a/utils/text/init.py
+++ b/utils/text/init.py
@ -43,7 +43,7 @@ def phoneme_to_sequence(text, cleaner_names, language):
    '''
    TODO: This ignores punctuations
    '''
-    sequence = []
+    sequence = [_phonemes_to_id['^']]
    clean_text = _clean_text(text, cleaner_names)
    phonemes = text2phone(clean_text, language)
 #     print(phonemes.replace('|', ''))
@ -81,7 +81,7 @@ def text_to_sequence(text, cleaner_names):
        List of integers corresponding to the symbols in the text
    '''
    sequence = []
-
+    sequence = [_phonemes_to_id['^']]
    # Check for curly braces and treat their contents as ARPAbet:
    while len(text):
        m = _curly_re.match(text)
--- a/utils/text/symbols.py
+++ b/utils/text/symbols.py
@ -9,6 +9,7 @@ from utils.text import cmudict
 _pad = '_'
 _eos = '~'
 _bos = '^'
 _characters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!\'(),-.:;? '
 _punctuations = '!\'(),-.:;? '
 _phoneme_punctuations = '.!;:,?'
@ -24,8 +25,8 @@ _phonemes = sorted(list(set(_phonemes)))
 _arpabet = ['@' + s for s in _phonemes]
 # Export all symbols:
-symbols = [_pad, _eos] + list(_characters) + _arpabet
+symbols = [_pad, _eos, _bos] + list(_characters) + _arpabet
-phonemes = [_pad, _eos] + list(_phonemes) + list(_punctuations)
+phonemes = [_pad, _eos, _bos] + list(_phonemes) + list(_punctuations)
 if __name__ == '__main__':
    print(" > TTS symbols ")