From 5754116c19f8cabdaa0b55cc1d134d82301c3910 Mon Sep 17 00:00:00 2001
From: Eren Golge
Date: Wed, 6 Mar 2019 22:06:01 +0100
Subject: [PATCH 1/2] bos char added

---
 .compute               |  4 ++--
 config.json            |  9 ++++----
 config_cluster.json    | 52 +++++++++++++++++++++++-------------------
 distribute.py          |  2 +-
 train.py               |  2 +-
 utils/text/__init__.py |  4 ++--
 utils/text/symbols.py  |  5 ++--
 7 files changed, 42 insertions(+), 36 deletions(-)

diff --git a/.compute b/.compute
index 99e7fad2..8e2ec00e 100644
--- a/.compute
+++ b/.compute
@@ -3,5 +3,5 @@ ls ${SHARED_DIR}/data/keithito
 pip3 install https://download.pytorch.org/whl/cu100/torch-1.0.1.post2-cp36-cp36m-linux_x86_64.whl
 yes | apt-get install espeak
 python3 setup.py develop
-# python3 distribute.py --config_path config_cluster.json --data_path ${SHARED_DIR}/data/keithito/LJSpeech-1.1/ --restore_path ${USER_DIR}/best_model.pth.tar
-python3 distribute.py --config_path config_cluster.json --data_path ${SHARED_DIR}/data/keithito/LJSpeech-1.1/
+python3 distribute.py --config_path config_cluster.json --data_path ${SHARED_DIR}/data/keithito/LJSpeech-1.1/ --restore_path ${USER_DIR}/best_model.pth.tar
+# python3 distribute.py --config_path config_cluster.json --data_path ${SHARED_DIR}/data/keithito/LJSpeech-1.1/
diff --git a/config.json b/config.json
index b4037a1d..138d3436 100644
--- a/config.json
+++ b/config.json
@@ -1,6 +1,6 @@
 {
-    "run_name": "queue",
-    "run_description": "Queue memory and change lower r incrementatlly",
+    "run_name": "bos",
+    "run_description": "bos character added to work around the first char miss",
 
     "audio":{
         // Audio processing parameters
@@ -29,7 +29,7 @@
         "url": "tcp:\/\/localhost:54321"
     },
 
-    "model": "Tacotron",  // one of the model in models/
+    "model": "Tacotron2",  // one of the models in models/
     "grad_clip": 0.02,  // upper limit for gradients for clipping.
     "epochs": 1000,  // total number of epochs to train.
     "lr": 0.0001,  // Initial learning rate. If Noam decay is active, maximum learning rate.
@@ -37,12 +37,11 @@
     "warmup_steps": 4000,  // Noam decay steps to increase the learning rate from 0 to "lr"
     "windowing": false,  // Enables attention windowing. Used only in eval mode.
     "memory_size": 5,  // TO BE IMPLEMENTED -- memory queue size used to queue network predictions to feed autoregressive connection. Useful if r < 5.
-    "batch_group_size": 3,
 
     "batch_size": 16,  // Batch size for training. Lower values than 32 might cause hard to learn attention.
     "eval_batch_size":16,
     "r": 1,  // Number of frames to predict for step.
-    "wd": 0.000005,  // Weight decay weight.
+    "wd": 0.000002,  // Weight decay weight.
     "checkpoint": true,  // If true, it saves checkpoints per "save_step"
     "save_step": 1000,  // Number of training steps expected to save training stats and checkpoints.
     "print_step": 10,  // Number of steps to log training on console.
diff --git a/config_cluster.json b/config_cluster.json
index efe53c8a..818036d7 100644
--- a/config_cluster.json
+++ b/config_cluster.json
@@ -1,9 +1,8 @@
 {
-    "model_name": "tts-master",
-    "model_description": "tts master cluster test",
+    "run_name": "bos",
+    "run_description": "bos character added to work around the first char miss",
 
     "audio":{
-        "audio_processor": "audio", // to use dictate different audio processors, if available.
         // Audio processing parameters
         "num_mels": 80,  // size of the mel spec frame.
         "num_freq": 1025,  // number of stft frequency levels. Size of the linear spectrogram frame.
@@ -20,41 +19,48 @@
     "symmetric_norm": false,  // move normalization to range [-1, 1]
     "max_norm": 1,  // scale normalization to range [-max_norm, max_norm] or [0, max_norm]
     "clip_norm": true,  // clip normalized values into the range.
-    "mel_fmin": null,  // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!!
-    "mel_fmax": null,  // maximum freq level for mel-spec. Tune for dataset!!
-    "do_trim_silence": true  // enable trimming of slience of audio as you load it. LJspeech (false), TWEB (false), Nancy (true)
+    "mel_fmin": 0.0,  // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!!
+    "mel_fmax": 8000.0,  // maximum freq level for mel-spec. Tune for dataset!!
+    "do_trim_silence": false  // enable trimming of silence of audio as you load it. LJspeech (false), TWEB (false), Nancy (true)
     },
-    "embedding_size": 256,  // Character embedding vector length. You don't need to change it in general.
-    "text_cleaner": "phoneme_cleaners",
+    "distributed":{
+        "backend": "nccl",
+        "url": "tcp:\/\/localhost:54321"
+    },
+
+    "model": "Tacotron2",  // one of the models in models/
+    "grad_clip": 0.05,  // upper limit for gradients for clipping.
     "epochs": 1000,  // total number of epochs to train.
     "lr": 0.0001,  // Initial learning rate. If Noam decay is active, maximum learning rate.
     "lr_decay": false,  // if true, Noam learning rate decaying is applied through training.
-    "loss_weight": 0.0,  // loss weight to emphasize lower frequencies. Lower frequencies are in general more important for speech signals.
     "warmup_steps": 4000,  // Noam decay steps to increase the learning rate from 0 to "lr"
     "windowing": false,  // Enables attention windowing. Used only in eval mode.
-    "memory_size": 5,  // memory queue size used to queue network predictions to feed autoregressive connection. Useful if r < 5.
+    "memory_size": 5,  // TO BE IMPLEMENTED -- memory queue size used to queue network predictions to feed autoregressive connection. Useful if r < 5.
 
-    "batch_size": 32,  // Batch size for training. Lower values than 32 might cause hard to learn attention.
-    "eval_batch_size":32,
-    "r": 5,  // Number of frames to predict for step.
-    "wd": 0.00001,  // Weight decay weight.
+    "batch_size": 16,  // Batch size for training. Lower values than 32 might cause hard to learn attention.
+    "eval_batch_size":16,
+    "r": 1,  // Number of frames to predict for step.
+    "wd": 0.000002,  // Weight decay weight.
     "checkpoint": true,  // If true, it saves checkpoints per "save_step"
-    "save_step": 5000,  // Number of training steps expected to save traning stats and checkpoints.
-    "print_step": 50,  // Number of steps to log traning on console.
+    "save_step": 1000,  // Number of training steps expected to save training stats and checkpoints.
+    "print_step": 10,  // Number of steps to log training on console.
     "tb_model_param_stats": false,  // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging.
+    "batch_group_size": 8,  // Number of batches to shuffle after bucketing.
     "run_eval": true,
+    "test_delay_epochs": 100,  // Until attention is aligned, testing only wastes computation time.
     "data_path": "/media/erogol/data_ssd/Data/LJSpeech-1.1",  // DATASET-RELATED: can be overwritten from command argument
-    "meta_file_train": "prompts_train.data",  // DATASET-RELATED: metafile for training dataloader.
-    "meta_file_val": "prompts_val.data",  // DATASET-RELATED: metafile for evaluation dataloader.
- "dataset": "nancy", // DATASET-RELATED: one of TTS.dataset.preprocessors depending on your target dataset. Use "tts_cache" for pre-computed dataset by extract_features.py + "meta_file_train": "metadata_train.csv", // DATASET-RELATED: metafile for training dataloader. + "meta_file_val": "metadata_val.csv", // DATASET-RELATED: metafile for evaluation dataloader. + "dataset": "ljspeech", // DATASET-RELATED: one of TTS.dataset.preprocessors depending on your target dataset. Use "tts_cache" for pre-computed dataset by extract_features.py "min_seq_len": 0, // DATASET-RELATED: minimum text length to use in training - "max_seq_len": 300, // DATASET-RELATED: maximum text length - "output_path": "models/", // DATASET-RELATED: output path for all training outputs. + "max_seq_len": 1000, // DATASET-RELATED: maximum text length + "output_path": "../keep/", // DATASET-RELATED: output path for all training outputs. "num_loader_workers": 8, // number of training data loader processes. Don't set it too big. 4-8 are good values. "num_val_loader_workers": 4, // number of evaluation data loader processes. - "phoneme_cache_path": "phonemes_cache", // phoneme computation is slow, therefore, it caches results in the given folder. + "phoneme_cache_path": "ljspeech_us_phonemes", // phoneme computation is slow, therefore, it caches results in the given folder. "use_phonemes": true, // use phonemes instead of raw characters. It is suggested for better pronounciation. - "phoneme_language": "en-us" // depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages + "phoneme_language": "en-us", // depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages + "text_cleaner": "phoneme_cleaners" } diff --git a/distribute.py b/distribute.py index f4538cfd..c2f786fe 100644 --- a/distribute.py +++ b/distribute.py @@ -131,7 +131,7 @@ def main(args): Call train.py as a new process and pass command arguments """ CONFIG = load_config(args.config_path) - OUT_PATH = create_experiment_folder(CONFIG.output_path, CONFIG.model_name, + OUT_PATH = create_experiment_folder(CONFIG.output_path, CONFIG.run_name, True) stdout_path = os.path.join(OUT_PATH, "process_stdout/") diff --git a/train.py b/train.py index a0ee76c1..5aed4073 100644 --- a/train.py +++ b/train.py @@ -425,7 +425,7 @@ def main(args): print( " > Model restored from step %d" % checkpoint['step'], flush=True) start_epoch = checkpoint['epoch'] - best_loss = checkpoint['postnet_loss'] + # best_loss = checkpoint['postnet_loss'] args.restore_step = checkpoint['step'] else: args.restore_step = 0 diff --git a/utils/text/__init__.py b/utils/text/__init__.py index 76993d50..c66bce91 100644 --- a/utils/text/__init__.py +++ b/utils/text/__init__.py @@ -43,7 +43,7 @@ def phoneme_to_sequence(text, cleaner_names, language): ''' TODO: This ignores punctuations ''' - sequence = [] + sequence = [_phonemes_to_id['^']] clean_text = _clean_text(text, cleaner_names) phonemes = text2phone(clean_text, language) # print(phonemes.replace('|', '')) @@ -81,7 +81,7 @@ def text_to_sequence(text, cleaner_names): List of integers corresponding to the symbols in the text ''' sequence = [] - + sequence = [_phonemes_to_id['^']] # Check for curly braces and treat their contents as ARPAbet: while len(text): m = _curly_re.match(text) diff --git a/utils/text/symbols.py b/utils/text/symbols.py index a25f7c99..5fc20a5f 100644 --- a/utils/text/symbols.py +++ b/utils/text/symbols.py @@ -9,6 +9,7 @@ from utils.text import cmudict _pad = '_' 
 _eos = '~'
+_bos = '^'
 _characters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!\'(),-.:;? '
 _punctuations = '!\'(),-.:;? '
 _phoneme_punctuations = '.!;:,?'
@@ -24,8 +25,8 @@ _phonemes = sorted(list(set(_phonemes)))
 _arpabet = ['@' + s for s in _phonemes]
 
 # Export all symbols:
-symbols = [_pad, _eos] + list(_characters) + _arpabet
-phonemes = [_pad, _eos] + list(_phonemes) + list(_punctuations)
+symbols = [_pad, _eos, _bos] + list(_characters) + _arpabet
+phonemes = [_pad, _eos, _bos] + list(_phonemes) + list(_punctuations)
 
 if __name__ == '__main__':
     print(" > TTS symbols ")

From 772c859dffa2fe60f0a4d2fb7778be5344646e84 Mon Sep 17 00:00:00 2001
From: Eren Golge
Date: Thu, 7 Mar 2019 11:44:17 +0100
Subject: [PATCH 2/2] Change stop condition

---
 layers/tacotron2.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/layers/tacotron2.py b/layers/tacotron2.py
index 296ea7ec..e743186f 100644
--- a/layers/tacotron2.py
+++ b/layers/tacotron2.py
@@ -355,7 +355,7 @@ class Decoder(nn.Module):
             alignments += [alignment]
 
             stop_flags[0] = stop_flags[0] or gate_output > 0.5
-            stop_flags[1] = stop_flags[1] or alignment[0, -3:].sum() > 0.5
+            stop_flags[1] = stop_flags[1] or alignment[0, -2:].sum() > 0.5
             if all(stop_flags):
                 break
             elif len(outputs) == self.max_decoder_steps:
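
Note on the bos change in PATCH 1/2: '^' sits at index 2 of both `symbols` and `phonemes`, so every sequence built by `text_to_sequence` / `phoneme_to_sequence` now begins with the bos id taken from the matching lookup table (`_symbol_to_id` for characters, `_phonemes_to_id` for phonemes), giving attention an explicit first step instead of skipping the first character. A minimal, runnable sketch of the resulting behavior; the abbreviated symbol table and the helper `to_sequence` are illustrative stand-ins, not code from this patch:

# Sketch only: toy version of the symbol tables in utils/text/symbols.py
# with the character set abbreviated to 'abc'.
_pad, _eos, _bos = '_', '~', '^'
symbols = [_pad, _eos, _bos] + list('abc')
_symbol_to_id = {s: i for i, s in enumerate(symbols)}

def to_sequence(text):
    # bos id goes first, then one id per known character
    sequence = [_symbol_to_id[_bos]]
    sequence += [_symbol_to_id[c] for c in text if c in _symbol_to_id]
    return sequence

print(to_sequence('abc'))  # [2, 3, 4, 5] -- the leading 2 is the bos id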
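
Note on `batch_group_size` (config change in PATCH 1/2): the comment says batches are shuffled after bucketing, i.e. samples stay length-homogeneous per batch while epoch order still varies. The dataloader itself is not part of this patch, so the sketch below is only an assumption of those semantics; `bucket_shuffle` and the toy texts are hypothetical:

import random

def bucket_shuffle(items, batch_size, batch_group_size):
    # Sort by length, then shuffle inside each group of
    # batch_group_size * batch_size items, so consecutive batches
    # keep similar lengths but their contents differ per epoch.
    items = sorted(items, key=len)
    group = batch_size * batch_group_size
    out = []
    for i in range(0, len(items), group):
        chunk = items[i:i + group]
        random.shuffle(chunk)
        out.extend(chunk)
    return out

texts = ['hi', 'a', 'hello', 'long sentence here', 'ok now']
print(bucket_shuffle(texts, batch_size=2, batch_group_size=1))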
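
Note on PATCH 2/2: inference now stops only once both flags have fired: the stop gate exceeded 0.5, and the attention placed more than half of its mass on the last two (previously three) encoder steps, meaning the model has read to the end of the input. Narrowing the window makes premature stops on short attention tails less likely. A runnable sketch of the same two-flag logic, with `should_stop` and the toy tensors as stand-ins for the decoder internals:

import torch

def should_stop(stop_flags, gate_output, alignment):
    # flag 0: the stop token fired; flag 1: attention reached the input's end
    stop_flags[0] = stop_flags[0] or gate_output > 0.5
    stop_flags[1] = stop_flags[1] or alignment[0, -2:].sum() > 0.5
    return all(stop_flags)

flags = [False, False]
gate = 0.9                                      # gate says "stop"
align = torch.tensor([[0.05, 0.05, 0.3, 0.6]])  # mass on the final two steps
print(should_stop(flags, gate, align))          # True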