From 9915220539af1c58dac2d1a93b25caeb8d898751 Mon Sep 17 00:00:00 2001
From: Eren Golge <egolge@mozilla.com>
Date: Thu, 18 Apr 2019 17:34:54 +0200
Subject: [PATCH 1/7] config update

---
 .compute            | 10 +++++++---
 config_cluster.json | 18 +++++++++---------
 2 files changed, 16 insertions(+), 12 deletions(-)

diff --git a/.compute b/.compute
index c559bcab..3e21a5bf 100644
--- a/.compute
+++ b/.compute
@@ -1,7 +1,11 @@
 #!/bin/bash
-ls ${SHARED_DIR}/data/
+# ls ${USER_DIR}/MozillaDataset/Mozilla/batch1/wavs_no_processing
+yes | apt-get install sox
+soxi /data/rw/home/MozillaDataset/Mozilla/batch18/wavs_no_processing/18_167.wav
 pip3 install https://download.pytorch.org/whl/cu100/torch-1.0.1.post2-cp36-cp36m-linux_x86_64.whl
 yes | apt-get install espeak 
 python3 setup.py develop
-# python3 distribute.py --config_path config_cluster.json  --data_path ${SHARED_DIR}/data/Blizzard/Nancy/  --restore_path ${USER_DIR}/best_model_4467.pth.tar
-python3 distribute.py --config_path config_cluster.json  --data_path ${USER_DIR}/Mozilla/
\ No newline at end of file
+# wget https://www.dropbox.com/s/evaouukiwb7krz8/MozillaDataset.tar.gz?dl=0 -O ${USER_DIR}/MozillaDataset.tar.gz
+# tar -xzvf ${USER_DIR}/MozillaDataset.tar.gz --no-same-owner -C ${USER_DIR}
+# python3 distribute.py --config_path config_cluster.json  --data_path ${USER_DIR}/MozillaDataset/Mozilla/  --restore_path ${USER_DIR}/best_model_4583.pth.tar
+python3 distribute.py --config_path config_cluster.json  --data_path ${USER_DIR}/MozillaDataset/Mozilla/
\ No newline at end of file
diff --git a/config_cluster.json b/config_cluster.json
index 59d9f52f..47e9569e 100644
--- a/config_cluster.json
+++ b/config_cluster.json
@@ -1,6 +1,6 @@
 {
     "run_name": "mozilla-fattn",
-    "run_description": "Finetune 4583, Mozilla with 0 batch group size and fattn",
+    "run_description": "Finetune 4583, Mozilla with 4 batch group size and fattn, batch size 16->24",
 
     "audio":{
         // Audio processing parameters
@@ -40,13 +40,13 @@
     "windowing": false,      // Enables attention windowing. Used only in eval mode.
     "memory_size": 5,       //  ONLY TACOTRON - memory queue size used to queue network predictions to feed autoregressive connection. Useful if r < 5. 
     "attention_norm": "softmax",   // softmax or sigmoid. Suggested to use softmax for Tacotron2 and sigmoid for Tacotron.
-    "prenet_type": "original",    // ONLY TACOTRON2 - "original" or "bn".
+    "prenet_type": "bn",    // ONLY TACOTRON2 - "original" or "bn".
     "use_forward_attn": true,    // ONLY TACOTRON2 - if it uses forward attention. In general, it aligns faster.
     "transition_agent": false,    // ONLY TACOTRON2 - enable/disable transition agent of forward attention.
     "loss_masking": false,       // enable / disable loss masking against the sequence padding.
     "enable_eos_bos_chars": false, // enable/disable beginning of sentence and end of sentence chars.
 
-    "batch_size": 16,       // Batch size for training. Lower values than 32 might cause hard to learn attention.
+    "batch_size": 2,       // Batch size for training. Lower values than 32 might cause hard to learn attention.
     "eval_batch_size":16,   
     "r": 1,                 // Number of frames to predict for step.
     "wd": 0.000001,         // Weight decay weight.
@@ -54,10 +54,10 @@
     "save_step": 1000,      // Number of training steps expected to save traning stats and checkpoints.
     "print_step": 10,       // Number of steps to log traning on console.
     "tb_model_param_stats": true,     // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging. 
-    "batch_group_size": 0,  //Number of batches to shuffle after bucketing.
+    "batch_group_size": 4,  //Number of batches to shuffle after bucketing.
 
     "run_eval": false,
-    "test_delay_epochs": 10,  //Until attention is aligned, testing only wastes computation time.
+    "test_delay_epochs": 5,  //Until attention is aligned, testing only wastes computation time.
     "data_path": "/media/erogol/data_ssd/Data/LJSpeech-1.1",  // DATASET-RELATED: can overwritten from command argument
     "meta_file_train": "prompts_train.data",      // DATASET-RELATED: metafile for training dataloader.
     "meta_file_val": "prompts_val.data",    // DATASET-RELATED: metafile for evaluation dataloader.
@@ -65,10 +65,10 @@
     "min_seq_len": 0,       // DATASET-RELATED: minimum text length to use in training
     "max_seq_len": 150,     // DATASET-RELATED: maximum text length
     "output_path": "../keep/",      // DATASET-RELATED: output path for all training outputs.
-    "num_loader_workers": 8,        // number of training data loader processes. Don't set it too big. 4-8 are good values.
-    "num_val_loader_workers": 4,    // number of evaluation data loader processes.
-    "phoneme_cache_path": "nancy_us_phonemes",  // phoneme computation is slow, therefore, it caches results in the given folder.
+    "num_loader_workers": 0,        // number of training data loader processes. Don't set it too big. 4-8 are good values.
+    "num_val_loader_workers": 0,    // number of evaluation data loader processes.
+    "phoneme_cache_path": "mozilla_us_phonemes",  // phoneme computation is slow, therefore, it caches results in the given folder.
     "use_phonemes": true,           // use phonemes instead of raw characters. It is suggested for better pronounciation.
     "phoneme_language": "en-us",     // depending on your target language, pick one from  https://github.com/bootphon/phonemizer#languages
     "text_cleaner": "phoneme_cleaners"
-}
\ No newline at end of file
+}

From 01dbfb3a0f3adf22435cb160708cc179c5d0b4e9 Mon Sep 17 00:00:00 2001
From: Eren Golge <egolge@mozilla.com>
Date: Thu, 18 Apr 2019 17:35:20 +0200
Subject: [PATCH 2/7] Server updates

---
 server/conf.json       |  3 ++-
 server/server.py       |  2 +-
 server/synthesizer.py  | 41 +++++++++++++++++++++++++++++++++++++++--
 utils/text/__init__.py |  1 +
 utils/text/symbols.py  |  4 ++++
 5 files changed, 47 insertions(+), 4 deletions(-)

diff --git a/server/conf.json b/server/conf.json
index f1813073..ba8d5016 100644
--- a/server/conf.json
+++ b/server/conf.json
@@ -8,5 +8,6 @@
     "wavernn_config":"config.json", // wavernn config file
     "is_wavernn_batched":true, 
     "port": 5002,
-    "use_cuda": true
+    "use_cuda": true,
+    "debug": true
 }
diff --git a/server/server.py b/server/server.py
index f5ad4088..d7b1dca8 100644
--- a/server/server.py
+++ b/server/server.py
@@ -27,4 +27,4 @@ def tts():
 
 
 if __name__ == '__main__':
-    app.run(debug=True, host='0.0.0.0', port=config.port)
+    app.run(debug=config.debug, host='0.0.0.0', port=config.port)
diff --git a/server/synthesizer.py b/server/synthesizer.py
index b8198978..1f23cf25 100644
--- a/server/synthesizer.py
+++ b/server/synthesizer.py
@@ -10,6 +10,14 @@ from utils.audio import AudioProcessor
 from utils.generic_utils import load_config, setup_model
 from utils.text import phoneme_to_sequence, phonemes, symbols, text_to_sequence, sequence_to_phoneme
 
+import re
+alphabets= "([A-Za-z])"
+prefixes = "(Mr|St|Mrs|Ms|Dr)[.]"
+suffixes = "(Inc|Ltd|Jr|Sr|Co)"
+starters = "(Mr|Mrs|Ms|Dr|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
+acronyms = "([A-Z][.][A-Z][.](?:[A-Z][.])?)"
+websites = "[.](com|net|org|io|gov)"
+
 
 class Synthesizer(object):
     def __init__(self, config):
@@ -48,6 +56,7 @@ class Synthesizer(object):
         if use_cuda:
             self.tts_model.cuda()
         self.tts_model.eval()
+        self.tts_model.decoder.max_decoder_steps = 3000
 
     def load_wavernn(self, lib_path, model_path, model_file, model_config, use_cuda):
         sys.path.append(lib_path) # set this if TTS is not installed globally
@@ -83,14 +92,42 @@ class Synthesizer(object):
         wav = np.array(wav)
         self.ap.save_wav(wav, path)
 
+    def split_into_sentences(self, text):
+        text = " " + text + "  "
+        text = text.replace("\n"," ")
+        text = re.sub(prefixes,"\\1<prd>",text)
+        text = re.sub(websites,"<prd>\\1",text)
+        if "Ph.D" in text: text = text.replace("Ph.D.","Ph<prd>D<prd>")
+        text = re.sub("\s" + alphabets + "[.] "," \\1<prd> ",text)
+        text = re.sub(acronyms+" "+starters,"\\1<stop> \\2",text)
+        text = re.sub(alphabets + "[.]" + alphabets + "[.]" + alphabets + "[.]","\\1<prd>\\2<prd>\\3<prd>",text)
+        text = re.sub(alphabets + "[.]" + alphabets + "[.]","\\1<prd>\\2<prd>",text)
+        text = re.sub(" "+suffixes+"[.] "+starters," \\1<stop> \\2",text)
+        text = re.sub(" "+suffixes+"[.]"," \\1<prd>",text)
+        text = re.sub(" " + alphabets + "[.]"," \\1<prd>",text)
+        if "”" in text: text = text.replace(".”","”.")
+        if "\"" in text: text = text.replace(".\"","\".")
+        if "!" in text: text = text.replace("!\"","\"!")
+        if "?" in text: text = text.replace("?\"","\"?")
+        text = text.replace(".",".<stop>")
+        text = text.replace("?","?<stop>")
+        text = text.replace("!","!<stop>")
+        text = text.replace("<prd>",".")
+        sentences = text.split("<stop>")
+        sentences = sentences[:-1]
+        sentences = [s.strip() for s in sentences]
+        return sentences
+
     def tts(self, text):
         wavs = []
-        for sen in text.split('.'):
+        sens = self.split_into_sentences(text)
+        if len(sens) == 0:
+            sens = [text+'.']
+        for sen in sens:
             if len(sen) < 3:
                 continue
             sen = sen.strip()
             print(sen)
-            sen = sen.strip()
 
             seq = np.array(self.input_adapter(sen))
             text_hat = sequence_to_phoneme(seq)
diff --git a/utils/text/__init__.py b/utils/text/__init__.py
index 9c0e3f47..a05f100a 100644
--- a/utils/text/__init__.py
+++ b/utils/text/__init__.py
@@ -50,6 +50,7 @@ def phoneme_to_sequence(text, cleaner_names, language, enable_eos_bos=False):
         sequence = [_phonemes_to_id['^']]
     else:
         sequence = []
+    text = text.replace(":", "")
     clean_text = _clean_text(text, cleaner_names)
     phonemes = text2phone(clean_text, language)
     if phonemes is None:
diff --git a/utils/text/symbols.py b/utils/text/symbols.py
index 5fc20a5f..838d84ae 100644
--- a/utils/text/symbols.py
+++ b/utils/text/symbols.py
@@ -28,6 +28,10 @@ _arpabet = ['@' + s for s in _phonemes]
 symbols = [_pad, _eos, _bos] + list(_characters) + _arpabet
 phonemes = [_pad, _eos, _bos] + list(_phonemes) + list(_punctuations)
 
+# Generate ALIEN language
+# from random import shuffle
+# shuffle(phonemes)
+
 if __name__ == '__main__':
     print(" > TTS symbols ")
     print(symbols)

From 9ba13b2d2f299f505f28024f8fecfd0941df73f4 Mon Sep 17 00:00:00 2001
From: Eren Golge <egolge@mozilla.com>
Date: Thu, 18 Apr 2019 18:36:01 +0200
Subject: [PATCH 3/7] fix forward attention

---
 layers/tacotron2.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/layers/tacotron2.py b/layers/tacotron2.py
index 871e577f..d7df0c9a 100644
--- a/layers/tacotron2.py
+++ b/layers/tacotron2.py
@@ -152,7 +152,7 @@ class Attention(nn.Module):
         """
         B = inputs.shape[0]
         T = inputs.shape[1]
-        self.alpha = torch.cat([torch.ones([B, 1]), torch.zeros([B, T])[:, :-1]], dim=1).to(inputs.device)
+        self.alpha = torch.cat([torch.ones([B, 1]), torch.zeros([B, T])[:, :-1] + 1e-7 ], dim=1).to(inputs.device)
         self.u = (0.5 * torch.ones([B, 1])).to(inputs.device)
 
     def get_attention(self, query, processed_inputs, attention_cat):
@@ -183,16 +183,16 @@ class Attention(nn.Module):
     def apply_forward_attention(self, inputs, alignment, processed_query):
         # forward attention
         prev_alpha = F.pad(self.alpha[:, :-1].clone(), (1, 0, 0, 0)).to(inputs.device)
-        self.alpha = (((1-self.u) * self.alpha.clone().to(inputs.device) + self.u * prev_alpha) + 1e-7) * alignment
-        alpha_norm = self.alpha / self.alpha.sum(dim=1).unsqueeze(1)
+        alpha = (((1-self.u) * self.alpha.clone().to(inputs.device) + self.u * prev_alpha)) * alignment
+        self.alpha = alpha / alpha.sum(dim=1).unsqueeze(1)
         # compute context
-        context = torch.bmm(alpha_norm.unsqueeze(1), inputs)
+        context = torch.bmm(self.alpha.unsqueeze(1), inputs)
         context = context.squeeze(1)
         # compute transition agent
         if self.trans_agent:
             ta_input = torch.cat([context, processed_query.squeeze(1)], dim=-1)
             self.u = torch.sigmoid(self.ta(ta_input))
-        return context, alpha_norm, alignment
+        return context, self.alpha, alignment
 
     def forward(self, attention_hidden_state, inputs, processed_inputs,
                 attention_cat, mask):

From 38213dffe901f7ac4a65304b11f8bb95e39c565a Mon Sep 17 00:00:00 2001
From: Eren Golge <egolge@mozilla.com>
Date: Thu, 18 Apr 2019 18:55:37 +0200
Subject: [PATCH 4/7] bug fix #2

---
 layers/tacotron2.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/layers/tacotron2.py b/layers/tacotron2.py
index d7df0c9a..df05e5ad 100644
--- a/layers/tacotron2.py
+++ b/layers/tacotron2.py
@@ -183,7 +183,7 @@ class Attention(nn.Module):
     def apply_forward_attention(self, inputs, alignment, processed_query):
         # forward attention
         prev_alpha = F.pad(self.alpha[:, :-1].clone(), (1, 0, 0, 0)).to(inputs.device)
-        alpha = (((1-self.u) * self.alpha.clone().to(inputs.device) + self.u * prev_alpha)) * alignment
+        alpha = (((1-self.u) * self.alpha.clone().to(inputs.device) + self.u * prev_alpha) + 1e-8) * alignment
         self.alpha = alpha / alpha.sum(dim=1).unsqueeze(1)
         # compute context
         context = torch.bmm(self.alpha.unsqueeze(1), inputs)

From b0096728cba14cd5a292fb244c0058850ca520cd Mon Sep 17 00:00:00 2001
From: Eren Golge <egolge@mozilla.com>
Date: Fri, 19 Apr 2019 23:58:56 +0200
Subject: [PATCH 5/7] config update

---
 config_cluster.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/config_cluster.json b/config_cluster.json
index 47e9569e..283b8be2 100644
--- a/config_cluster.json
+++ b/config_cluster.json
@@ -46,7 +46,7 @@
     "loss_masking": false,       // enable / disable loss masking against the sequence padding.
     "enable_eos_bos_chars": false, // enable/disable beginning of sentence and end of sentence chars.
 
-    "batch_size": 2,       // Batch size for training. Lower values than 32 might cause hard to learn attention.
+    "batch_size": 24,       // Batch size for training. Lower values than 32 might cause hard to learn attention.
     "eval_batch_size":16,   
     "r": 1,                 // Number of frames to predict for step.
     "wd": 0.000001,         // Weight decay weight.

From 6df6f2c036eccc369281dc4c848a85be9e23e58f Mon Sep 17 00:00:00 2001
From: Eren Golge <egolge@mozilla.com>
Date: Tue, 23 Apr 2019 14:58:00 +0200
Subject: [PATCH 6/7] config update

---
 .compute            |  7 ++++---
 config_cluster.json | 10 +++++-----
 2 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/.compute b/.compute
index 3e21a5bf..5ef7df1d 100644
--- a/.compute
+++ b/.compute
@@ -1,11 +1,12 @@
 #!/bin/bash
-# ls ${USER_DIR}/MozillaDataset/Mozilla/batch1/wavs_no_processing
+ls ${SHARED_DIR}/data/mozilla/Judy/
 yes | apt-get install sox
-soxi /data/rw/home/MozillaDataset/Mozilla/batch18/wavs_no_processing/18_167.wav
+yes | apt-get install ffmpeg
+soxi /data/ro/shared/data/mozilla/Judy/batch6/wavs_no_processing/6_126.wav
 pip3 install https://download.pytorch.org/whl/cu100/torch-1.0.1.post2-cp36-cp36m-linux_x86_64.whl
 yes | apt-get install espeak 
 python3 setup.py develop
 # wget https://www.dropbox.com/s/evaouukiwb7krz8/MozillaDataset.tar.gz?dl=0 -O ${USER_DIR}/MozillaDataset.tar.gz
 # tar -xzvf ${USER_DIR}/MozillaDataset.tar.gz --no-same-owner -C ${USER_DIR}
 # python3 distribute.py --config_path config_cluster.json  --data_path ${USER_DIR}/MozillaDataset/Mozilla/  --restore_path ${USER_DIR}/best_model_4583.pth.tar
-python3 distribute.py --config_path config_cluster.json  --data_path ${USER_DIR}/MozillaDataset/Mozilla/
\ No newline at end of file
+python3 distribute.py --config_path config_cluster.json  --data_path ${SHARED_DIR}/data/mozilla/Judy/
diff --git a/config_cluster.json b/config_cluster.json
index 283b8be2..fe227a01 100644
--- a/config_cluster.json
+++ b/config_cluster.json
@@ -1,6 +1,6 @@
 {
-    "run_name": "mozilla-fattn",
-    "run_description": "Finetune 4583, Mozilla with 4 batch group size and fattn, batch size 16->24",
+    "run_name": "mozilla-nomask-fattn-bn",
+    "run_description": "Finetune 4700 orignal -> bn prenet - Mozilla with prenet bn, no mask, forward attn, batch group size 0",
 
     "audio":{
         // Audio processing parameters
@@ -54,7 +54,7 @@
     "save_step": 1000,      // Number of training steps expected to save traning stats and checkpoints.
     "print_step": 10,       // Number of steps to log traning on console.
     "tb_model_param_stats": true,     // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging. 
-    "batch_group_size": 4,  //Number of batches to shuffle after bucketing.
+    "batch_group_size": 0,  //Number of batches to shuffle after bucketing.
 
     "run_eval": false,
     "test_delay_epochs": 5,  //Until attention is aligned, testing only wastes computation time.
@@ -65,8 +65,8 @@
     "min_seq_len": 0,       // DATASET-RELATED: minimum text length to use in training
     "max_seq_len": 150,     // DATASET-RELATED: maximum text length
     "output_path": "../keep/",      // DATASET-RELATED: output path for all training outputs.
-    "num_loader_workers": 0,        // number of training data loader processes. Don't set it too big. 4-8 are good values.
-    "num_val_loader_workers": 0,    // number of evaluation data loader processes.
+    "num_loader_workers": 8,        // number of training data loader processes. Don't set it too big. 4-8 are good values.
+    "num_val_loader_workers": 4,    // number of evaluation data loader processes.
     "phoneme_cache_path": "mozilla_us_phonemes",  // phoneme computation is slow, therefore, it caches results in the given folder.
     "use_phonemes": true,           // use phonemes instead of raw characters. It is suggested for better pronounciation.
     "phoneme_language": "en-us",     // depending on your target language, pick one from  https://github.com/bootphon/phonemizer#languages

From add08e331b73ff6d5411b0e1bcd59582eaf68a47 Mon Sep 17 00:00:00 2001
From: Eren Golge <egolge@mozilla.com>
Date: Wed, 24 Apr 2019 17:36:05 +0200
Subject: [PATCH 7/7] Define folder name explicitly and Mozilla preprocessor
 update

---
 datasets/preprocess.py |  8 +++++---
 train.py               | 10 +++++++++-
 2 files changed, 14 insertions(+), 4 deletions(-)

diff --git a/datasets/preprocess.py b/datasets/preprocess.py
index c1b8469a..9313e3e5 100644
--- a/datasets/preprocess.py
+++ b/datasets/preprocess.py
@@ -50,16 +50,18 @@ def mozilla(root_path, meta_files):
         items = []
         for idx, meta_file in enumerate(meta_files):
                 folder = folders[idx]
-                txt_file = os.path.join(root_path, meta_file)
+                # txt_file = os.path.join(root_path, meta_file)
+                txt_file = meta_file
                 with open(txt_file, 'r') as ttf:
                         for line in ttf:
                                 cols = line.split('|')
-                                wav_file = os.path.join(root_path, folder, 'wavs_no_processing', cols[1].strip())
+                                # wav_file = os.path.join(root_path, folder, 'wavs_no_processing', cols[1].strip())
+                                wav_file = os.path.join(folder, 'wavs_no_processing', cols[1].strip())
                                 if os.path.isfile(wav_file):
                                         text = cols[0].strip()
                                         items.append([text, wav_file])
                                 else: 
-                                        print(" > Error: {}".format(cols))
+                                        print(" > Error: {}".format(wav_file))
                                         continue
         random.shuffle(items)
         return items
diff --git a/train.py b/train.py
index 90427fbf..f9c80ebe 100644
--- a/train.py
+++ b/train.py
@@ -499,6 +499,12 @@ if __name__ == '__main__':
         type=str,
         help='path for training outputs.',
         default='')
+    parser.add_argument(
+        '--output_folder',
+        type=str,
+        default='',
+        help='folder name for traning outputs.'
+    )
 
     # DISTRUBUTED
     parser.add_argument(
@@ -524,8 +530,10 @@ if __name__ == '__main__':
     else:
         OUT_PATH = args.output_path
 
-    if args.group_id == '':
+    if args.group_id == '' and args.output_folder == '':
         OUT_PATH = create_experiment_folder(OUT_PATH, c.run_name, args.debug)
+    else:
+        OUT_PATH = os.path.join(OUT_PATH, args.output_folder)
 
     AUDIO_PATH = os.path.join(OUT_PATH, 'test_audios')