From 9915220539af1c58dac2d1a93b25caeb8d898751 Mon Sep 17 00:00:00 2001 From: Eren Golge Date: Thu, 18 Apr 2019 17:34:54 +0200 Subject: [PATCH 1/7] config upate --- .compute | 10 +++++++--- config_cluster.json | 18 +++++++++--------- 2 files changed, 16 insertions(+), 12 deletions(-) diff --git a/.compute b/.compute index c559bcab..3e21a5bf 100644 --- a/.compute +++ b/.compute @@ -1,7 +1,11 @@ #!/bin/bash -ls ${SHARED_DIR}/data/ +# ls ${USER_DIR}/MozillaDataset/Mozilla/batch1/wavs_no_processing +yes | apt-get install sox +soxi /data/rw/home/MozillaDataset/Mozilla/batch18/wavs_no_processing/18_167.wav pip3 install https://download.pytorch.org/whl/cu100/torch-1.0.1.post2-cp36-cp36m-linux_x86_64.whl yes | apt-get install espeak python3 setup.py develop -# python3 distribute.py --config_path config_cluster.json --data_path ${SHARED_DIR}/data/Blizzard/Nancy/ --restore_path ${USER_DIR}/best_model_4467.pth.tar -python3 distribute.py --config_path config_cluster.json --data_path ${USER_DIR}/Mozilla/ \ No newline at end of file +# wget https://www.dropbox.com/s/evaouukiwb7krz8/MozillaDataset.tar.gz?dl=0 -O ${USER_DIR}/MozillaDataset.tar.gz +# tar -xzvf ${USER_DIR}/MozillaDataset.tar.gz --no-same-owner -C ${USER_DIR} +# python3 distribute.py --config_path config_cluster.json --data_path ${USER_DIR}/MozillaDataset/Mozilla/ --restore_path ${USER_DIR}/best_model_4583.pth.tar +python3 distribute.py --config_path config_cluster.json --data_path ${USER_DIR}/MozillaDataset/Mozilla/ \ No newline at end of file diff --git a/config_cluster.json b/config_cluster.json index 59d9f52f..47e9569e 100644 --- a/config_cluster.json +++ b/config_cluster.json @@ -1,6 +1,6 @@ { "run_name": "mozilla-fattn", - "run_description": "Finetune 4583, Mozilla with 0 batch group size and fattn", + "run_description": "Finetune 4583, Mozilla with 4 batch group size and fattn, batch size 16->24", "audio":{ // Audio processing parameters @@ -40,13 +40,13 @@ "windowing": false, // Enables attention windowing. 
Used only in eval mode. "memory_size": 5, // ONLY TACOTRON - memory queue size used to queue network predictions to feed autoregressive connection. Useful if r < 5. "attention_norm": "softmax", // softmax or sigmoid. Suggested to use softmax for Tacotron2 and sigmoid for Tacotron. - "prenet_type": "original", // ONLY TACOTRON2 - "original" or "bn". + "prenet_type": "bn", // ONLY TACOTRON2 - "original" or "bn". "use_forward_attn": true, // ONLY TACOTRON2 - if it uses forward attention. In general, it aligns faster. "transition_agent": false, // ONLY TACOTRON2 - enable/disable transition agent of forward attention. "loss_masking": false, // enable / disable loss masking against the sequence padding. "enable_eos_bos_chars": false, // enable/disable beginning of sentence and end of sentence chars. - "batch_size": 16, // Batch size for training. Lower values than 32 might cause hard to learn attention. + "batch_size": 2, // Batch size for training. Lower values than 32 might cause hard to learn attention. "eval_batch_size":16, "r": 1, // Number of frames to predict for step. "wd": 0.000001, // Weight decay weight. @@ -54,10 +54,10 @@ "save_step": 1000, // Number of training steps expected to save traning stats and checkpoints. "print_step": 10, // Number of steps to log traning on console. "tb_model_param_stats": true, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging. - "batch_group_size": 0, //Number of batches to shuffle after bucketing. + "batch_group_size": 4, //Number of batches to shuffle after bucketing. "run_eval": false, - "test_delay_epochs": 10, //Until attention is aligned, testing only wastes computation time. + "test_delay_epochs": 5, //Until attention is aligned, testing only wastes computation time. 
"data_path": "/media/erogol/data_ssd/Data/LJSpeech-1.1", // DATASET-RELATED: can overwritten from command argument "meta_file_train": "prompts_train.data", // DATASET-RELATED: metafile for training dataloader. "meta_file_val": "prompts_val.data", // DATASET-RELATED: metafile for evaluation dataloader. @@ -65,10 +65,10 @@ "min_seq_len": 0, // DATASET-RELATED: minimum text length to use in training "max_seq_len": 150, // DATASET-RELATED: maximum text length "output_path": "../keep/", // DATASET-RELATED: output path for all training outputs. - "num_loader_workers": 8, // number of training data loader processes. Don't set it too big. 4-8 are good values. - "num_val_loader_workers": 4, // number of evaluation data loader processes. - "phoneme_cache_path": "nancy_us_phonemes", // phoneme computation is slow, therefore, it caches results in the given folder. + "num_loader_workers": 0, // number of training data loader processes. Don't set it too big. 4-8 are good values. + "num_val_loader_workers": 0, // number of evaluation data loader processes. + "phoneme_cache_path": "mozilla_us_phonemes", // phoneme computation is slow, therefore, it caches results in the given folder. "use_phonemes": true, // use phonemes instead of raw characters. It is suggested for better pronounciation. 
"phoneme_language": "en-us", // depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages "text_cleaner": "phoneme_cleaners" -} \ No newline at end of file +} From 01dbfb3a0f3adf22435cb160708cc179c5d0b4e9 Mon Sep 17 00:00:00 2001 From: Eren Golge Date: Thu, 18 Apr 2019 17:35:20 +0200 Subject: [PATCH 2/7] Server update s --- server/conf.json | 3 ++- server/server.py | 2 +- server/synthesizer.py | 41 +++++++++++++++++++++++++++++++++++++++-- utils/text/__init__.py | 1 + utils/text/symbols.py | 4 ++++ 5 files changed, 47 insertions(+), 4 deletions(-) diff --git a/server/conf.json b/server/conf.json index f1813073..ba8d5016 100644 --- a/server/conf.json +++ b/server/conf.json @@ -8,5 +8,6 @@ "wavernn_config":"config.json", // wavernn config file "is_wavernn_batched":true, "port": 5002, - "use_cuda": true + "use_cuda": true, + "debug": true } diff --git a/server/server.py b/server/server.py index f5ad4088..d7b1dca8 100644 --- a/server/server.py +++ b/server/server.py @@ -27,4 +27,4 @@ def tts(): if __name__ == '__main__': - app.run(debug=True, host='0.0.0.0', port=config.port) + app.run(debug=config.debug, host='0.0.0.0', port=config.port) diff --git a/server/synthesizer.py b/server/synthesizer.py index b8198978..1f23cf25 100644 --- a/server/synthesizer.py +++ b/server/synthesizer.py @@ -10,6 +10,14 @@ from utils.audio import AudioProcessor from utils.generic_utils import load_config, setup_model from utils.text import phoneme_to_sequence, phonemes, symbols, text_to_sequence, sequence_to_phoneme +import re +alphabets= "([A-Za-z])" +prefixes = "(Mr|St|Mrs|Ms|Dr)[.]" +suffixes = "(Inc|Ltd|Jr|Sr|Co)" +starters = "(Mr|Mrs|Ms|Dr|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)" +acronyms = "([A-Z][.][A-Z][.](?:[A-Z][.])?)" +websites = "[.](com|net|org|io|gov)" + class Synthesizer(object): def __init__(self, config): @@ -48,6 +56,7 @@ class Synthesizer(object): if use_cuda: self.tts_model.cuda() 
self.tts_model.eval() + self.tts_model.decoder.max_decoder_steps = 3000 def load_wavernn(self, lib_path, model_path, model_file, model_config, use_cuda): sys.path.append(lib_path) # set this if TTS is not installed globally @@ -83,14 +92,42 @@ class Synthesizer(object): wav = np.array(wav) self.ap.save_wav(wav, path) + def split_into_sentences(self, text): + text = " " + text + " " + text = text.replace("\n"," ") + text = re.sub(prefixes,"\\1<prd>",text) + text = re.sub(websites,"<prd>\\1",text) + if "Ph.D" in text: text = text.replace("Ph.D.","Ph<prd>D<prd>") + text = re.sub("\s" + alphabets + "[.] "," \\1<prd> ",text) + text = re.sub(acronyms+" "+starters,"\\1<stop> \\2",text) + text = re.sub(alphabets + "[.]" + alphabets + "[.]" + alphabets + "[.]","\\1<prd>\\2<prd>\\3<prd>",text) + text = re.sub(alphabets + "[.]" + alphabets + "[.]","\\1<prd>\\2<prd>",text) + text = re.sub(" "+suffixes+"[.] "+starters," \\1<stop> \\2",text) + text = re.sub(" "+suffixes+"[.]"," \\1<prd>",text) + text = re.sub(" " + alphabets + "[.]"," \\1<prd>",text) + if "”" in text: text = text.replace(".”","”.") + if "\"" in text: text = text.replace(".\"","\".") + if "!" in text: text = text.replace("!\"","\"!") + if "?" 
in text: text = text.replace("?\"","\"?") + text = text.replace(".",".<stop>") + text = text.replace("?","?<stop>") + text = text.replace("!","!<stop>") + text = text.replace("<prd>",".") + sentences = text.split("<stop>") + sentences = sentences[:-1] + sentences = [s.strip() for s in sentences] + return sentences + def tts(self, text): wavs = [] - for sen in text.split('.'): + sens = self.split_into_sentences(text) + if len(sens) == 0: + sens = [text+'.'] + for sen in sens: if len(sen) < 3: continue sen = sen.strip() print(sen) - sen = sen.strip() seq = np.array(self.input_adapter(sen)) text_hat = sequence_to_phoneme(seq) diff --git a/utils/text/__init__.py b/utils/text/__init__.py index 9c0e3f47..a05f100a 100644 --- a/utils/text/__init__.py +++ b/utils/text/__init__.py @@ -50,6 +50,7 @@ def phoneme_to_sequence(text, cleaner_names, language, enable_eos_bos=False): sequence = [_phonemes_to_id['^']] else: sequence = [] + text = text.replace(":", "") clean_text = _clean_text(text, cleaner_names) phonemes = text2phone(clean_text, language) if phonemes is None: diff --git a/utils/text/symbols.py b/utils/text/symbols.py index 5fc20a5f..838d84ae 100644 --- a/utils/text/symbols.py +++ b/utils/text/symbols.py @@ -28,6 +28,10 @@ _arpabet = ['@' + s for s in _phonemes] symbols = [_pad, _eos, _bos] + list(_characters) + _arpabet phonemes = [_pad, _eos, _bos] + list(_phonemes) + list(_punctuations) +# Generate ALIEN language +# from random import shuffle +# shuffle(phonemes) + if __name__ == '__main__': print(" > TTS symbols ") print(symbols) From 9ba13b2d2f299f505f28024f8fecfd0941df73f4 Mon Sep 17 00:00:00 2001 From: Eren Golge Date: Thu, 18 Apr 2019 18:36:01 +0200 Subject: [PATCH 3/7] fix forward attention --- layers/tacotron2.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/layers/tacotron2.py b/layers/tacotron2.py index 871e577f..d7df0c9a 100644 --- a/layers/tacotron2.py +++ b/layers/tacotron2.py @@ -152,7 +152,7 @@ class Attention(nn.Module): """ B = inputs.shape[0] T = 
inputs.shape[1] - self.alpha = torch.cat([torch.ones([B, 1]), torch.zeros([B, T])[:, :-1]], dim=1).to(inputs.device) + self.alpha = torch.cat([torch.ones([B, 1]), torch.zeros([B, T])[:, :-1] + 1e-7 ], dim=1).to(inputs.device) self.u = (0.5 * torch.ones([B, 1])).to(inputs.device) def get_attention(self, query, processed_inputs, attention_cat): @@ -183,16 +183,16 @@ class Attention(nn.Module): def apply_forward_attention(self, inputs, alignment, processed_query): # forward attention prev_alpha = F.pad(self.alpha[:, :-1].clone(), (1, 0, 0, 0)).to(inputs.device) - self.alpha = (((1-self.u) * self.alpha.clone().to(inputs.device) + self.u * prev_alpha) + 1e-7) * alignment - alpha_norm = self.alpha / self.alpha.sum(dim=1).unsqueeze(1) + alpha = (((1-self.u) * self.alpha.clone().to(inputs.device) + self.u * prev_alpha)) * alignment + self.alpha = alpha / alpha.sum(dim=1).unsqueeze(1) # compute context - context = torch.bmm(alpha_norm.unsqueeze(1), inputs) + context = torch.bmm(self.alpha.unsqueeze(1), inputs) context = context.squeeze(1) # compute transition agent if self.trans_agent: ta_input = torch.cat([context, processed_query.squeeze(1)], dim=-1) self.u = torch.sigmoid(self.ta(ta_input)) - return context, alpha_norm, alignment + return context, self.alpha, alignment def forward(self, attention_hidden_state, inputs, processed_inputs, attention_cat, mask): From 38213dffe901f7ac4a65304b11f8bb95e39c565a Mon Sep 17 00:00:00 2001 From: Eren Golge Date: Thu, 18 Apr 2019 18:55:37 +0200 Subject: [PATCH 4/7] bug fix #2 --- layers/tacotron2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/layers/tacotron2.py b/layers/tacotron2.py index d7df0c9a..df05e5ad 100644 --- a/layers/tacotron2.py +++ b/layers/tacotron2.py @@ -183,7 +183,7 @@ class Attention(nn.Module): def apply_forward_attention(self, inputs, alignment, processed_query): # forward attention prev_alpha = F.pad(self.alpha[:, :-1].clone(), (1, 0, 0, 0)).to(inputs.device) - alpha = (((1-self.u) * 
self.alpha.clone().to(inputs.device) + self.u * prev_alpha)) * alignment + alpha = (((1-self.u) * self.alpha.clone().to(inputs.device) + self.u * prev_alpha) + 1e-8) * alignment self.alpha = alpha / alpha.sum(dim=1).unsqueeze(1) # compute context context = torch.bmm(self.alpha.unsqueeze(1), inputs) From b0096728cba14cd5a292fb244c0058850ca520cd Mon Sep 17 00:00:00 2001 From: Eren Golge Date: Fri, 19 Apr 2019 23:58:56 +0200 Subject: [PATCH 5/7] config update --- config_cluster.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config_cluster.json b/config_cluster.json index 47e9569e..283b8be2 100644 --- a/config_cluster.json +++ b/config_cluster.json @@ -46,7 +46,7 @@ "loss_masking": false, // enable / disable loss masking against the sequence padding. "enable_eos_bos_chars": false, // enable/disable beginning of sentence and end of sentence chars. - "batch_size": 2, // Batch size for training. Lower values than 32 might cause hard to learn attention. + "batch_size": 24, // Batch size for training. Lower values than 32 might cause hard to learn attention. "eval_batch_size":16, "r": 1, // Number of frames to predict for step. "wd": 0.000001, // Weight decay weight. 
From 6df6f2c036eccc369281dc4c848a85be9e23e58f Mon Sep 17 00:00:00 2001 From: Eren Golge Date: Tue, 23 Apr 2019 14:58:00 +0200 Subject: [PATCH 6/7] config update --- .compute | 7 ++++--- config_cluster.json | 10 +++++----- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/.compute b/.compute index 3e21a5bf..5ef7df1d 100644 --- a/.compute +++ b/.compute @@ -1,11 +1,12 @@ #!/bin/bash -# ls ${USER_DIR}/MozillaDataset/Mozilla/batch1/wavs_no_processing +ls ${SHARED_DIR}/data/mozilla/Judy/ yes | apt-get install sox -soxi /data/rw/home/MozillaDataset/Mozilla/batch18/wavs_no_processing/18_167.wav +yes | apt-get install ffmpeg +soxi /data/ro/shared/data/mozilla/Judy/batch6/wavs_no_processing/6_126.wav pip3 install https://download.pytorch.org/whl/cu100/torch-1.0.1.post2-cp36-cp36m-linux_x86_64.whl yes | apt-get install espeak python3 setup.py develop # wget https://www.dropbox.com/s/evaouukiwb7krz8/MozillaDataset.tar.gz?dl=0 -O ${USER_DIR}/MozillaDataset.tar.gz # tar -xzvf ${USER_DIR}/MozillaDataset.tar.gz --no-same-owner -C ${USER_DIR} # python3 distribute.py --config_path config_cluster.json --data_path ${USER_DIR}/MozillaDataset/Mozilla/ --restore_path ${USER_DIR}/best_model_4583.pth.tar -python3 distribute.py --config_path config_cluster.json --data_path ${USER_DIR}/MozillaDataset/Mozilla/ \ No newline at end of file +python3 distribute.py --config_path config_cluster.json --data_path ${SHARED_DIR}/data/mozilla/Judy/ diff --git a/config_cluster.json b/config_cluster.json index 283b8be2..fe227a01 100644 --- a/config_cluster.json +++ b/config_cluster.json @@ -1,6 +1,6 @@ { - "run_name": "mozilla-fattn", - "run_description": "Finetune 4583, Mozilla with 4 batch group size and fattn, batch size 16->24", + "run_name": "mozilla-nomask-fattn-bn", + "run_description": "Finetune 4700 orignal -> bn prenet - Mozilla with prenet bn, no mask, forward attn, batch group size 0", "audio":{ // Audio processing parameters @@ -54,7 +54,7 @@ "save_step": 1000, // Number of 
training steps expected to save traning stats and checkpoints. "print_step": 10, // Number of steps to log traning on console. "tb_model_param_stats": true, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging. - "batch_group_size": 4, //Number of batches to shuffle after bucketing. + "batch_group_size": 0, //Number of batches to shuffle after bucketing. "run_eval": false, "test_delay_epochs": 5, //Until attention is aligned, testing only wastes computation time. @@ -65,8 +65,8 @@ "min_seq_len": 0, // DATASET-RELATED: minimum text length to use in training "max_seq_len": 150, // DATASET-RELATED: maximum text length "output_path": "../keep/", // DATASET-RELATED: output path for all training outputs. - "num_loader_workers": 0, // number of training data loader processes. Don't set it too big. 4-8 are good values. - "num_val_loader_workers": 0, // number of evaluation data loader processes. + "num_loader_workers": 8, // number of training data loader processes. Don't set it too big. 4-8 are good values. + "num_val_loader_workers": 4, // number of evaluation data loader processes. "phoneme_cache_path": "mozilla_us_phonemes", // phoneme computation is slow, therefore, it caches results in the given folder. "use_phonemes": true, // use phonemes instead of raw characters. It is suggested for better pronounciation. 
"phoneme_language": "en-us", // depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages From add08e331b73ff6d5411b0e1bcd59582eaf68a47 Mon Sep 17 00:00:00 2001 From: Eren Golge Date: Wed, 24 Apr 2019 17:36:05 +0200 Subject: [PATCH 7/7] Define foldr name explicitly and mozilla preprocessor update --- datasets/preprocess.py | 8 +++++--- train.py | 10 +++++++++- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/datasets/preprocess.py b/datasets/preprocess.py index c1b8469a..9313e3e5 100644 --- a/datasets/preprocess.py +++ b/datasets/preprocess.py @@ -50,16 +50,18 @@ def mozilla(root_path, meta_files): items = [] for idx, meta_file in enumerate(meta_files): folder = folders[idx] - txt_file = os.path.join(root_path, meta_file) + # txt_file = os.path.join(root_path, meta_file) + txt_file = meta_file with open(txt_file, 'r') as ttf: for line in ttf: cols = line.split('|') - wav_file = os.path.join(root_path, folder, 'wavs_no_processing', cols[1].strip()) + # wav_file = os.path.join(root_path, folder, 'wavs_no_processing', cols[1].strip()) + wav_file = os.path.join(folder, 'wavs_no_processing', cols[1].strip()) if os.path.isfile(wav_file): text = cols[0].strip() items.append([text, wav_file]) else: - print(" > Error: {}".format(cols)) + print(" > Error: {}".format(wav_file)) continue random.shuffle(items) return items diff --git a/train.py b/train.py index 90427fbf..f9c80ebe 100644 --- a/train.py +++ b/train.py @@ -499,6 +499,12 @@ if __name__ == '__main__': type=str, help='path for training outputs.', default='') + parser.add_argument( + '--output_folder', + type=str, + default='', + help='folder name for traning outputs.' 
+ ) # DISTRUBUTED parser.add_argument( @@ -524,8 +530,10 @@ if __name__ == '__main__': else: OUT_PATH = args.output_path - if args.group_id == '': + if args.group_id == '' and args.output_folder == '': OUT_PATH = create_experiment_folder(OUT_PATH, c.run_name, args.debug) + else: + OUT_PATH = os.path.join(OUT_PATH, args.output_folder) AUDIO_PATH = os.path.join(OUT_PATH, 'test_audios')