From 5754116c19f8cabdaa0b55cc1d134d82301c3910 Mon Sep 17 00:00:00 2001
From: Eren Golge
Date: Wed, 6 Mar 2019 22:06:01 +0100
Subject: [PATCH 1/2] bos char added

---
 .compute               |  4 ++--
 config.json            |  9 ++++----
 config_cluster.json    | 52 +++++++++++++++++++++++-------------------
 distribute.py          |  2 +-
 train.py               |  2 +-
 utils/text/__init__.py |  4 ++--
 utils/text/symbols.py  |  5 ++--
 7 files changed, 42 insertions(+), 36 deletions(-)

diff --git a/.compute b/.compute
index 99e7fad2..8e2ec00e 100644
--- a/.compute
+++ b/.compute
@@ -3,5 +3,5 @@ ls ${SHARED_DIR}/data/keithito
 pip3 install https://download.pytorch.org/whl/cu100/torch-1.0.1.post2-cp36-cp36m-linux_x86_64.whl
 yes | apt-get install espeak
 python3 setup.py develop
-# python3 distribute.py --config_path config_cluster.json --data_path ${SHARED_DIR}/data/keithito/LJSpeech-1.1/ --restore_path ${USER_DIR}/best_model.pth.tar
-python3 distribute.py --config_path config_cluster.json --data_path ${SHARED_DIR}/data/keithito/LJSpeech-1.1/
+python3 distribute.py --config_path config_cluster.json --data_path ${SHARED_DIR}/data/keithito/LJSpeech-1.1/ --restore_path ${USER_DIR}/best_model.pth.tar
+# python3 distribute.py --config_path config_cluster.json --data_path ${SHARED_DIR}/data/keithito/LJSpeech-1.1/
diff --git a/config.json b/config.json
index b4037a1d..138d3436 100644
--- a/config.json
+++ b/config.json
@@ -1,6 +1,6 @@
 {
-    "run_name": "queue",
-    "run_description": "Queue memory and change lower r incrementatlly",
+    "run_name": "bos",
+    "run_description": "bos character added to work around the first char miss",
 
     "audio":{
         // Audio processing parameters
@@ -29,7 +29,7 @@
         "url": "tcp:\/\/localhost:54321"
     },
 
-    "model": "Tacotron",  // one of the model in models/
+    "model": "Tacotron2",  // one of the models in models/
     "grad_clip": 0.02,  // upper limit for gradients for clipping.
     "epochs": 1000,  // total number of epochs to train.
     "lr": 0.0001,  // Initial learning rate. If Noam decay is active, maximum learning rate.
@@ -37,12 +37,11 @@
     "warmup_steps": 4000,  // Noam decay steps to increase the learning rate from 0 to "lr"
     "windowing": false,  // Enables attention windowing. Used only in eval mode.
     "memory_size": 5,  // TO BE IMPLEMENTED -- memory queue size used to queue network predictions to feed autoregressive connection. Useful if r < 5.
-    "batch_group_size": 3,
 
     "batch_size": 16,  // Batch size for training. Lower values than 32 might cause hard to learn attention.
     "eval_batch_size":16,
     "r": 1,  // Number of frames to predict for step.
-    "wd": 0.000005,  // Weight decay weight.
+    "wd": 0.000002,  // Weight decay weight.
     "checkpoint": true,  // If true, it saves checkpoints per "save_step"
     "save_step": 1000,  // Number of training steps expected to save training stats and checkpoints.
     "print_step": 10,  // Number of steps to log training on console.
diff --git a/config_cluster.json b/config_cluster.json
index efe53c8a..818036d7 100644
--- a/config_cluster.json
+++ b/config_cluster.json
@@ -1,9 +1,8 @@
 {
-    "model_name": "tts-master",
-    "model_description": "tts master cluster test",
+    "run_name": "bos",
+    "run_description": "bos character added to work around the first char miss",
 
     "audio":{
-        "audio_processor": "audio", // to use dictate different audio processors, if available.
         // Audio processing parameters
         "num_mels": 80,  // size of the mel spec frame.
         "num_freq": 1025,  // number of stft frequency levels. Size of the linear spectrogram frame.
@@ -20,41 +19,48 @@
     "symmetric_norm": false,  // move normalization to range [-1, 1]
     "max_norm": 1,  // scale normalization to range [-max_norm, max_norm] or [0, max_norm]
     "clip_norm": true,  // clip normalized values into the range.
-    "mel_fmin": null,  // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!!
-    "mel_fmax": null,  // maximum freq level for mel-spec. Tune for dataset!!
-    "do_trim_silence": true  // enable trimming of slience of audio as you load it. LJspeech (false), TWEB (false), Nancy (true)
+    "mel_fmin": 0.0,  // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!!
+    "mel_fmax": 8000.0,  // maximum freq level for mel-spec. Tune for dataset!!
+    "do_trim_silence": false  // enable trimming of silence of audio as you load it. LJspeech (false), TWEB (false), Nancy (true)
     },
-    "embedding_size": 256,  // Character embedding vector length. You don't need to change it in general.
-    "text_cleaner": "phoneme_cleaners",
+    "distributed":{
+        "backend": "nccl",
+        "url": "tcp:\/\/localhost:54321"
+    },
+
+    "model": "Tacotron2",  // one of the models in models/
+    "grad_clip": 0.05,  // upper limit for gradients for clipping.
     "epochs": 1000,  // total number of epochs to train.
     "lr": 0.0001,  // Initial learning rate. If Noam decay is active, maximum learning rate.
     "lr_decay": false,  // if true, Noam learning rate decaying is applied through training.
-    "loss_weight": 0.0,  // loss weight to emphasize lower frequencies. Lower frequencies are in general more important for speech signals.
     "warmup_steps": 4000,  // Noam decay steps to increase the learning rate from 0 to "lr"
     "windowing": false,  // Enables attention windowing. Used only in eval mode.
-    "memory_size": 5,  // memory queue size used to queue network predictions to feed autoregressive connection. Useful if r < 5.
+    "memory_size": 5,  // TO BE IMPLEMENTED -- memory queue size used to queue network predictions to feed autoregressive connection. Useful if r < 5.
 
-    "batch_size": 32,  // Batch size for training. Lower values than 32 might cause hard to learn attention.
-    "eval_batch_size":32,
-    "r": 5,  // Number of frames to predict for step.
-    "wd": 0.00001,  // Weight decay weight.
+    "batch_size": 16,  // Batch size for training. Lower values than 32 might cause hard to learn attention.
+    "eval_batch_size":16,
+    "r": 1,  // Number of frames to predict for step.
+    "wd": 0.000002,  // Weight decay weight.
     "checkpoint": true,  // If true, it saves checkpoints per "save_step"
-    "save_step": 5000,  // Number of training steps expected to save traning stats and checkpoints.
-    "print_step": 50,  // Number of steps to log traning on console.
+    "save_step": 1000,  // Number of training steps expected to save training stats and checkpoints.
+    "print_step": 10,  // Number of steps to log training on console.
     "tb_model_param_stats": false,  // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging.
+    "batch_group_size": 8,  // Number of batches to shuffle after bucketing.
     "run_eval": true,
+    "test_delay_epochs": 100,  // Until attention is aligned, testing only wastes computation time.
     "data_path": "/media/erogol/data_ssd/Data/LJSpeech-1.1",  // DATASET-RELATED: can be overwritten from command argument
-    "meta_file_train": "prompts_train.data",  // DATASET-RELATED: metafile for training dataloader.
-    "meta_file_val": "prompts_val.data",  // DATASET-RELATED: metafile for evaluation dataloader.
- "dataset": "nancy", // DATASET-RELATED: one of TTS.dataset.preprocessors depending on your target dataset. Use "tts_cache" for pre-computed dataset by extract_features.py + "meta_file_train": "metadata_train.csv", // DATASET-RELATED: metafile for training dataloader. + "meta_file_val": "metadata_val.csv", // DATASET-RELATED: metafile for evaluation dataloader. + "dataset": "ljspeech", // DATASET-RELATED: one of TTS.dataset.preprocessors depending on your target dataset. Use "tts_cache" for pre-computed dataset by extract_features.py "min_seq_len": 0, // DATASET-RELATED: minimum text length to use in training - "max_seq_len": 300, // DATASET-RELATED: maximum text length - "output_path": "models/", // DATASET-RELATED: output path for all training outputs. + "max_seq_len": 1000, // DATASET-RELATED: maximum text length + "output_path": "../keep/", // DATASET-RELATED: output path for all training outputs. "num_loader_workers": 8, // number of training data loader processes. Don't set it too big. 4-8 are good values. "num_val_loader_workers": 4, // number of evaluation data loader processes. - "phoneme_cache_path": "phonemes_cache", // phoneme computation is slow, therefore, it caches results in the given folder. + "phoneme_cache_path": "ljspeech_us_phonemes", // phoneme computation is slow, therefore, it caches results in the given folder. "use_phonemes": true, // use phonemes instead of raw characters. It is suggested for better pronounciation. - "phoneme_language": "en-us" // depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages + "phoneme_language": "en-us", // depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages + "text_cleaner": "phoneme_cleaners" } diff --git a/distribute.py b/distribute.py index f4538cfd..c2f786fe 100644 --- a/distribute.py +++ b/distribute.py @@ -131,7 +131,7 @@ def main(args): Call train.py as a new process and pass command arguments """ CONFIG = load_config(args.config_path) - OUT_PATH = create_experiment_folder(CONFIG.output_path, CONFIG.model_name, + OUT_PATH = create_experiment_folder(CONFIG.output_path, CONFIG.run_name, True) stdout_path = os.path.join(OUT_PATH, "process_stdout/") diff --git a/train.py b/train.py index a0ee76c1..5aed4073 100644 --- a/train.py +++ b/train.py @@ -425,7 +425,7 @@ def main(args): print( " > Model restored from step %d" % checkpoint['step'], flush=True) start_epoch = checkpoint['epoch'] - best_loss = checkpoint['postnet_loss'] + # best_loss = checkpoint['postnet_loss'] args.restore_step = checkpoint['step'] else: args.restore_step = 0 diff --git a/utils/text/__init__.py b/utils/text/__init__.py index 76993d50..c66bce91 100644 --- a/utils/text/__init__.py +++ b/utils/text/__init__.py @@ -43,7 +43,7 @@ def phoneme_to_sequence(text, cleaner_names, language): ''' TODO: This ignores punctuations ''' - sequence = [] + sequence = [_phonemes_to_id['^']] clean_text = _clean_text(text, cleaner_names) phonemes = text2phone(clean_text, language) # print(phonemes.replace('|', '')) @@ -81,7 +81,7 @@ def text_to_sequence(text, cleaner_names): List of integers corresponding to the symbols in the text ''' sequence = [] - + sequence = [_phonemes_to_id['^']] # Check for curly braces and treat their contents as ARPAbet: while len(text): m = _curly_re.match(text) diff --git a/utils/text/symbols.py b/utils/text/symbols.py index a25f7c99..5fc20a5f 100644 --- a/utils/text/symbols.py +++ b/utils/text/symbols.py @@ -9,6 +9,7 @@ from utils.text import cmudict _pad = '_' 
 _eos = '~'
+_bos = '^'
 _characters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!\'(),-.:;? '
 _punctuations = '!\'(),-.:;? '
 _phoneme_punctuations = '.!;:,?'
@@ -24,8 +25,8 @@ _phonemes = sorted(list(set(_phonemes)))
 _arpabet = ['@' + s for s in _phonemes]
 
 # Export all symbols:
-symbols = [_pad, _eos] + list(_characters) + _arpabet
-phonemes = [_pad, _eos] + list(_phonemes) + list(_punctuations)
+symbols = [_pad, _eos, _bos] + list(_characters) + _arpabet
+phonemes = [_pad, _eos, _bos] + list(_phonemes) + list(_punctuations)
 
 if __name__ == '__main__':
     print(" > TTS symbols ")

From 772c859dffa2fe60f0a4d2fb7778be5344646e84 Mon Sep 17 00:00:00 2001
From: Eren Golge
Date: Thu, 7 Mar 2019 11:44:17 +0100
Subject: [PATCH 2/2] Change stop condition

---
 layers/tacotron2.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/layers/tacotron2.py b/layers/tacotron2.py
index 296ea7ec..e743186f 100644
--- a/layers/tacotron2.py
+++ b/layers/tacotron2.py
@@ -355,7 +355,7 @@ class Decoder(nn.Module):
             alignments += [alignment]
 
             stop_flags[0] = stop_flags[0] or gate_output > 0.5
-            stop_flags[1] = stop_flags[1] or alignment[0, -3:].sum() > 0.5
+            stop_flags[1] = stop_flags[1] or alignment[0, -2:].sum() > 0.5
             if all(stop_flags):
                 break
             elif len(outputs) == self.max_decoder_steps:
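
Note on the bos change in PATCH 1/2: '^' sits at index 2 of both `symbols` and `phonemes`, so every sequence built by `text_to_sequence` / `phoneme_to_sequence` now begins with the bos id taken from the matching lookup table (`_symbol_to_id` for characters, `_phonemes_to_id` for phonemes), giving attention an explicit first step instead of skipping the first character. A minimal, runnable sketch of the resulting behavior; the abbreviated symbol table and the helper `to_sequence` are illustrative stand-ins, not code from this patch:

# Sketch only: toy version of the symbol tables in utils/text/symbols.py
# with the character set abbreviated to 'abc'.
_pad, _eos, _bos = '_', '~', '^'
symbols = [_pad, _eos, _bos] + list('abc')
_symbol_to_id = {s: i for i, s in enumerate(symbols)}

def to_sequence(text):
    # bos id goes first, then one id per known character
    sequence = [_symbol_to_id[_bos]]
    sequence += [_symbol_to_id[c] for c in text if c in _symbol_to_id]
    return sequence

print(to_sequence('abc'))  # [2, 3, 4, 5] -- the leading 2 is the bos id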
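
Note on `batch_group_size` (config change in PATCH 1/2): the comment says batches are shuffled after bucketing, i.e. samples stay length-homogeneous per batch while epoch order still varies. The dataloader itself is not part of this patch, so the sketch below is only an assumption of those semantics; `bucket_shuffle` and the toy texts are hypothetical:

import random

def bucket_shuffle(items, batch_size, batch_group_size):
    # Sort by length, then shuffle inside each group of
    # batch_group_size * batch_size items, so consecutive batches
    # keep similar lengths but their contents differ per epoch.
    items = sorted(items, key=len)
    group = batch_size * batch_group_size
    out = []
    for i in range(0, len(items), group):
        chunk = items[i:i + group]
        random.shuffle(chunk)
        out.extend(chunk)
    return out

texts = ['hi', 'a', 'hello', 'long sentence here', 'ok now']
print(bucket_shuffle(texts, batch_size=2, batch_group_size=1))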
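
Note on PATCH 2/2: inference now stops only once both flags have fired: the stop gate exceeded 0.5, and the attention placed more than half of its mass on the last two (previously three) encoder steps, meaning the model has read to the end of the input. Narrowing the window makes premature stops on short attention tails less likely. A runnable sketch of the same two-flag logic, with `should_stop` and the toy tensors as stand-ins for the decoder internals:

import torch

def should_stop(stop_flags, gate_output, alignment):
    # flag 0: the stop token fired; flag 1: attention reached the input's end
    stop_flags[0] = stop_flags[0] or gate_output > 0.5
    stop_flags[1] = stop_flags[1] or alignment[0, -2:].sum() > 0.5
    return all(stop_flags)

flags = [False, False]
gate = 0.9                                      # gate says "stop"
align = torch.tensor([[0.05, 0.05, 0.3, 0.6]])  # mass on the final two steps
print(should_stop(flags, gate, align))          # True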