diff --git a/.compute b/.compute
index 65a91011..8fa63c04 100644
--- a/.compute
+++ b/.compute
@@ -9,8 +9,8 @@ pip3 install https://download.pytorch.org/whl/cu100/torch-1.1.0-cp37-cp37m-linux
 wget https://www.dropbox.com/s/wqn5v3wkktw9lmo/install.sh?dl=0 -O install.sh
 sudo sh install.sh
 python3 setup.py develop
-# cp -R ${USER_DIR}/Mozilla_22050 ../tmp/
-cp -R ${USER_DIR}/GermanData ../tmp/
-python3 distribute.py --config_path config_tacotron.json --data_path ../tmp/GermanData/karlsson/
-# python3 distribute.py --config_path config_cluster.json --data_path ${SHARED_DIR}/data/mozilla/Judy/
+# cp -R ${USER_DIR}/GermanData ../tmp/
+# python3 distribute.py --config_path config_tacotron_de.json --data_path ../tmp/GermanData/karlsson/
+cp -R ${USER_DIR}/Mozilla_22050 ../tmp/
+python3 distribute.py --config_path config_tacotron.json --data_path ../tmp/Mozilla_22050/ --restore_path /data/rw/home/4845.pth.tar
 while true; do sleep 1000000; done
diff --git a/config_tacotron.json b/config_tacotron.json
index 968eae1e..61435ee7 100644
--- a/config_tacotron.json
+++ b/config_tacotron.json
@@ -1,6 +1,6 @@
 {
-    "run_name": "mozilla-tacotron-tagent",
-    "run_description": "using forward attention with transition agent, with original prenet, loss masking, separate stopnet, sigmoid norm. Compare this with 4841",
+    "run_name": "mozilla-tacotron-tagent-bn",
+    "run_description": "finetune 4845 with bn prenet.",
 
     "audio":{
         // Audio processing parameters
@@ -40,7 +40,7 @@
     "windowing": false,          // Enables attention windowing. Used only in eval mode.
     "memory_size": 5,            // ONLY TACOTRON - memory queue size used to queue network predictions to feed autoregressive connection. Useful if r < 5.
     "attention_norm": "sigmoid", // softmax or sigmoid. Suggested to use softmax for Tacotron2 and sigmoid for Tacotron.
-    "prenet_type": "original",   // ONLY TACOTRON2 - "original" or "bn".
+    "prenet_type": "bn",         // ONLY TACOTRON2 - "original" or "bn".
     "prenet_dropout": true,      // ONLY TACOTRON2 - enable/disable dropout at prenet.
     "use_forward_attn": true,    // ONLY TACOTRON2 - if it uses forward attention. In general, it aligns faster.
     "transition_agent": true,    // ONLY TACOTRON2 - enable/disable transition agent of forward attention.
@@ -62,6 +62,7 @@
 
     "run_eval": true,
     "test_delay_epochs": 5,  //Until attention is aligned, testing only wastes computation time.
+    "test_sentences_file": null,
     "data_path": "/media/erogol/data_ssd/Data/Mozilla/",  // DATASET-RELATED: can overwritten from command argument
     "meta_file_train": "metadata_train.txt",      // DATASET-RELATED: metafile for training dataloader.
     "meta_file_val": "metadata_val.txt",    // DATASET-RELATED: metafile for evaluation dataloader.
diff --git a/config_tacotron_de.json b/config_tacotron_de.json
index 7f221c64..bf80bcc6 100644
--- a/config_tacotron_de.json
+++ b/config_tacotron_de.json
@@ -1,6 +1,6 @@
 {
-    "run_name": "german-tacotron-tagent",
-    "run_description": "using forward attention with transition agent, with original prenet, loss masking, separate stopnet, sigmoid norm. First run German data.",
+    "run_name": "german-tacotron-tagent-bn",
+    "run_description": "train german",
 
     "audio":{
         // Audio processing parameters
@@ -65,25 +65,16 @@
     "test_delay_epochs": 5,  //Until attention is aligned, testing only wastes computation time.
     "data_path": "/media/erogol/data_ssd/Data/Mozilla/",  // DATASET-RELATED: can overwritten from command argument
     "meta_file_train": [
-        "kleinzaches/metadata.csv",
-        "spiegel_kaetzchen/metadata.csv",
-        "herrnarnesschatz/metadata.csv",
-        "maedchen_von_moorhof/metadata.csv",
-        "koenigsgaukler/metadata.csv",
-        "altehous/metadata.csv",
-        "odysseus/metadata.csv",
-        "undine/metadata.csv",
-        "reise_tilsit/metadata.csv",
-        "schmied_seines_glueckes/metadata.csv",
-        "kammmacher/metadata.csv",
-        "unterm_birnbaum/metadata.csv",
-        "liebesbriefe/metadata.csv",
-        "sandmann/metadata.csv"],  // DATASET-RELATED: metafile for training dataloader.
+        "grune_haus/metadata.csv",
+        "kleine_lord/metadata.csv",
+        "toten_seelen/metadata.csv",
+        "werde_die_du_bist/metadata.csv"
+    ],  // DATASET-RELATED: metafile for training dataloader.
     "meta_file_val": "metadata_val.txt",    // DATASET-RELATED: metafile for evaluation dataloader.
     "dataset": "mailabs",      // DATASET-RELATED: one of TTS.dataset.preprocessors depending on your target dataset. Use "tts_cache" for pre-computed dataset by extract_features.py
     "min_seq_len": 0,       // DATASET-RELATED: minimum text length to use in training
-    "max_seq_len": 150,     // DATASET-RELATED: maximum text length
-    "output_path": "../keep/",      // DATASET-RELATED: output path for all training outputs.
+    "max_seq_len": 200,     // DATASET-RELATED: maximum text length
+    "output_path": "/media/erogol/data_ssd/Data/models/german/",      // DATASET-RELATED: output path for all training outputs.
     "num_loader_workers": 4,        // number of training data loader processes. Don't set it too big. 4-8 are good values.
     "num_val_loader_workers": 4,    // number of evaluation data loader processes.
     "phoneme_cache_path": "phoneme_cache",  // phoneme computation is slow, therefore, it caches results in the given folder.
diff --git a/datasets/preprocess.py b/datasets/preprocess.py
index f8e7e63e..07c025df 100644
--- a/datasets/preprocess.py
+++ b/datasets/preprocess.py
@@ -1,4 +1,5 @@
 import os
+from glob import glob
 
 
 def tweb(root_path, meta_file):
@@ -60,6 +61,8 @@ def mozilla(root_path, meta_file):
 
 def mailabs(root_path, meta_files):
     """Normalizes M-AI-Labs meta data files to TTS format"""
+    if meta_files is None:
+        meta_files = glob(root_path+"/**/metadata.csv", recursive=True)
     folders = [os.path.dirname(f.strip()) for f in meta_files]
     # meta_files = [f.strip() for f in meta_files.split(",")]
     items = []
diff --git a/utils/audio.py b/utils/audio.py
index 1a5c52a3..0feb3653 100644
--- a/utils/audio.py
+++ b/utils/audio.py
@@ -243,7 +243,7 @@ class AudioProcessor(object):
         if self.do_trim_silence:
             x = self.trim_silence(x)
         # sr, x = io.wavfile.read(filename)
-        assert self.sample_rate == sr
+        assert self.sample_rate == sr, "%s vs %s"%(self.sample_rate, sr)
         return x
 
     def encode_16bits(self, x):