config updates, update audio.py, update mailabs preprocessor

This commit is contained in:
Eren Golge 2019-06-03 15:34:36 +02:00
parent f69127ff88
commit f096f1052f
5 changed files with 21 additions and 26 deletions

View File

@ -9,8 +9,8 @@ pip3 install https://download.pytorch.org/whl/cu100/torch-1.1.0-cp37-cp37m-linux
wget https://www.dropbox.com/s/wqn5v3wkktw9lmo/install.sh?dl=0 -O install.sh
sudo sh install.sh
python3 setup.py develop
# cp -R ${USER_DIR}/Mozilla_22050 ../tmp/
cp -R ${USER_DIR}/GermanData ../tmp/
python3 distribute.py --config_path config_tacotron.json --data_path ../tmp/GermanData/karlsson/
# python3 distribute.py --config_path config_cluster.json --data_path ${SHARED_DIR}/data/mozilla/Judy/
# cp -R ${USER_DIR}/GermanData ../tmp/
# python3 distribute.py --config_path config_tacotron_de.json --data_path ../tmp/GermanData/karlsson/
cp -R ${USER_DIR}/Mozilla_22050 ../tmp/
python3 distribute.py --config_path config_tacotron.json --data_path ../tmp/Mozilla_22050/ --restore_path /data/rw/home/4845.pth.tar
while true; do sleep 1000000; done

View File

@ -1,6 +1,6 @@
{
"run_name": "mozilla-tacotron-tagent",
"run_description": "using forward attention with transition agent, with original prenet, loss masking, separate stopnet, sigmoid norm. Compare this with 4841",
"run_name": "mozilla-tacotron-tagent-bn",
"run_description": "finetune 4845 with bn prenet.",
"audio":{
// Audio processing parameters
@ -40,7 +40,7 @@
"windowing": false, // Enables attention windowing. Used only in eval mode.
"memory_size": 5, // ONLY TACOTRON - memory queue size used to queue network predictions to feed autoregressive connection. Useful if r < 5.
"attention_norm": "sigmoid", // softmax or sigmoid. Suggested to use softmax for Tacotron2 and sigmoid for Tacotron.
"prenet_type": "original", // ONLY TACOTRON2 - "original" or "bn".
"prenet_type": "bn", // ONLY TACOTRON2 - "original" or "bn".
"prenet_dropout": true, // ONLY TACOTRON2 - enable/disable dropout at prenet.
"use_forward_attn": true, // ONLY TACOTRON2 - if it uses forward attention. In general, it aligns faster.
"transition_agent": true, // ONLY TACOTRON2 - enable/disable transition agent of forward attention.
@ -62,6 +62,7 @@
"run_eval": true,
"test_delay_epochs": 5, //Until attention is aligned, testing only wastes computation time.
"test_sentences_file": null,
"data_path": "/media/erogol/data_ssd/Data/Mozilla/", // DATASET-RELATED: can be overwritten from command argument
"meta_file_train": "metadata_train.txt", // DATASET-RELATED: metafile for training dataloader.
"meta_file_val": "metadata_val.txt", // DATASET-RELATED: metafile for evaluation dataloader.

View File

@ -1,6 +1,6 @@
{
"run_name": "german-tacotron-tagent",
"run_description": "using forward attention with transition agent, with original prenet, loss masking, separate stopnet, sigmoid norm. First run German data.",
"run_name": "german-tacotron-tagent-bn",
"run_description": "train german",
"audio":{
// Audio processing parameters
@ -65,25 +65,16 @@
"test_delay_epochs": 5, //Until attention is aligned, testing only wastes computation time.
"data_path": "/media/erogol/data_ssd/Data/Mozilla/", // DATASET-RELATED: can be overwritten from command argument
"meta_file_train": [
"kleinzaches/metadata.csv",
"spiegel_kaetzchen/metadata.csv",
"herrnarnesschatz/metadata.csv",
"maedchen_von_moorhof/metadata.csv",
"koenigsgaukler/metadata.csv",
"altehous/metadata.csv",
"odysseus/metadata.csv",
"undine/metadata.csv",
"reise_tilsit/metadata.csv",
"schmied_seines_glueckes/metadata.csv",
"kammmacher/metadata.csv",
"unterm_birnbaum/metadata.csv",
"liebesbriefe/metadata.csv",
"sandmann/metadata.csv"], // DATASET-RELATED: metafile for training dataloader.
"grune_haus/metadata.csv",
"kleine_lord/metadata.csv",
"toten_seelen/metadata.csv",
"werde_die_du_bist/metadata.csv"
], // DATASET-RELATED: metafile for training dataloader.
"meta_file_val": "metadata_val.txt", // DATASET-RELATED: metafile for evaluation dataloader.
"dataset": "mailabs", // DATASET-RELATED: one of TTS.dataset.preprocessors depending on your target dataset. Use "tts_cache" for pre-computed dataset by extract_features.py
"min_seq_len": 0, // DATASET-RELATED: minimum text length to use in training
"max_seq_len": 150, // DATASET-RELATED: maximum text length
"output_path": "../keep/", // DATASET-RELATED: output path for all training outputs.
"max_seq_len": 200, // DATASET-RELATED: maximum text length
"output_path": "/media/erogol/data_ssd/Data/models/german/", // DATASET-RELATED: output path for all training outputs.
"num_loader_workers": 4, // number of training data loader processes. Don't set it too big. 4-8 are good values.
"num_val_loader_workers": 4, // number of evaluation data loader processes.
"phoneme_cache_path": "phoneme_cache", // phoneme computation is slow, therefore, it caches results in the given folder.

View File

@ -1,4 +1,5 @@
import os
from glob import glob
def tweb(root_path, meta_file):
@ -60,6 +61,8 @@ def mozilla(root_path, meta_file):
def mailabs(root_path, meta_files):
"""Normalizes M-AI-Labs meta data files to TTS format"""
if meta_files is None:
meta_files = glob(root_path+"/**/metadata.csv", recursive=True)
folders = [os.path.dirname(f.strip()) for f in meta_files]
# meta_files = [f.strip() for f in meta_files.split(",")]
items = []

View File

@ -243,7 +243,7 @@ class AudioProcessor(object):
if self.do_trim_silence:
x = self.trim_silence(x)
# sr, x = io.wavfile.read(filename)
assert self.sample_rate == sr
assert self.sample_rate == sr, "%s vs %s"%(self.sample_rate, sr)
return x
def encode_16bits(self, x):