adding more tests and refactoring

2020-11-09 13:30:42 +01:00 · 2020-11-09 13:30:42 +01:00 · 116e2299b0
parent c76a617072
commit 116e2299b0
7 changed files with 614 additions and 1 deletions
--- a/notebooks/PlotUmapLibriTTS.ipynb
+++ b/notebooks/PlotUmapLibriTTS.ipynb
@ -0,0 +1,325 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Overview\n",
+    "\n",
+    "This notebook can be used with either a single- or multi-speaker corpus and allows interactive plotting of speaker embeddings linked to the underlying audio (see the instructions in the repo's speaker_embedding directory)\n",
+    "\n",
+    "Depending on the directory structure used for your corpus, you may need to adjust handling of **speaker_to_utter** and **locations**."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import glob\n",
+    "import random\n",
+    "import numpy as np\n",
+    "import torch\n",
+    "import umap\n",
+    "\n",
+    "from TTS.speaker_encoder.model import SpeakerEncoder\n",
+    "from TTS.tts.utils.audio import AudioProcessor\n",
+    "from TTS.tts.utils.generic_utils import load_config\n",
+    "\n",
+    "from bokeh.io import output_notebook, show\n",
+    "from bokeh.plotting import figure\n",
+    "from bokeh.models import HoverTool, ColumnDataSource, BoxZoomTool, ResetTool, OpenURL, TapTool\n",
+    "from bokeh.transform import factor_cmap, factor_mark\n",
+    "from bokeh.palettes import Category10"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "For larger sets of speakers, you can use **Category20**, but you need to change it in the **pal** variable too\n",
+    "\n",
+    "List of Bokeh palettes here: http://docs.bokeh.org/en/1.4.0/docs/reference/palettes.html\n",
+    "\n",
+    "**NB:** if you have problems with other palettes, first see https://stackoverflow.com/questions/48333820/why-do-some-bokeh-palettes-raise-a-valueerror-when-used-in-factor-cmap"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "output_notebook()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "You should also adjust all the path constants to point at the relevant locations for you locally"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "MODEL_RUN_PATH = \"/media/erogol/data_ssd/Models/libri_tts/speaker_encoder/libritts_360-half-October-31-2019_04+54PM-19d2f5f/\"\n",
+    "MODEL_PATH = MODEL_RUN_PATH + \"best_model.pth.tar\"\n",
+    "CONFIG_PATH = MODEL_RUN_PATH + \"config.json\"\n",
+    "\n",
+    "# My single speaker locations\n",
+    "#EMBED_PATH = \"/home/neil/main/Projects/TTS3/embeddings/neil14/\"\n",
+    "#AUDIO_PATH = \"/home/neil/data/Projects/NeilTTS/neil14/wavs/\"\n",
+    "\n",
+    "# My multi speaker locations\n",
+    "EMBED_PATH = \"/home/erogol/Data/Libri-TTS/train-clean-360-embed_128/\"\n",
+    "AUDIO_PATH = \"/home/erogol/Data/Libri-TTS/train-clean-360/\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!ls -1 $MODEL_RUN_PATH"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "CONFIG = load_config(CONFIG_PATH)\n",
+    "ap = AudioProcessor(**CONFIG['audio'])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Bring in the embeddings created by **compute_embeddings.py**"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "embed_files = glob.glob(EMBED_PATH+\"/**/*.npy\", recursive=True)\n",
+    "print(f'Embeddings found: {len(embed_files)}')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Check that we did indeed find an embedding"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "embed_files[0]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Process the speakers\n",
+    "\n",
+    "Assumes count of **speaker_paths** corresponds to number of speakers (so a corpus in just one directory would be treated like a single speaker and the multiple directories of LibriTTS are treated as distinct speakers)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Group the embedding files by speaker; the speaker directory is taken to be two levels above each file.\n",
+    "speaker_to_utter = {}\n",
+    "for embed_file in embed_files:\n",
+    "    speaker_path = os.path.dirname(os.path.dirname(embed_file))\n",
+    "    # setdefault replaces the bare try/except, which would also have swallowed unrelated errors\n",
+    "    speaker_to_utter.setdefault(speaker_path, []).append(embed_file)\n",
+    "# sorted() gives a stable speaker order; the order of list(set(...)) can differ between kernel sessions\n",
+    "speaker_paths = sorted(speaker_to_utter)\n",
+    "print(f'Speaker count: {len(speaker_paths)}')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Set up the embeddings\n",
+    "\n",
+    "Adjust the number of speakers to select and the number of utterances from each speaker and they will be randomly sampled from the corpus"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "embeds = []\n",
+    "labels = []\n",
+    "locations = []\n",
+    "\n",
+    "# single speaker\n",
+    "#num_speakers = 1\n",
+    "#num_utters = 1000\n",
+    "\n",
+    "# multi speaker\n",
+    "num_speakers = 10\n",
+    "num_utters = 20\n",
+    "\n",
+    "# Cap the request at the number of speakers found, otherwise choice(..., replace=False) raises ValueError\n",
+    "speaker_idxs = np.random.choice(len(speaker_paths), min(num_speakers, len(speaker_paths)), replace=False)\n",
+    "\n",
+    "for speaker_num, speaker_idx in enumerate(speaker_idxs):\n",
+    "    speaker_path = speaker_paths[speaker_idx]\n",
+    "    speakers_utter = speaker_to_utter[speaker_path]\n",
+    "    # Sample without replacement so the same utterance is never plotted twice\n",
+    "    utter_idxs = np.random.choice(len(speakers_utter), min(num_utters, len(speakers_utter)), replace=False)\n",
+    "    for utter_idx in utter_idxs:\n",
+    "        embed_path = speakers_utter[utter_idx]\n",
+    "        embeds.append(np.load(embed_path))\n",
+    "        labels.append(str(speaker_num))\n",
+    "        locations.append(embed_path.replace(EMBED_PATH, '').replace('.npy','.wav'))\n",
+    "embeds = np.concatenate(embeds)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Reduce the embeddings to a low-dimensional projection with UMAP for plotting"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "model = umap.UMAP()\n",
+    "projection = model.fit_transform(embeds)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Interactively charting the data in Bokeh\n",
+    "\n",
+    "Set up various details for Bokeh to plot the data\n",
+    "\n",
+    "You can use the regular Bokeh [tools](http://docs.bokeh.org/en/1.4.0/docs/user_guide/tools.html?highlight=tools) to explore the data, with reset setting it back to normal\n",
+    "\n",
+    "Once you have started the local server (see cell below) you can then click on plotted points which will open a tab to play the audio for that point, enabling easy exploration of your corpus\n",
+    "\n",
+    "File location in the tooltip is given relative to **AUDIO_PATH**"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "source_wav_stems = ColumnDataSource(\n",
+    "        data=dict(\n",
+    "            x = projection.T[0].tolist(),\n",
+    "            y = projection.T[1].tolist(),\n",
+    "            desc=locations,\n",
+    "            label=labels\n",
+    "        )\n",
+    "    )\n",
+    "\n",
+    "hover = HoverTool(\n",
+    "        tooltips=[\n",
+    "            (\"file\", \"@desc\"),\n",
+    "            (\"speaker\", \"@label\"),\n",
+    "        ]\n",
+    "    )\n",
+    "\n",
+    "# optionally consider adding these to the tooltips if you want additional detail\n",
+    "# for the coordinates: (\"(x,y)\", \"($x, $y)\"),\n",
+    "# for the index of the embedding / wav file: (\"index\", \"$index\"),\n",
+    "\n",
+    "factors = sorted(set(labels), key=int)  # numeric order keeps the speaker-to-colour mapping stable between runs\n",
+    "pal_size = max(len(factors), 3)  # never ask the palette for fewer than 3 colours\n",
+    "pal = Category10[pal_size]  # Category10 stops at 10 colours; use Category20 for more speakers\n",
+    "\n",
+    "p = figure(plot_width=600, plot_height=400, tools=[hover,BoxZoomTool(), ResetTool(), TapTool()])\n",
+    "\n",
+    "\n",
+    "p.circle('x', 'y',  source=source_wav_stems, color=factor_cmap('label', palette=pal, factors=factors),)\n",
+    "\n",
+    "url = \"http://localhost:8000/@desc\"\n",
+    "taptool = p.select(type=TapTool)\n",
+    "taptool.callback = OpenURL(url=url)\n",
+    "\n",
+    "show(p)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Local server to serve wav files from corpus\n",
+    "\n",
+    "This is required so that when you click on a data point the hyperlink associated with it will be served the file locally.\n",
+    "\n",
+    "There are other ways to serve this if you prefer and you can also run the commands manually on the command line\n",
+    "\n",
+    "The server will continue to run until stopped. To stop it, simply interrupt the kernel (i.e. the square button or the Kernel menu)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%cd $AUDIO_PATH\n",
+    "%pwd\n",
+    "!python -m http.server"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.4"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
--- a/run_tests.sh
+++ b/run_tests.sh
@ -1,7 +1,7 @@
 TF_CPP_MIN_LOG_LEVEL=3

 # tests
-# nosetests tests -x &&\
+nosetests tests -x &&\

 # runtime tests
 ./tests/test_server_package.sh && \
--- a/tests/inputs/test_glow_tts.json
+++ b/tests/inputs/test_glow_tts.json
@ -0,0 +1,134 @@
+{
+    "model": "glow_tts",
+    "run_name": "glow-tts-gatedconv",
+    "run_description": "glow-tts model training with gated conv.",
+
+    // AUDIO PARAMETERS
+    "audio":{
+        "fft_size": 1024,         // number of stft frequency levels. Size of the linear spectrogram frame.
+        "win_length": 1024,      // stft window length in samples.
+        "hop_length": 256,       // stft window hop-length in samples.
+        "frame_length_ms": null, // stft window length in ms. If null, 'win_length' is used.
+        "frame_shift_ms": null,  // stft window hop-length in ms. If null, 'hop_length' is used.
+
+        // Audio processing parameters
+        "sample_rate": 22050,   // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled.
+        "preemphasis": 0.0,     // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis.
+        "ref_level_db": 0,     // reference level db, theoretically 20db is the sound of air.
+
+        // Griffin-Lim
+        "power": 1.1,           // value to sharpen wav signals after GL algorithm.
+        "griffin_lim_iters": 60,// #griffin-lim iterations. 30-60 is a good range. Larger the value, slower the generation.
+
+        // Silence trimming
+        "do_trim_silence": true,// enable trimming of silence of audio as you load it. LJspeech (false), TWEB (false), Nancy (true)
+        "trim_db": 60,          // threshold for trimming silence. Set this according to your dataset.
+
+        // MelSpectrogram parameters
+        "num_mels": 80,         // size of the mel spec frame.
+        "mel_fmin": 50.0,        // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!!
+        "mel_fmax": 7600.0,     // maximum freq level for mel-spec. Tune for dataset!!
+        "spec_gain": 1.0,         // scalar value applied after log transform of spectrogram.
+
+        // Normalization parameters
+        "signal_norm": true,    // normalize spec values. Mean-Var normalization if 'stats_path' is defined otherwise range normalization defined by the other params.
+        "min_level_db": -100,   // lower bound for normalization
+        "symmetric_norm": true, // move normalization to range [-1, 1]
+        "max_norm": 1.0,        // scale normalization to range [-max_norm, max_norm] or [0, max_norm]
+        "clip_norm": true,      // clip normalized values into the range.
+        "stats_path": null    // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based normalization is used and other normalization params are ignored
+    },
+
+    // VOCABULARY PARAMETERS
+    // if custom character set is not defined,
+    // default set in symbols.py is used
+    // "characters":{
+    //     "pad": "_",
+    //     "eos": "~",
+    //     "bos": "^",
+    //     "characters": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!'(),-.:;? ",
+    //     "punctuations":"!'(),-.:;? ",
+    //     "phonemes":"iyɨʉɯuɪʏʊeøɘəɵɤoɛœɜɞʌɔæɐaɶɑɒᵻʘɓǀɗǃʄǂɠǁʛpbtdʈɖcɟkɡqɢʔɴŋɲɳnɱmʙrʀⱱɾɽɸβfvθðszʃʒʂʐçʝxɣχʁħʕhɦɬɮʋɹɻjɰlɭʎʟˈˌːˑʍwɥʜʢʡɕʑɺɧɚ˞ɫ"
+    // },
+
+    "add_blank": false, // if true add a new token after each token of the sentence. This increases the size of the input sequence, but has considerably improved the prosody of the GlowTTS model.
+
+    // DISTRIBUTED TRAINING
+    "mixed_precision": false,
+    "distributed":{
+        "backend": "nccl",
+        "url": "tcp:\/\/localhost:54323"
+    },
+
+    "reinit_layers": [],    // give a list of layer names to restore from the given checkpoint. If not defined, it reloads all heuristically matching layers.
+
+    // MODEL PARAMETERS
+    "use_mas": false,       // use Monotonic Alignment Search if true. Otherwise use pre-computed attention alignments.
+
+    // TRAINING
+    "batch_size": 2,       // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'.
+    "eval_batch_size":1,
+    "r": 1,                 // Number of decoder frames to predict per iteration. Set the initial values if gradual training is enabled.
+    "loss_masking": true,   // enable / disable loss masking against the sequence padding.
+
+    // VALIDATION
+    "run_eval": true,
+    "test_delay_epochs": 0,       //Until attention is aligned, testing only wastes computation time.
+    "test_sentences_file": null,  // set a file to load sentences to be used for testing. If it is null then we use default english sentences.
+
+    // OPTIMIZER
+    "noam_schedule": true,         // use noam warmup and lr schedule.
+    "grad_clip": 5.0,              // upper limit for gradients for clipping.
+    "epochs": 1,               // total number of epochs to train.
+    "lr": 1e-3,                    // Initial learning rate. If Noam decay is active, maximum learning rate.
+    "wd": 0.000001,                // Weight decay weight.
+    "warmup_steps": 4000,          // Noam decay steps to increase the learning rate from 0 to "lr"
+    "seq_len_norm": false,         // Normalize each sample loss with its length to alleviate imbalanced datasets. Use it if your dataset is small or has skewed distribution of sequence lengths.
+
+    "encoder_type": "gatedconv",
+
+    // TENSORBOARD and LOGGING
+    "print_step": 25,       // Number of steps to log training on console.
+    "tb_plot_step": 100,    // Number of steps to plot TB training figures.
+    "print_eval": false,     // If True, it prints intermediate loss values in evaluation.
+    "save_step": 5000,      // Number of training steps expected to save training stats and checkpoints.
+    "checkpoint": true,     // If true, it saves checkpoints per "save_step"
+    "tb_model_param_stats": false,     // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging.
+    "apex_amp_level": null,
+
+    // DATA LOADING
+    "text_cleaner": "phoneme_cleaners",
+    "enable_eos_bos_chars": false, // enable/disable beginning of sentence and end of sentence chars.
+    "num_loader_workers": 4,        // number of training data loader processes. Don't set it too big. 4-8 are good values.
+    "num_val_loader_workers": 4,    // number of evaluation data loader processes.
+    "batch_group_size": 0,  //Number of batches to shuffle after bucketing.
+    "min_seq_len": 3,       // DATASET-RELATED: minimum text length to use in training
+    "max_seq_len": 500,     // DATASET-RELATED: maximum text length
+    "compute_f0": false,     // compute f0 values in data-loader
+
+    // PATHS
+    "output_path": "tests/train_outputs/",
+
+    // PHONEMES
+    "phoneme_cache_path": "tests/outputs/phoneme_cache/",  // phoneme computation is slow, therefore, it caches results in the given folder.
+    "use_phonemes": true,           // use phonemes instead of raw characters. It is suggested for better pronunciation.
+    "phoneme_language": "en-us",     // depending on your target language, pick one from  https://github.com/bootphon/phonemizer#languages
+
+    // MULTI-SPEAKER and GST
+    "use_external_speaker_embedding_file": false,
+    "external_speaker_embedding_file": null,
+    "use_speaker_embedding": false,     // use speaker embedding to enable multi-speaker learning.
+
+    // DATASETS
+    "datasets":   // List of datasets. They all merged and they get different speaker_ids.
+        [
+            {
+                "name": "ljspeech",
+                "path": "tests/data/ljspeech/",
+                "meta_file_train": "metadata.csv",
+                "meta_file_val": "metadata.csv"
+            }
+        ]
+}
+
+
--- a/tests/inputs/test_vocoder_wavegrad.json
+++ b/tests/inputs/test_vocoder_wavegrad.json
@ -0,0 +1,113 @@
+{
+    "run_name": "wavegrad-ljspeech",
+    "run_description": "wavegrad ljspeech",
+
+    "audio":{
+        "fft_size": 1024,         // number of stft frequency levels. Size of the linear spectrogram frame.
+        "win_length": 1024,      // stft window length in samples.
+        "hop_length": 256,       // stft window hop-length in samples.
+        "frame_length_ms": null, // stft window length in ms. If null, 'win_length' is used.
+        "frame_shift_ms": null,  // stft window hop-length in ms. If null, 'hop_length' is used.
+
+        // Audio processing parameters
+        "sample_rate": 22050,   // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled.
+        "preemphasis": 0.0,     // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis.
+        "ref_level_db": 0,     // reference level db, theoretically 20db is the sound of air.
+
+        // Silence trimming
+        "do_trim_silence": true,// enable trimming of silence of audio as you load it. LJspeech (false), TWEB (false), Nancy (true)
+        "trim_db": 60,          // threshold for trimming silence. Set this according to your dataset.
+
+        // MelSpectrogram parameters
+        "num_mels": 80,         // size of the mel spec frame.
+        "mel_fmin": 50.0,        // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!!
+        "mel_fmax": 7600.0,     // maximum freq level for mel-spec. Tune for dataset!!
+        "spec_gain": 1.0,         // scalar value applied after log transform of spectrogram.
+
+        // Normalization parameters
+        "signal_norm": true,    // normalize spec values. Mean-Var normalization if 'stats_path' is defined otherwise range normalization defined by the other params.
+        "min_level_db": -100,   // lower bound for normalization
+        "symmetric_norm": true, // move normalization to range [-1, 1]
+        "max_norm": 4.0,        // scale normalization to range [-max_norm, max_norm] or [0, max_norm]
+        "clip_norm": true,      // clip normalized values into the range.
+        "stats_path": null      // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based normalization is used and other normalization params are ignored
+    },
+
+    // DISTRIBUTED TRAINING
+    "mixed_precision": false,
+    "distributed":{
+        "backend": "nccl",
+        "url": "tcp:\/\/localhost:54322"
+    },
+
+    "target_loss": "avg_wavegrad_loss",  // loss value to pick the best model to save after each epoch
+
+    // MODEL PARAMETERS
+    "generator_model": "wavegrad",
+    "model_params":{
+        "y_conv_channels":32,
+        "x_conv_channels":768,
+        "ublock_out_channels": [512, 512, 256, 128, 128],
+        "dblock_out_channels": [128, 128, 256, 512],
+        "upsample_factors": [4, 4, 4, 2, 2],
+        "upsample_dilations": [
+            [1, 2, 1, 2],
+            [1, 2, 1, 2],
+            [1, 2, 4, 8],
+            [1, 2, 4, 8],
+            [1, 2, 4, 8]]
+    },
+
+    // DATASET
+    "data_path": "tests/data/ljspeech/wavs/",  // root data path. It finds all wav files recursively from there.
+    "feature_path": null,   // if you use precomputed features
+    "seq_len": 6144,        // 24 * hop_length
+    "pad_short": 0,      // additional padding for short wavs
+    "conv_pad": 0,          // additional padding against convolutions applied to spectrograms
+    "use_noise_augment": false,     // add noise to the audio signal for augmentation
+    "use_cache": true,      // use in memory cache to keep the computed features. This might cause OOM.
+
+    "reinit_layers": [],    // give a list of layer names to restore from the given checkpoint. If not defined, it reloads all heuristically matching layers.
+
+    // TRAINING
+    "batch_size": 1,      // Batch size for training.
+    "train_noise_schedule":{
+        "min_val": 1e-6,
+        "max_val": 1e-2,
+        "num_steps": 1000
+    },
+    "test_noise_schedule":{
+        "min_val": 1e-6,
+        "max_val": 1e-2,
+        "num_steps": 2
+    },
+
+    // VALIDATION
+    "run_eval": true,       // enable/disable evaluation run
+
+    // OPTIMIZER
+    "epochs": 1,                // total number of epochs to train.
+    "clip_grad": 1.0,                 // Generator gradient clipping threshold. Apply gradient clipping if > 0
+    "lr_scheduler": "MultiStepLR",  // one of the schedulers from https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate
+    "lr_scheduler_params": {
+        "gamma": 0.5,
+        "milestones": [100000, 200000, 300000, 400000, 500000, 600000]
+    },
+    "lr": 1e-4,                  // Initial learning rate. If Noam decay is active, maximum learning rate.
+
+    // TENSORBOARD and LOGGING
+    "print_step": 250,       // Number of steps to log training on console.
+    "print_eval": false,     // If True, it prints loss values for each step in eval run.
+    "save_step": 10000,      // Number of training steps expected to plot training stats on TB and save model checkpoints.
+    "checkpoint": true,     // If true, it saves checkpoints per "save_step"
+    "tb_model_param_stats": true,     // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging.
+
+    // DATA LOADING
+    "num_loader_workers": 4,        // number of training data loader processes. Don't set it too big. 4-8 are good values.
+    "num_val_loader_workers": 4,    // number of evaluation data loader processes.
+    "eval_split_size": 4,
+
+    // PATHS
+    "output_path": "tests/train_outputs/"
+}
+
--- a/tests/test_glow-tts_train.sh
+++ b/tests/test_glow-tts_train.sh
@ -0,0 +1,13 @@
+#!/usr/bin/env bash
+
+BASEDIR=$(dirname "$0")
+echo "$BASEDIR"
+# run training
+CUDA_VISIBLE_DEVICES="" python TTS/bin/train_glow_tts.py --config_path $BASEDIR/inputs/test_glow_tts.json
+# find the training folder
+LATEST_FOLDER=$(ls $BASEDIR/train_outputs/| sort | tail -1)
+echo $LATEST_FOLDER
+# continue the previous training
+CUDA_VISIBLE_DEVICES=""  python TTS/bin/train_glow_tts.py --continue_path $BASEDIR/train_outputs/$LATEST_FOLDER
+# remove all the outputs
+rm -rf $BASEDIR/train_outputs/
--- a/tests/test_tts_train.sh
+++ b/tests/test_tts_train.sh
@ -0,0 +1,13 @@
+#!/usr/bin/env bash
+
+BASEDIR=$(dirname "$0")
+echo "$BASEDIR"
+# run training
+CUDA_VISIBLE_DEVICES="" python TTS/bin/train_tacotron.py --config_path $BASEDIR/inputs/test_train_config.json
+# find the training folder
+LATEST_FOLDER=$(ls $BASEDIR/train_outputs/| sort | tail -1)
+echo $LATEST_FOLDER
+# continue the previous training
+CUDA_VISIBLE_DEVICES=""  python TTS/bin/train_tacotron.py --continue_path $BASEDIR/train_outputs/$LATEST_FOLDER
+# remove all the outputs
+rm -rf $BASEDIR/train_outputs/
--- a/tests/test_vocoder_wavegrad_train.sh
+++ b/tests/test_vocoder_wavegrad_train.sh
@ -0,0 +1,15 @@
+#!/usr/bin/env bash
+
+BASEDIR=$(dirname "$0")
+echo "$BASEDIR"
+# create run dir
+mkdir $BASEDIR/train_outputs
+# run training
+CUDA_VISIBLE_DEVICES="" python TTS/bin/train_vocoder_wavegrad.py --config_path $BASEDIR/inputs/test_vocoder_wavegrad.json
+# find the training folder
+LATEST_FOLDER=$(ls $BASEDIR/train_outputs/| sort | tail -1)
+echo $LATEST_FOLDER
+# continue the previous training
+CUDA_VISIBLE_DEVICES=""  python TTS/bin/train_vocoder_wavegrad.py --continue_path $BASEDIR/train_outputs/$LATEST_FOLDER
+# remove all the outputs
+rm -rf $BASEDIR/train_outputs/$LATEST_FOLDER