diff --git a/notebooks/PlotUmapLibriTTS.ipynb b/notebooks/PlotUmapLibriTTS.ipynb
new file mode 100644
index 00000000..46e1ac67
--- /dev/null
+++ b/notebooks/PlotUmapLibriTTS.ipynb
@@ -0,0 +1,325 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Overview\n",
+    "\n",
+    "This notebook works with either a single-speaker or a multi-speaker corpus and allows interactive plotting of speaker embeddings linked to the underlying audio (see the instructions in the repo's speaker_embedding directory).\n",
+    "\n",
+    "Depending on the directory structure of your corpus, you may need to adjust the handling of **speaker_to_utter** and **locations**."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import glob\n",
+    "import random\n",
+    "import numpy as np\n",
+    "import torch\n",
+    "import umap\n",
+    "\n",
+    "from TTS.speaker_encoder.model import SpeakerEncoder\n",
+    "from TTS.tts.utils.audio import AudioProcessor\n",
+    "from TTS.tts.utils.generic_utils import load_config\n",
+    "\n",
+    "from bokeh.io import output_notebook, show\n",
+    "from bokeh.plotting import figure\n",
+    "from bokeh.models import HoverTool, ColumnDataSource, BoxZoomTool, ResetTool, OpenURL, TapTool\n",
+    "from bokeh.transform import factor_cmap, factor_mark\n",
+    "from bokeh.palettes import Category10"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "For larger sets of speakers, you can use **Category20**, but you also need to change the **pal** variable accordingly (see the sketch in the next cell).\n",
+    "\n",
+    "List of Bokeh palettes here: http://docs.bokeh.org/en/1.4.0/docs/reference/palettes.html\n",
+    "\n",
+    "**NB:** if you have problems with other palettes, first see https://stackoverflow.com/questions/48333820/why-do-some-bokeh-palettes-raise-a-valueerror-when-used-in-factor-cmap"
+   ]
+  },
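+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "A minimal sketch of that palette switch: the **pick_palette** helper below is hypothetical (it is not used elsewhere in this notebook) and simply chooses **Category10** or **Category20** by speaker count. Both palettes are dicts keyed by size, from 3 up to 10 and 20 respectively; the plotting cell further down still builds **pal** directly."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from bokeh.palettes import Category10, Category20\n",
+    "\n",
+    "def pick_palette(n_speakers):\n",
+    "    # the palette dicts are keyed by size; the smallest valid key is 3\n",
+    "    if n_speakers <= 10:\n",
+    "        return Category10[max(n_speakers, 3)]\n",
+    "    # Category20 tops out at 20 distinct colours\n",
+    "    return Category20[max(min(n_speakers, 20), 3)]"
+   ]
+  },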
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "output_notebook()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "You should also adjust all the path constants to point at the relevant locations on your own machine."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "MODEL_RUN_PATH = \"/media/erogol/data_ssd/Models/libri_tts/speaker_encoder/libritts_360-half-October-31-2019_04+54PM-19d2f5f/\"\n",
+    "MODEL_PATH = MODEL_RUN_PATH + \"best_model.pth.tar\"\n",
+    "CONFIG_PATH = MODEL_RUN_PATH + \"config.json\"\n",
+    "\n",
+    "# My single speaker locations\n",
+    "#EMBED_PATH = \"/home/neil/main/Projects/TTS3/embeddings/neil14/\"\n",
+    "#AUDIO_PATH = \"/home/neil/data/Projects/NeilTTS/neil14/wavs/\"\n",
+    "\n",
+    "# My multi speaker locations\n",
+    "EMBED_PATH = \"/home/erogol/Data/Libri-TTS/train-clean-360-embed_128/\"\n",
+    "AUDIO_PATH = \"/home/erogol/Data/Libri-TTS/train-clean-360/\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!ls -1 $MODEL_RUN_PATH"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "CONFIG = load_config(CONFIG_PATH)\n",
+    "ap = AudioProcessor(**CONFIG['audio'])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Bring in the embeddings created by **compute_embeddings.py**"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "embed_files = glob.glob(EMBED_PATH + \"/**/*.npy\", recursive=True)\n",
+    "print(f'Embeddings found: {len(embed_files)}')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Check that we did indeed find an embedding"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "embed_files[0]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Process the speakers\n",
+    "\n",
+    "Assumes the count of **speaker_paths** corresponds to the number of speakers (so a corpus kept in a single directory is treated as one speaker, while the multiple directories of LibriTTS are treated as distinct speakers)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "speaker_paths = list(set([os.path.dirname(os.path.dirname(embed_file)) for embed_file in embed_files]))\n",
+    "speaker_to_utter = {}\n",
+    "for embed_file in embed_files:\n",
+    "    # group the embedding files under their speaker directory\n",
+    "    speaker_path = os.path.dirname(os.path.dirname(embed_file))\n",
+    "    speaker_to_utter.setdefault(speaker_path, []).append(embed_file)\n",
+    "print(f'Speaker count: {len(speaker_paths)}')"
+   ]
+  },
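+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "An optional sanity check (illustrative, not part of the original flow): peek at the utterance counts for a few speakers so the sampling numbers chosen below are plausible."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# utterance counts for the first few speaker directories\n",
+    "for speaker_path, utters in list(speaker_to_utter.items())[:5]:\n",
+    "    print(os.path.basename(os.path.normpath(speaker_path)), len(utters))"
+   ]
+  },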
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Set up the embeddings\n",
+    "\n",
+    "Adjust the number of speakers to select and the number of utterances drawn from each speaker; both are sampled randomly from the corpus."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "embeds = []\n",
+    "labels = []\n",
+    "locations = []\n",
+    "\n",
+    "# single speaker\n",
+    "#num_speakers = 1\n",
+    "#num_utters = 1000\n",
+    "\n",
+    "# multi speaker\n",
+    "num_speakers = 10\n",
+    "num_utters = 20\n",
+    "\n",
+    "speaker_idxs = np.random.choice(range(len(speaker_paths)), num_speakers, replace=False)\n",
+    "\n",
+    "for speaker_num, speaker_idx in enumerate(speaker_idxs):\n",
+    "    speaker_path = speaker_paths[speaker_idx]\n",
+    "    speakers_utter = speaker_to_utter[speaker_path]\n",
+    "    # note: randint samples with replacement, so an utterance may be picked twice\n",
+    "    utter_idxs = np.random.randint(0, len(speakers_utter), num_utters)\n",
+    "    for utter_idx in utter_idxs:\n",
+    "        embed_path = speaker_to_utter[speaker_path][utter_idx]\n",
+    "        embed = np.load(embed_path)\n",
+    "        embeds.append(embed)\n",
+    "        labels.append(str(speaker_num))\n",
+    "        locations.append(embed_path.replace(EMBED_PATH, '').replace('.npy', '.wav'))\n",
+    "embeds = np.concatenate(embeds)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Project the embeddings down to 2D with UMAP"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "model = umap.UMAP()\n",
+    "projection = model.fit_transform(embeds)"
+   ]
+  },
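+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "UMAP's defaults are usually fine here, but if the clusters look smeared you can experiment with its main knobs. The values below are illustrative assumptions, not tuned settings."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# an alternative projection with explicit hyperparameters (illustrative)\n",
+    "model = umap.UMAP(\n",
+    "    n_neighbors=15,  # smaller values favour local structure\n",
+    "    min_dist=0.1,    # smaller values pack clusters more tightly\n",
+    "    metric=\"cosine\"  # cosine is a common choice for speaker embeddings\n",
+    ")\n",
+    "projection = model.fit_transform(embeds)"
+   ]
+  },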
+ "ref_level_db": 0, // reference level db, theoretically 20db is the sound of air. + + // Griffin-Lim + "power": 1.1, // value to sharpen wav signals after GL algorithm. + "griffin_lim_iters": 60,// #griffin-lim iterations. 30-60 is a good range. Larger the value, slower the generation. + + // Silence trimming + "do_trim_silence": true,// enable trimming of slience of audio as you load it. LJspeech (false), TWEB (false), Nancy (true) + "trim_db": 60, // threshold for timming silence. Set this according to your dataset. + + // MelSpectrogram parameters + "num_mels": 80, // size of the mel spec frame. + "mel_fmin": 50.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!! + "mel_fmax": 7600.0, // maximum freq level for mel-spec. Tune for dataset!! + "spec_gain": 1.0, // scaler value appplied after log transform of spectrogram. + + // Normalization parameters + "signal_norm": true, // normalize spec values. Mean-Var normalization if 'stats_path' is defined otherwise range normalization defined by the other params. + "min_level_db": -100, // lower bound for normalization + "symmetric_norm": true, // move normalization to range [-1, 1] + "max_norm": 1.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm] + "clip_norm": true, // clip normalized values into the range. + "stats_path": null // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based notmalization is used and other normalization params are ignored + }, + + // VOCABULARY PARAMETERS + // if custom character set is not defined, + // default set in symbols.py is used + // "characters":{ + // "pad": "_", + // "eos": "~", + // "bos": "^", + // "characters": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!'(),-.:;? ", + // "punctuations":"!'(),-.:;? ", + // "phonemes":"iyɨʉɯuɪʏʊeøɘəɵɤoɛœɜɞʌɔæɐaɶɑɒᵻʘɓǀɗǃʄǂɠǁʛpbtdʈɖcɟkɡqɢʔɴŋɲɳnɱmʙrʀⱱɾɽɸβfvθðszʃʒʂʐçʝxɣχʁħʕhɦɬɮʋɹɻjɰlɭʎʟˈˌːˑʍwɥʜʢʡɕʑɺɧɚ˞ɫ" + // }, + + "add_blank": false, // if true add a new token after each token of the sentence. This increases the size of the input sequence, but has considerably improved the prosody of the GlowTTS model. + + // DISTRIBUTED TRAINING + "mixed_precision": false, + "distributed":{ + "backend": "nccl", + "url": "tcp:\/\/localhost:54323" + }, + + "reinit_layers": [], // give a list of layer names to restore from the given checkpoint. If not defined, it reloads all heuristically matching layers. + + // MODEL PARAMETERS + "use_mas": false, // use Monotonic Alignment Search if true. Otherwise use pre-computed attention alignments. + + // TRAINING + "batch_size": 2, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'. + "eval_batch_size":1, + "r": 1, // Number of decoder frames to predict per iteration. Set the initial values if gradual training is enabled. + "loss_masking": true, // enable / disable loss masking against the sequence padding. + + // VALIDATION + "run_eval": true, + "test_delay_epochs": 0, //Until attention is aligned, testing only wastes computation time. + "test_sentences_file": null, // set a file to load sentences to be used for testing. If it is null then we use default english sentences. + + // OPTIMIZER + "noam_schedule": true, // use noam warmup and lr schedule. + "grad_clip": 5.0, // upper limit for gradients for clipping. + "epochs": 1, // total number of epochs to train. + "lr": 1e-3, // Initial learning rate. 
+    "wd": 0.000001,        // Weight decay weight.
+    "warmup_steps": 4000,  // Noam decay steps to increase the learning rate from 0 to "lr"
+    "seq_len_norm": false, // Normalize each sample loss with its length to alleviate imbalanced datasets. Use it if your dataset is small or has a skewed distribution of sequence lengths.
+
+    "encoder_type": "gatedconv",
+
+    // TENSORBOARD and LOGGING
+    "print_step": 25,    // Number of steps to log training on console.
+    "tb_plot_step": 100, // Number of steps to plot TB training figures.
+    "print_eval": false, // If True, it prints intermediate loss values in evaluation.
+    "save_step": 5000,   // Number of training steps expected to save training stats and checkpoints.
+    "checkpoint": true,  // If true, it saves checkpoints per "save_step"
+    "tb_model_param_stats": false, // if true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging.
+    "apex_amp_level": null,
+
+    // DATA LOADING
+    "text_cleaner": "phoneme_cleaners",
+    "enable_eos_bos_chars": false, // enable/disable beginning of sentence and end of sentence chars.
+    "num_loader_workers": 4,       // number of training data loader processes. Don't set it too big. 4-8 are good values.
+    "num_val_loader_workers": 4,   // number of evaluation data loader processes.
+    "batch_group_size": 0,         // Number of batches to shuffle after bucketing.
+    "min_seq_len": 3,              // DATASET-RELATED: minimum text length to use in training
+    "max_seq_len": 500,            // DATASET-RELATED: maximum text length
+    "compute_f0": false,           // compute f0 values in data-loader
+
+    // PATHS
+    "output_path": "tests/train_outputs/",
+
+    // PHONEMES
+    "phoneme_cache_path": "tests/outputs/phoneme_cache/", // phoneme computation is slow, therefore, it caches results in the given folder.
+    "use_phonemes": true,        // use phonemes instead of raw characters. It is suggested for better pronunciation.
+    "phoneme_language": "en-us", // depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages
+
+    // MULTI-SPEAKER and GST
+    "use_external_speaker_embedding_file": false,
+    "external_speaker_embedding_file": null,
+    "use_speaker_embedding": false, // use speaker embedding to enable multi-speaker learning.
+
+    // DATASETS
+    "datasets": // List of datasets. They are all merged and they get different speaker_ids.
+    [
+        {
+            "name": "ljspeech",
+            "path": "tests/data/ljspeech/",
+            "meta_file_train": "metadata.csv",
+            "meta_file_val": "metadata.csv"
+        }
+    ]
+}
+
+
diff --git a/tests/inputs/test_vocoder_wavegrad.json b/tests/inputs/test_vocoder_wavegrad.json
new file mode 100644
index 00000000..f7da5980
--- /dev/null
+++ b/tests/inputs/test_vocoder_wavegrad.json
@@ -0,0 +1,113 @@
+{
+    "run_name": "wavegrad-ljspeech",
+    "run_description": "wavegrad ljspeech",
+
+    "audio":{
+        "fft_size": 1024,        // number of stft frequency levels. Size of the linear spectrogram frame.
+        "win_length": 1024,      // stft window length in samples.
+        "hop_length": 256,       // stft window hop-length in samples.
+        "frame_length_ms": null, // stft window length in ms. If null, 'win_length' is used.
+        "frame_shift_ms": null,  // stft window hop-length in ms. If null, 'hop_length' is used.
+
+        // Audio processing parameters
+        "sample_rate": 22050, // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled.
+        "preemphasis": 0.0,   // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no pre-emphasis.
+ "ref_level_db": 0, // reference level db, theoretically 20db is the sound of air. + + // Silence trimming + "do_trim_silence": true,// enable trimming of slience of audio as you load it. LJspeech (false), TWEB (false), Nancy (true) + "trim_db": 60, // threshold for timming silence. Set this according to your dataset. + + // MelSpectrogram parameters + "num_mels": 80, // size of the mel spec frame. + "mel_fmin": 50.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!! + "mel_fmax": 7600.0, // maximum freq level for mel-spec. Tune for dataset!! + "spec_gain": 1.0, // scaler value appplied after log transform of spectrogram. + + // Normalization parameters + "signal_norm": true, // normalize spec values. Mean-Var normalization if 'stats_path' is defined otherwise range normalization defined by the other params. + "min_level_db": -100, // lower bound for normalization + "symmetric_norm": true, // move normalization to range [-1, 1] + "max_norm": 4.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm] + "clip_norm": true, // clip normalized values into the range. + "stats_path": null // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based notmalization is used and other normalization params are ignored + }, + + // DISTRIBUTED TRAINING + "mixed_precision": false, + "distributed":{ + "backend": "nccl", + "url": "tcp:\/\/localhost:54322" + }, + + "target_loss": "avg_wavegrad_loss", // loss value to pick the best model to save after each epoch + + // MODEL PARAMETERS + "generator_model": "wavegrad", + "model_params":{ + "y_conv_channels":32, + "x_conv_channels":768, + "ublock_out_channels": [512, 512, 256, 128, 128], + "dblock_out_channels": [128, 128, 256, 512], + "upsample_factors": [4, 4, 4, 2, 2], + "upsample_dilations": [ + [1, 2, 1, 2], + [1, 2, 1, 2], + [1, 2, 4, 8], + [1, 2, 4, 8], + [1, 2, 4, 8]] + }, + + // DATASET + "data_path": "tests/data/ljspeech/wavs/", // root data path. It finds all wav files recursively from there. + "feature_path": null, // if you use precomputed features + "seq_len": 6144, // 24 * hop_length + "pad_short": 0, // additional padding for short wavs + "conv_pad": 0, // additional padding against convolutions applied to spectrograms + "use_noise_augment": false, // add noise to the audio signal for augmentation + "use_cache": true, // use in memory cache to keep the computed features. This might cause OOM. + + "reinit_layers": [], // give a list of layer names to restore from the given checkpoint. If not defined, it reloads all heuristically matching layers. + + // TRAINING + "batch_size": 1, // Batch size for training. + "train_noise_schedule":{ + "min_val": 1e-6, + "max_val": 1e-2, + "num_steps": 1000 + }, + "test_noise_schedule":{ + "min_val": 1e-6, + "max_val": 1e-2, + "num_steps": 2 + }, + + // VALIDATION + "run_eval": true, // enable/disable evaluation run + + // OPTIMIZER + "epochs": 1, // total number of epochs to train. + "clip_grad": 1.0, // Generator gradient clipping threshold. Apply gradient clipping if > 0 + "lr_scheduler": "MultiStepLR", // one of the schedulers from https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate + "lr_scheduler_params": { + "gamma": 0.5, + "milestones": [100000, 200000, 300000, 400000, 500000, 600000] + }, + "lr": 1e-4, // Initial learning rate. If Noam decay is active, maximum learning rate. 
+
+    // TENSORBOARD and LOGGING
+    "print_step": 250,   // Number of steps to log training on console.
+    "print_eval": false, // If True, it prints loss values for each step in eval run.
+    "save_step": 10000,  // Number of training steps expected to plot training stats on TB and save model checkpoints.
+    "checkpoint": true,  // If true, it saves checkpoints per "save_step"
+    "tb_model_param_stats": true, // if true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging.
+
+    // DATA LOADING
+    "num_loader_workers": 4,     // number of training data loader processes. Don't set it too big. 4-8 are good values.
+    "num_val_loader_workers": 4, // number of evaluation data loader processes.
+    "eval_split_size": 4,
+
+    // PATHS
+    "output_path": "tests/train_outputs/"
+}
+
diff --git a/tests/test_glow-tts_train.sh b/tests/test_glow-tts_train.sh
new file mode 100755
index 00000000..c8dd3e22
--- /dev/null
+++ b/tests/test_glow-tts_train.sh
@@ -0,0 +1,13 @@
+#!/usr/bin/env bash
+
+BASEDIR=$(dirname "$0")
+echo "$BASEDIR"
+# run training
+CUDA_VISIBLE_DEVICES="" python TTS/bin/train_glow_tts.py --config_path $BASEDIR/inputs/test_glow_tts.json
+# find the training folder
+LATEST_FOLDER=$(ls $BASEDIR/train_outputs/ | sort | tail -1)
+echo $LATEST_FOLDER
+# continue the previous training
+CUDA_VISIBLE_DEVICES="" python TTS/bin/train_glow_tts.py --continue_path $BASEDIR/train_outputs/$LATEST_FOLDER
+# remove all the outputs
+rm -rf $BASEDIR/train_outputs/
diff --git a/tests/test_tts_train.sh b/tests/test_tts_train.sh
new file mode 100755
index 00000000..ed0871eb
--- /dev/null
+++ b/tests/test_tts_train.sh
@@ -0,0 +1,13 @@
+#!/usr/bin/env bash
+
+BASEDIR=$(dirname "$0")
+echo "$BASEDIR"
+# run training
+CUDA_VISIBLE_DEVICES="" python TTS/bin/train_tacotron.py --config_path $BASEDIR/inputs/test_train_config.json
+# find the training folder
+LATEST_FOLDER=$(ls $BASEDIR/train_outputs/ | sort | tail -1)
+echo $LATEST_FOLDER
+# continue the previous training
+CUDA_VISIBLE_DEVICES="" python TTS/bin/train_tacotron.py --continue_path $BASEDIR/train_outputs/$LATEST_FOLDER
+# remove all the outputs
+rm -rf $BASEDIR/train_outputs/
diff --git a/tests/test_vocoder_wavegrad_train.sh b/tests/test_vocoder_wavegrad_train.sh
new file mode 100755
index 00000000..b5e6e451
--- /dev/null
+++ b/tests/test_vocoder_wavegrad_train.sh
@@ -0,0 +1,15 @@
+#!/usr/bin/env bash
+
+BASEDIR=$(dirname "$0")
+echo "$BASEDIR"
+# create run dir
+mkdir $BASEDIR/train_outputs
+# run training
+CUDA_VISIBLE_DEVICES="" python TTS/bin/train_vocoder_wavegrad.py --config_path $BASEDIR/inputs/test_vocoder_wavegrad.json
+# find the training folder
+LATEST_FOLDER=$(ls $BASEDIR/train_outputs/ | sort | tail -1)
+echo $LATEST_FOLDER
+# continue the previous training
+CUDA_VISIBLE_DEVICES="" python TTS/bin/train_vocoder_wavegrad.py --continue_path $BASEDIR/train_outputs/$LATEST_FOLDER
+# remove all the outputs
+rm -rf $BASEDIR/train_outputs/$LATEST_FOLDER
\ No newline at end of file