mirror of https://github.com/coqui-ai/TTS.git
adding more tests and refactoring
This commit is contained in:
parent
c76a617072
commit
116e2299b0
@ -0,0 +1,325 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Overview\n",
"\n",
"This notebook can be used with both a single- or multi-speaker corpus and allows interactive plotting of speaker embeddings linked to the underlying audio (see the instructions in the repo's speaker_embedding directory).\n",
"\n",
"Depending on the directory structure used for your corpus, you may need to adjust the handling of **speaker_to_utter** and **locations**."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import glob\n",
"import random\n",
"import numpy as np\n",
"import torch\n",
"import umap\n",
"\n",
"from TTS.speaker_encoder.model import SpeakerEncoder\n",
"from TTS.tts.utils.audio import AudioProcessor\n",
"from TTS.tts.utils.generic_utils import load_config\n",
"\n",
"from bokeh.io import output_notebook, show\n",
"from bokeh.plotting import figure\n",
"from bokeh.models import HoverTool, ColumnDataSource, BoxZoomTool, ResetTool, OpenURL, TapTool\n",
"from bokeh.transform import factor_cmap, factor_mark\n",
"from bokeh.palettes import Category10"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"For larger sets of speakers, you can use **Category20**, but you need to change it in the **pal** variable too.\n",
"\n",
"A list of Bokeh palettes is here: http://docs.bokeh.org/en/1.4.0/docs/reference/palettes.html\n",
"\n",
"**NB:** if you have problems with other palettes, first see https://stackoverflow.com/questions/48333820/why-do-some-bokeh-palettes-raise-a-valueerror-when-used-in-factor-cmap"
]
},
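{
"cell_type": "markdown",
"metadata": {},
"source": [
"For instance, a minimal sketch of switching to **Category20** (kept commented out; **factors** is only defined in the plotting cell further below, and **Category20** supports 3 to 20 classes):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# a sketch, assuming at most 20 speakers; uncomment to use\n",
"# from bokeh.palettes import Category20\n",
"# pal_size = min(max(len(factors), 3), 20)\n",
"# pal = Category20[pal_size]"
]
},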
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"output_notebook()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"You should also adjust all the path constants to point at the relevant locations on your local machine."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"MODEL_RUN_PATH = \"/media/erogol/data_ssd/Models/libri_tts/speaker_encoder/libritts_360-half-October-31-2019_04+54PM-19d2f5f/\"\n",
"MODEL_PATH = MODEL_RUN_PATH + \"best_model.pth.tar\"\n",
"CONFIG_PATH = MODEL_RUN_PATH + \"config.json\"\n",
"\n",
"# My single speaker locations\n",
"#EMBED_PATH = \"/home/neil/main/Projects/TTS3/embeddings/neil14/\"\n",
"#AUDIO_PATH = \"/home/neil/data/Projects/NeilTTS/neil14/wavs/\"\n",
"\n",
"# My multi speaker locations\n",
"EMBED_PATH = \"/home/erogol/Data/Libri-TTS/train-clean-360-embed_128/\"\n",
"AUDIO_PATH = \"/home/erogol/Data/Libri-TTS/train-clean-360/\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!ls -1 $MODEL_RUN_PATH"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"CONFIG = load_config(CONFIG_PATH)\n",
"ap = AudioProcessor(**CONFIG['audio'])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Bring in the embeddings created by **compute_embeddings.py**"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"embed_files = glob.glob(EMBED_PATH+\"/**/*.npy\", recursive=True)\n",
"print(f'Embeddings found: {len(embed_files)}')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Check that we did indeed find an embedding"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"embed_files[0]"
]
},
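{
"cell_type": "markdown",
"metadata": {},
"source": [
"Optionally, inspect one embedding directly; a quick sanity check (the exact shape depends on how **compute_embeddings.py** saved the files):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# shape of a single saved embedding\n",
"np.load(embed_files[0]).shape"
]
},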
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Process the speakers\n",
"\n",
"Assumes the count of **speaker_paths** corresponds to the number of speakers (so a corpus kept in a single directory is treated as one speaker, while the multiple directories of LibriTTS are treated as distinct speakers)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"speaker_paths = list(set([os.path.dirname(os.path.dirname(embed_file)) for embed_file in embed_files]))\n",
"speaker_to_utter = {}\n",
"for embed_file in embed_files:\n",
"    speaker_path = os.path.dirname(os.path.dirname(embed_file))\n",
"    try:\n",
"        speaker_to_utter[speaker_path].append(embed_file)\n",
"    except KeyError:\n",
"        speaker_to_utter[speaker_path] = [embed_file]\n",
"print(f'Speaker count: {len(speaker_paths)}')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Set up the embeddings\n",
"\n",
"Adjust the number of speakers to select and the number of utterances per speaker; both are randomly sampled from the corpus."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"embeds = []\n",
"labels = []\n",
"locations = []\n",
"\n",
"# single speaker\n",
"#num_speakers = 1\n",
"#num_utters = 1000\n",
"\n",
"# multi speaker\n",
"num_speakers = 10\n",
"num_utters = 20\n",
"\n",
"speaker_idxs = np.random.choice(range(len(speaker_paths)), num_speakers, replace=False)\n",
"\n",
"for speaker_num, speaker_idx in enumerate(speaker_idxs):\n",
"    speaker_path = speaker_paths[speaker_idx]\n",
"    speakers_utter = speaker_to_utter[speaker_path]\n",
"    utter_idxs = np.random.randint(0, len(speakers_utter), num_utters)\n",
"    for utter_idx in utter_idxs:\n",
"        embed_path = speaker_to_utter[speaker_path][utter_idx]\n",
"        embed = np.load(embed_path)\n",
"        embeds.append(embed)\n",
"        labels.append(str(speaker_num))\n",
"        locations.append(embed_path.replace(EMBED_PATH, '').replace('.npy', '.wav'))\n",
"embeds = np.concatenate(embeds)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Project the embeddings with UMAP"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"model = umap.UMAP()\n",
"projection = model.fit_transform(embeds)"
]
},
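{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick check of the result: UMAP defaults to two output dimensions, so **projection** should have shape (num_speakers * num_utters, 2):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"projection.shape"
]
},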
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Interactively charting the data in Bokeh\n",
"\n",
"Set up the various details for Bokeh to plot the data.\n",
"\n",
"You can use the regular Bokeh [tools](http://docs.bokeh.org/en/1.4.0/docs/user_guide/tools.html?highlight=tools) to explore the data, with **reset** returning the plot to its initial state.\n",
"\n",
"Once you have started the local server (see the cell below), clicking on a plotted point opens a tab that plays the audio for that point, enabling easy exploration of your corpus.\n",
"\n",
"The file location in the tooltip is given relative to **AUDIO_PATH**."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"source_wav_stems = ColumnDataSource(\n",
"    data=dict(\n",
"        x=projection.T[0].tolist(),\n",
"        y=projection.T[1].tolist(),\n",
"        desc=locations,\n",
"        label=labels,\n",
"    )\n",
")\n",
"\n",
"hover = HoverTool(\n",
"    tooltips=[\n",
"        (\"file\", \"@desc\"),\n",
"        (\"speaker\", \"@label\"),\n",
"    ]\n",
")\n",
"\n",
"# optionally add these to the tooltips for additional detail:\n",
"# for the coordinates: (\"(x,y)\", \"($x, $y)\"),\n",
"# for the index of the embedding / wav file: (\"index\", \"$index\"),\n",
"\n",
"factors = list(set(labels))\n",
"pal_size = max(len(factors), 3)\n",
"pal = Category10[pal_size]\n",
"\n",
"p = figure(plot_width=600, plot_height=400, tools=[hover, BoxZoomTool(), ResetTool(), TapTool()])\n",
"\n",
"p.circle('x', 'y', source=source_wav_stems, color=factor_cmap('label', palette=pal, factors=factors))\n",
"\n",
"url = \"http://localhost:8000/@desc\"\n",
"taptool = p.select(type=TapTool)\n",
"taptool.callback = OpenURL(url=url)\n",
"\n",
"show(p)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Local server to serve wav files from the corpus\n",
"\n",
"This is required so that when you click on a data point, the file behind its hyperlink can be served locally.\n",
"\n",
"There are other ways to serve the files if you prefer, and you can also run the commands manually on the command line.\n",
"\n",
"The server will continue to run until stopped. To stop it, simply interrupt the kernel (i.e. the square button, or via the Kernel menu)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%cd $AUDIO_PATH\n",
"%pwd\n",
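"# http.server listens on port 8000 by default, matching the OpenURL base URL above\n",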
"!python -m http.server"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.4"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
@ -1,7 +1,7 @@
TF_CPP_MIN_LOG_LEVEL=3

# tests
# nosetests tests -x &&\
nosetests tests -x &&\

# runtime tests
./tests/test_server_package.sh && \
@ -0,0 +1,134 @@
{
    "model": "glow_tts",
    "run_name": "glow-tts-gatedconv",
    "run_description": "glow-tts model training with gated conv.",

    // AUDIO PARAMETERS
    "audio": {
        "fft_size": 1024,         // number of stft frequency levels. Size of the linear spectrogram frame.
        "win_length": 1024,       // stft window length in samples.
        "hop_length": 256,        // stft window hop length in samples.
        "frame_length_ms": null,  // stft window length in ms. If null, 'win_length' is used.
        "frame_shift_ms": null,   // stft window hop length in ms. If null, 'hop_length' is used.

        // Audio processing parameters
        "sample_rate": 22050,   // DATASET-RELATED: wav sample rate. If different from the original data, it is resampled.
        "preemphasis": 0.0,     // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no pre-emphasis is applied.
        "ref_level_db": 0,      // reference level dB; theoretically 20 dB is the sound of air.

        // Griffin-Lim
        "power": 1.1,               // value to sharpen wav signals after the GL algorithm.
        "griffin_lim_iters": 60,    // number of Griffin-Lim iterations. 30-60 is a good range. The larger the value, the slower the generation.

        // Silence trimming
        "do_trim_silence": true,    // enable trimming of silence in audio as you load it. LJSpeech (false), TWEB (false), Nancy (true)
        "trim_db": 60,              // threshold for trimming silence. Set this according to your dataset.

        // MelSpectrogram parameters
        "num_mels": 80,         // size of the mel spec frame.
        "mel_fmin": 50.0,       // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for your dataset!
        "mel_fmax": 7600.0,     // maximum freq level for mel-spec. Tune for your dataset!
        "spec_gain": 1.0,       // scaler value applied after log transform of spectrogram.

        // Normalization parameters
        "signal_norm": true,    // normalize spec values. Mean-Var normalization if 'stats_path' is defined, otherwise range normalization defined by the other params.
        "min_level_db": -100,   // lower bound for normalization
        "symmetric_norm": true, // move normalization to the range [-1, 1]
        "max_norm": 1.0,        // scale normalization to the range [-max_norm, max_norm] or [0, max_norm]
        "clip_norm": true,      // clip normalized values into the range.
        "stats_path": null      // DO NOT USE WITH MULTI-SPEAKER MODEL. Scaler stats file computed by 'compute_statistics.py'. If defined, mean-std based normalization is used and the other normalization params are ignored.
    },

    // VOCABULARY PARAMETERS
    // if a custom character set is not defined,
    // the default set in symbols.py is used
    // "characters":{
    //     "pad": "_",
    //     "eos": "~",
    //     "bos": "^",
    //     "characters": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!'(),-.:;? ",
    //     "punctuations":"!'(),-.:;? ",
    //     "phonemes":"iyɨʉɯuɪʏʊeøɘəɵɤoɛœɜɞʌɔæɐaɶɑɒᵻʘɓǀɗǃʄǂɠǁʛpbtdʈɖcɟkɡqɢʔɴŋɲɳnɱmʙrʀⱱɾɽɸβfvθðszʃʒʂʐçʝxɣχʁħʕhɦɬɮʋɹɻjɰlɭʎʟˈˌːˑʍwɥʜʢʡɕʑɺɧɚ˞ɫ"
    // },

    "add_blank": false,     // if true, add a new token after each token of the sentence. This increases the size of the input sequence, but has considerably improved the prosody of the GlowTTS model.

    // DISTRIBUTED TRAINING
    "mixed_precision": false,
    "distributed": {
        "backend": "nccl",
        "url": "tcp:\/\/localhost:54323"
    },

    "reinit_layers": [],    // give a list of layer names to restore from the given checkpoint. If not defined, it reloads all heuristically matching layers.

    // MODEL PARAMETERS
    "use_mas": false,       // use Monotonic Alignment Search if true. Otherwise use pre-computed attention alignments.

    // TRAINING
    "batch_size": 2,        // Batch size for training. Values lower than 32 might make attention hard to learn. It is overwritten by 'gradual_training'.
    "eval_batch_size": 1,
    "r": 1,                 // Number of decoder frames to predict per iteration. Set the initial value if gradual training is enabled.
    "loss_masking": true,   // enable / disable loss masking against the sequence padding.

    // VALIDATION
    "run_eval": true,
    "test_delay_epochs": 0,         // Until attention is aligned, testing only wastes computation time.
    "test_sentences_file": null,    // set a file to load sentences to be used for testing. If null, the default English sentences are used.

    // OPTIMIZER
    "noam_schedule": true,  // use Noam warmup and lr schedule.
    "grad_clip": 5.0,       // upper limit for gradients for clipping.
    "epochs": 1,            // total number of epochs to train.
    "lr": 1e-3,             // Initial learning rate. If Noam decay is active, maximum learning rate.
    "wd": 0.000001,         // Weight decay weight.
    "warmup_steps": 4000,   // Noam decay steps to increase the learning rate from 0 to "lr"
    "seq_len_norm": false,  // Normalize each sample loss by its length to alleviate imbalanced datasets. Use it if your dataset is small or has a skewed distribution of sequence lengths.

    "encoder_type": "gatedconv",

    // TENSORBOARD and LOGGING
    "print_step": 25,       // Number of steps between console logs during training.
    "tb_plot_step": 100,    // Number of steps between plotting TB training figures.
    "print_eval": false,    // If true, it prints intermediate loss values during evaluation.
    "save_step": 5000,      // Number of training steps between saving training stats and checkpoints.
    "checkpoint": true,     // If true, it saves checkpoints per "save_step"
    "tb_model_param_stats": false,  // if true, plots param stats per layer on TensorBoard. Might be memory consuming, but good for debugging.
    "apex_amp_level": null,

    // DATA LOADING
    "text_cleaner": "phoneme_cleaners",
    "enable_eos_bos_chars": false,  // enable/disable beginning-of-sentence and end-of-sentence chars.
    "num_loader_workers": 4,        // number of training data loader processes. Don't set it too big. 4-8 are good values.
    "num_val_loader_workers": 4,    // number of evaluation data loader processes.
    "batch_group_size": 0,          // Number of batches to shuffle after bucketing.
    "min_seq_len": 3,       // DATASET-RELATED: minimum text length to use in training
    "max_seq_len": 500,     // DATASET-RELATED: maximum text length
    "compute_f0": false,    // compute f0 values in the data loader

    // PATHS
    "output_path": "tests/train_outputs/",

    // PHONEMES
    "phoneme_cache_path": "tests/outputs/phoneme_cache/",   // phoneme computation is slow; therefore, results are cached in the given folder.
    "use_phonemes": true,           // use phonemes instead of raw characters. Suggested for better pronunciation.
    "phoneme_language": "en-us",    // depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages

    // MULTI-SPEAKER and GST
    "use_external_speaker_embedding_file": false,
    "external_speaker_embedding_file": null,
    "use_speaker_embedding": false,     // use speaker embedding to enable multi-speaker learning.

    // DATASETS
    "datasets":     // List of datasets. They are all merged, and each gets different speaker_ids.
        [
            {
                "name": "ljspeech",
                "path": "tests/data/ljspeech/",
                "meta_file_train": "metadata.csv",
                "meta_file_val": "metadata.csv"
            }
        ]
}
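
A minimal sketch of reading this commented-JSON config from Python, assuming the load_config helper used by the notebook above (TTS.tts.utils.generic_utils) strips the // comments; the path is a guess based on the test scripts below:

from TTS.tts.utils.generic_utils import load_config

# hypothetical path, following the test scripts' $BASEDIR/inputs layout
CONFIG = load_config("tests/inputs/test_glow_tts.json")
print(CONFIG["model"], CONFIG["audio"]["sample_rate"])  # expect: glow_tts 22050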
@ -0,0 +1,113 @@
{
    "run_name": "wavegrad-ljspeech",
    "run_description": "wavegrad ljspeech",

    "audio": {
        "fft_size": 1024,         // number of stft frequency levels. Size of the linear spectrogram frame.
        "win_length": 1024,       // stft window length in samples.
        "hop_length": 256,        // stft window hop length in samples.
        "frame_length_ms": null,  // stft window length in ms. If null, 'win_length' is used.
        "frame_shift_ms": null,   // stft window hop length in ms. If null, 'hop_length' is used.

        // Audio processing parameters
        "sample_rate": 22050,   // DATASET-RELATED: wav sample rate. If different from the original data, it is resampled.
        "preemphasis": 0.0,     // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no pre-emphasis is applied.
        "ref_level_db": 0,      // reference level dB; theoretically 20 dB is the sound of air.

        // Silence trimming
        "do_trim_silence": true,    // enable trimming of silence in audio as you load it. LJSpeech (false), TWEB (false), Nancy (true)
        "trim_db": 60,              // threshold for trimming silence. Set this according to your dataset.

        // MelSpectrogram parameters
        "num_mels": 80,         // size of the mel spec frame.
        "mel_fmin": 50.0,       // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for your dataset!
        "mel_fmax": 7600.0,     // maximum freq level for mel-spec. Tune for your dataset!
        "spec_gain": 1.0,       // scaler value applied after log transform of spectrogram.

        // Normalization parameters
        "signal_norm": true,    // normalize spec values. Mean-Var normalization if 'stats_path' is defined, otherwise range normalization defined by the other params.
        "min_level_db": -100,   // lower bound for normalization
        "symmetric_norm": true, // move normalization to the range [-1, 1]
        "max_norm": 4.0,        // scale normalization to the range [-max_norm, max_norm] or [0, max_norm]
        "clip_norm": true,      // clip normalized values into the range.
        "stats_path": null      // DO NOT USE WITH MULTI-SPEAKER MODEL. Scaler stats file computed by 'compute_statistics.py'. If defined, mean-std based normalization is used and the other normalization params are ignored.
    },

    // DISTRIBUTED TRAINING
    "mixed_precision": false,
    "distributed": {
        "backend": "nccl",
        "url": "tcp:\/\/localhost:54322"
    },

    "target_loss": "avg_wavegrad_loss",     // loss value used to pick the best model to save after each epoch

    // MODEL PARAMETERS
    "generator_model": "wavegrad",
    "model_params": {
        "y_conv_channels": 32,
        "x_conv_channels": 768,
        "ublock_out_channels": [512, 512, 256, 128, 128],
        "dblock_out_channels": [128, 128, 256, 512],
        "upsample_factors": [4, 4, 4, 2, 2],
        "upsample_dilations": [
            [1, 2, 1, 2],
            [1, 2, 1, 2],
            [1, 2, 4, 8],
            [1, 2, 4, 8],
            [1, 2, 4, 8]]
    },

    // DATASET
    "data_path": "tests/data/ljspeech/wavs/",   // root data path. It finds all wav files recursively under it.
    "feature_path": null,       // if you use precomputed features
    "seq_len": 6144,            // 24 * hop_length
    "pad_short": 0,             // additional padding for short wavs
    "conv_pad": 0,              // additional padding applied to spectrograms before convolutions
    "use_noise_augment": false, // add noise to the audio signal for augmentation
    "use_cache": true,          // use an in-memory cache to keep the computed features. This might cause OOM.

    "reinit_layers": [],    // give a list of layer names to restore from the given checkpoint. If not defined, it reloads all heuristically matching layers.

    // TRAINING
    "batch_size": 1,    // Batch size for training.
    "train_noise_schedule": {
        "min_val": 1e-6,
        "max_val": 1e-2,
        "num_steps": 1000
    },
    "test_noise_schedule": {
        "min_val": 1e-6,
        "max_val": 1e-2,
        "num_steps": 2
    },

    // VALIDATION
    "run_eval": true,   // enable/disable the evaluation run

    // OPTIMIZER
    "epochs": 1,        // total number of epochs to train.
    "clip_grad": 1.0,   // Generator gradient clipping threshold. Gradient clipping is applied if > 0.
    "lr_scheduler": "MultiStepLR",  // one of the schedulers from https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate
    "lr_scheduler_params": {
        "gamma": 0.5,
        "milestones": [100000, 200000, 300000, 400000, 500000, 600000]
    },
    "lr": 1e-4,         // Initial learning rate. If Noam decay is active, maximum learning rate.

    // TENSORBOARD and LOGGING
    "print_step": 250,      // Number of steps between console logs during training.
    "print_eval": false,    // If true, it prints loss values for each step in the eval run.
    "save_step": 10000,     // Number of training steps between plotting training stats on TB and saving model checkpoints.
    "checkpoint": true,     // If true, it saves checkpoints per "save_step"
    "tb_model_param_stats": true,   // if true, plots param stats per layer on TensorBoard. Might be memory consuming, but good for debugging.

    // DATA LOADING
    "num_loader_workers": 4,        // number of training data loader processes. Don't set it too big. 4-8 are good values.
    "num_val_loader_workers": 4,    // number of evaluation data loader processes.
    "eval_split_size": 4,

    // PATHS
    "output_path": "tests/train_outputs/"
}
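
The noise-schedule blocks above give only endpoints and a step count; a minimal numpy sketch of expanding them, assuming linear spacing (the repo's actual schedule helper may differ, e.g. it may space exponents instead):

import numpy as np

# assumed linear beta schedule between min_val and max_val
train_betas = np.linspace(1e-6, 1e-2, 1000)  # train_noise_schedule
test_betas = np.linspace(1e-6, 1e-2, 2)      # test_noise_schedule: 2 steps for fast eval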
@ -0,0 +1,13 @@
#!/usr/bin/env bash

BASEDIR=$(dirname "$0")
echo "$BASEDIR"
# run training
CUDA_VISIBLE_DEVICES="" python TTS/bin/train_glow_tts.py --config_path "$BASEDIR"/inputs/test_glow_tts.json
# find the training folder
LATEST_FOLDER=$(ls "$BASEDIR"/train_outputs/ | sort | tail -1)
echo "$LATEST_FOLDER"
# continue the previous training
CUDA_VISIBLE_DEVICES="" python TTS/bin/train_glow_tts.py --continue_path "$BASEDIR"/train_outputs/"$LATEST_FOLDER"
# remove all the outputs
rm -rf "$BASEDIR"/train_outputs/
@ -0,0 +1,13 @@
#!/usr/bin/env bash

BASEDIR=$(dirname "$0")
echo "$BASEDIR"
# run training
CUDA_VISIBLE_DEVICES="" python TTS/bin/train_tacotron.py --config_path "$BASEDIR"/inputs/test_train_config.json
# find the training folder
LATEST_FOLDER=$(ls "$BASEDIR"/train_outputs/ | sort | tail -1)
echo "$LATEST_FOLDER"
# continue the previous training
CUDA_VISIBLE_DEVICES="" python TTS/bin/train_tacotron.py --continue_path "$BASEDIR"/train_outputs/"$LATEST_FOLDER"
# remove all the outputs
rm -rf "$BASEDIR"/train_outputs/
@ -0,0 +1,15 @@
#!/usr/bin/env bash

BASEDIR=$(dirname "$0")
echo "$BASEDIR"
# create the run dir (-p so an existing dir does not abort the script)
mkdir -p "$BASEDIR"/train_outputs
# run training
CUDA_VISIBLE_DEVICES="" python TTS/bin/train_vocoder_wavegrad.py --config_path "$BASEDIR"/inputs/test_vocoder_wavegrad.json
# find the training folder
LATEST_FOLDER=$(ls "$BASEDIR"/train_outputs/ | sort | tail -1)
echo "$LATEST_FOLDER"
# continue the previous training
CUDA_VISIBLE_DEVICES="" python TTS/bin/train_vocoder_wavegrad.py --continue_path "$BASEDIR"/train_outputs/"$LATEST_FOLDER"
# remove all the outputs
rm -rf "$BASEDIR"/train_outputs/"$LATEST_FOLDER"