Create LJSpeech recipes for all the models

This commit is contained in:
Eren Gölge 2021-06-18 15:09:40 +02:00
parent 0636c91919
commit 418c7d98d5
11 changed files with 289 additions and 0 deletions

View File

@ -0,0 +1,19 @@
# 🐸💬 TTS LJSpeech Recipes
For running the recipes
1. Download the LJSpeech dataset, either manually from [its official website](https://keithito.com/LJ-Speech-Dataset/) or by running ```download_ljspeech.sh```.
2. Go to your desired model folder and run the training.
Running Python files. (Choose the desired GPU ID for your run and set ```CUDA_VISIBLE_DEVICES```)
```terminal
CUDA_VISIBLE_DEVICES="0" python train_modelX.py
```
Running bash scripts.
```terminal
bash run.sh
```
💡 Note that these runs are just templates to help you start training your first model. They are not optimized for the best
results. Double-check the configurations and feel free to share your experiments to find better parameters together 💪.

View File

@ -0,0 +1,14 @@
#!/bin/bash
# Download the LJSpeech-1.1 archive into the current working directory,
# extract it, create shuffled train/val metadata files and finally place the
# dataset folder next to this script.

# Stop at the first failing command (and on unset variables / broken pipes) so
# that a failed download or extraction never reaches the `mv` / `rm` steps.
set -euo pipefail

# take the script's own directory to prefix all the output paths.
RUN_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
echo "$RUN_DIR"
# download LJSpeech dataset
wget http://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2
# extract
tar -xjf LJSpeech-1.1.tar.bz2
# create train-val splits: the first 12000 shuffled lines go to the train
# file, the last 1100 shuffled lines go to the validation file.
shuf LJSpeech-1.1/metadata.csv > LJSpeech-1.1/metadata_shuf.csv
head -n 12000 LJSpeech-1.1/metadata_shuf.csv > LJSpeech-1.1/metadata_train.csv
tail -n 1100 LJSpeech-1.1/metadata_shuf.csv > LJSpeech-1.1/metadata_val.csv
# move the dataset next to this script; skip the move when the script is run
# from its own directory, where source and destination are the same folder
# and `mv` would fail.
if ! [ LJSpeech-1.1 -ef "$RUN_DIR/LJSpeech-1.1" ]; then
    mv LJSpeech-1.1 "$RUN_DIR/"
fi
rm LJSpeech-1.1.tar.bz2

View File

@ -0,0 +1,30 @@
import os
from TTS.tts.configs import GlowTTSConfig
from TTS.tts.configs import BaseDatasetConfig
from TTS.trainer import init_training, Trainer, TrainingArgs
def main():
    """Train a GlowTTS model on LJSpeech.

    The dataset is expected in ``../LJSpeech-1.1/`` relative to this file and
    all training outputs (including the phoneme cache) are written next to
    this file.
    """
    output_path = os.path.dirname(os.path.abspath(__file__))
    dataset_config = BaseDatasetConfig(
        name="ljspeech",
        meta_file_train="metadata.csv",
        path=os.path.join(output_path, "../LJSpeech-1.1/"),
    )
    config = GlowTTSConfig(
        batch_size=32,
        eval_batch_size=16,
        num_loader_workers=4,
        num_eval_loader_workers=4,
        run_eval=True,
        test_delay_epochs=-1,
        epochs=1000,
        text_cleaner="english_cleaners",
        use_phonemes=False,
        phoneme_language="en-us",
        phoneme_cache_path=os.path.join(output_path, "phoneme_cache"),
        print_step=25,
        print_eval=True,
        mixed_precision=False,
        output_path=output_path,
        datasets=[dataset_config],
    )
    # init_training hands back its own config and output path; those returned
    # values (not the ones built above) are what the trainer receives.
    args, config, output_path, _, c_logger, tb_logger = init_training(TrainingArgs(), config)
    trainer = Trainer(args, config, output_path, c_logger, tb_logger)
    trainer.fit()


# Guard the entry point so that importing this file does not start a training
# run. NOTE(review): num_loader_workers > 0 presumably maps to data-loader
# worker processes; with the "spawn" start method those re-import the main
# script, which would otherwise re-run everything above.
if __name__ == "__main__":
    main()

View File

@ -0,0 +1,30 @@
import os
from TTS.vocoder.configs import HifiganConfig
from TTS.trainer import init_training, Trainer, TrainingArgs
def main():
    """Train a HiFi-GAN vocoder on the LJSpeech wav files.

    The wav files are expected in ``../LJSpeech-1.1/wavs/`` relative to this
    file and all training outputs are written next to this file.
    """
    output_path = os.path.dirname(os.path.abspath(__file__))
    config = HifiganConfig(
        batch_size=32,
        eval_batch_size=16,
        num_loader_workers=4,
        num_eval_loader_workers=4,
        run_eval=True,
        test_delay_epochs=-1,
        epochs=1000,
        seq_len=8192,
        pad_short=2000,
        use_noise_augment=True,
        eval_split_size=10,
        print_step=25,
        print_eval=True,
        mixed_precision=False,
        lr_gen=1e-4,
        lr_disc=1e-4,
        data_path=os.path.join(output_path, "../LJSpeech-1.1/wavs/"),
        output_path=output_path,
    )
    # init_training hands back its own config and output path; those returned
    # values (not the ones built above) are what the trainer receives.
    args, config, output_path, _, c_logger, tb_logger = init_training(TrainingArgs(), config)
    trainer = Trainer(args, config, output_path, c_logger, tb_logger)
    trainer.fit()


# Guard the entry point so that importing this file does not start a training
# run. NOTE(review): num_loader_workers > 0 presumably maps to data-loader
# worker processes; with the "spawn" start method those re-import the main
# script, which would otherwise re-run everything above.
if __name__ == "__main__":
    main()

View File

@ -0,0 +1,30 @@
import os
from TTS.vocoder.configs import MultibandMelganConfig
from TTS.trainer import init_training, Trainer, TrainingArgs
def main():
    """Train a Multiband-MelGAN vocoder on the LJSpeech wav files.

    The wav files are expected in ``../LJSpeech-1.1/wavs/`` relative to this
    file and all training outputs are written next to this file.
    """
    output_path = os.path.dirname(os.path.abspath(__file__))
    config = MultibandMelganConfig(
        batch_size=32,
        eval_batch_size=16,
        num_loader_workers=4,
        num_eval_loader_workers=4,
        run_eval=True,
        test_delay_epochs=-1,
        epochs=1000,
        seq_len=8192,
        pad_short=2000,
        use_noise_augment=True,
        eval_split_size=10,
        print_step=25,
        print_eval=True,
        mixed_precision=False,
        lr_gen=1e-4,
        lr_disc=1e-4,
        data_path=os.path.join(output_path, "../LJSpeech-1.1/wavs/"),
        output_path=output_path,
    )
    # init_training hands back its own config and output path; those returned
    # values (not the ones built above) are what the trainer receives.
    args, config, output_path, _, c_logger, tb_logger = init_training(TrainingArgs(), config)
    trainer = Trainer(args, config, output_path, c_logger, tb_logger)
    trainer.fit()


# Guard the entry point so that importing this file does not start a training
# run. NOTE(review): num_loader_workers > 0 presumably maps to data-loader
# worker processes; with the "spawn" start method those re-import the main
# script, which would otherwise re-run everything above.
if __name__ == "__main__":
    main()

View File

@ -0,0 +1,22 @@
#!/bin/bash
# Train Tacotron2 with dynamic convolution attention on LJSpeech.
# The `TTS/bin/...` paths below are relative, so launch this script from the
# root of the TTS repository.

# take the script's own directory to prefix all the config, data and output paths.
RUN_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
echo "$RUN_DIR"
# Optional one-time preparation steps; uncomment them for the first run.
# # download LJSpeech dataset
# wget http://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2
# # extract
# tar -xjf LJSpeech-1.1.tar.bz2
# # create train-val splits
# shuf LJSpeech-1.1/metadata.csv > LJSpeech-1.1/metadata_shuf.csv
# head -n 12000 LJSpeech-1.1/metadata_shuf.csv > LJSpeech-1.1/metadata_train.csv
# tail -n 1100 LJSpeech-1.1/metadata_shuf.csv > LJSpeech-1.1/metadata_val.csv
# mv LJSpeech-1.1 "$RUN_DIR/"
# rm LJSpeech-1.1.tar.bz2
# # compute dataset mean and variance for normalization
# # (uses the same config file as the training command below)
# python TTS/bin/compute_statistics.py "$RUN_DIR/tacotron2-DCA.json" "$RUN_DIR/scale_stats.npy" --data_path "$RUN_DIR/LJSpeech-1.1/wavs/"
# training ....
# change the GPU id if needed
# The dataset is read from $RUN_DIR/LJSpeech-1.1/, i.e. where the preparation
# steps above move it, instead of a path that only exists on one machine.
CUDA_VISIBLE_DEVICES="0" python TTS/bin/train_tts.py --config_path "$RUN_DIR/tacotron2-DCA.json" \
    --coqpit.output_path "$RUN_DIR" \
    --coqpit.datasets.0.path "$RUN_DIR/LJSpeech-1.1/" \
    --coqpit.audio.stats_path "$RUN_DIR/scale_stats.npy"

Binary file not shown.

View File

@ -0,0 +1,85 @@
{
"datasets": [
{
"name": "ljspeech",
"path": "DEFINE THIS",
"meta_file_train": "metadata.csv",
"meta_file_val": null
}
],
"audio": {
"fft_size": 1024,
"win_length": 1024,
"hop_length": 256,
"frame_length_ms": null,
"frame_shift_ms": null,
"sample_rate": 22050,
"preemphasis": 0.0,
"ref_level_db": 20,
"do_trim_silence": true,
"trim_db": 60,
"power": 1.5,
"griffin_lim_iters": 60,
"num_mels": 80,
"mel_fmin": 50.0,
"mel_fmax": 7600.0,
"spec_gain": 1,
"signal_norm": true,
"min_level_db": -100,
"symmetric_norm": true,
"max_norm": 4.0,
"clip_norm": true,
"stats_path": "scale_stats.npy"
},
"distributed_backend": "nccl",
"distributed_url": "tcp:\/\/localhost:54321",
"model": "Tacotron2",
"run_name": "ljspeech-dca",
"run_description": "tacotron2 with dynamic conv attention.",
"batch_size": 64,
"eval_batch_size": 16,
"mixed_precision": true,
"loss_masking": true,
"decoder_loss_alpha": 0.25,
"postnet_loss_alpha": 0.25,
"postnet_diff_spec_alpha": 0.25,
"decoder_diff_spec_alpha": 0.25,
"decoder_ssim_alpha": 0.25,
"postnet_ssim_alpha": 0.25,
"ga_alpha": 5.0,
"stopnet_pos_weight": 15.0,
"run_eval": true,
"test_delay_epochs": 10,
"max_decoder_steps": 50,
"noam_schedule": true,
"grad_clip": 0.05,
"epochs": 1000,
"lr": 0.001,
"wd": 1e-06,
"warmup_steps": 4000,
"memory_size": -1,
"prenet_type": "original",
"prenet_dropout": true,
"attention_type": "dynamic_convolution",
"location_attn": true,
"attention_norm": "sigmoid",
"r": 2,
"stopnet": true,
"separate_stopnet": true,
"print_step": 25,
"tb_plot_step": 100,
"print_eval": false,
"save_step": 10000,
"checkpoint": true,
"text_cleaner": "phoneme_cleaners",
"num_loader_workers": 4,
"num_val_loader_workers": 4,
"batch_group_size": 4,
"min_seq_len": 6,
"max_seq_len": 180,
"compute_input_seq_cache": true,
"output_path": "DEFINE THIS",
"phoneme_cache_path": "DEFINE THIS",
"use_phonemes": false,
"phoneme_language": "en-us"
}

Binary file not shown.

View File

@ -0,0 +1,29 @@
import os
from TTS.trainer import Trainer, init_training
from TTS.trainer import TrainingArgs
from TTS.vocoder.configs import WavegradConfig
def main():
    """Train a WaveGrad vocoder on the LJSpeech wav files.

    The wav files are expected in ``../LJSpeech-1.1/wavs/`` relative to this
    file and all training outputs are written next to this file.
    """
    output_path = os.path.dirname(os.path.abspath(__file__))
    config = WavegradConfig(
        batch_size=32,
        eval_batch_size=16,
        num_loader_workers=4,
        num_eval_loader_workers=4,
        run_eval=True,
        test_delay_epochs=-1,
        epochs=1000,
        seq_len=6144,
        pad_short=2000,
        use_noise_augment=True,
        eval_split_size=50,
        print_step=50,
        print_eval=True,
        mixed_precision=False,
        data_path=os.path.join(output_path, "../LJSpeech-1.1/wavs/"),
        output_path=output_path,
    )
    # init_training hands back its own config and output path; those returned
    # values (not the ones built above) are what the trainer receives.
    args, config, output_path, _, c_logger, tb_logger = init_training(TrainingArgs(), config)
    trainer = Trainer(args, config, output_path, c_logger, tb_logger)
    trainer.fit()


# Guard the entry point so that importing this file does not start a training
# run. NOTE(review): num_loader_workers > 0 presumably maps to data-loader
# worker processes; with the "spawn" start method those re-import the main
# script, which would otherwise re-run everything above.
if __name__ == "__main__":
    main()

View File

@ -0,0 +1,30 @@
import os
from TTS.trainer import Trainer, init_training, TrainingArgs
from TTS.vocoder.configs import WavernnConfig
def main():
    """Train a WaveRNN vocoder on the LJSpeech wav files.

    The wav files are expected in ``../LJSpeech-1.1/wavs/`` relative to this
    file and all training outputs are written next to this file. Unlike the
    other recipes, the trainer is created with ``cudnn_benchmark=True``.
    """
    output_path = os.path.dirname(os.path.abspath(__file__))
    config = WavernnConfig(
        batch_size=64,
        eval_batch_size=16,
        num_loader_workers=4,
        num_eval_loader_workers=4,
        run_eval=True,
        test_delay_epochs=-1,
        epochs=10000,
        seq_len=1280,
        pad_short=2000,
        use_noise_augment=False,
        eval_split_size=10,
        print_step=25,
        print_eval=True,
        mixed_precision=False,
        lr=1e-4,
        grad_clip=4,
        data_path=os.path.join(output_path, "../LJSpeech-1.1/wavs/"),
        output_path=output_path,
    )
    # init_training hands back its own config and output path; those returned
    # values (not the ones built above) are what the trainer receives.
    args, config, output_path, _, c_logger, tb_logger = init_training(TrainingArgs(), config)
    trainer = Trainer(args, config, output_path, c_logger, tb_logger, cudnn_benchmark=True)
    trainer.fit()


# Guard the entry point so that importing this file does not start a training
# run. NOTE(review): num_loader_workers > 0 presumably maps to data-loader
# worker processes; with the "spawn" start method those re-import the main
# script, which would otherwise re-run everything above.
if __name__ == "__main__":
    main()