diff --git a/recipes/ljspeech/README.md b/recipes/ljspeech/README.md
new file mode 100644
index 00000000..94508a7f
--- /dev/null
+++ b/recipes/ljspeech/README.md
@@ -0,0 +1,19 @@
+# 🐸💬 TTS LJSpeech Recipes
+
+To run the recipes:
+
+1. Download the LJSpeech dataset, either manually from [its official website](https://keithito.com/LJ-Speech-Dataset/) or using ```download_ljspeech.sh```.
+2. Go to your desired model folder and run the training.
+
+    Running Python files. (Choose the desired GPU ID for your run and set ```CUDA_VISIBLE_DEVICES```)
+    ```terminal
+    CUDA_VISIBLE_DEVICES="0" python train_modelX.py
+    ```
+
+    Running bash scripts.
+    ```terminal
+    bash run.sh
+    ```
+
+💡 Note that these runs are just templates to help you start training your first model. They are not optimized for the best
+result. Double-check the configurations and feel free to share your experiments to find better parameters together 💪.
diff --git a/recipes/ljspeech/download_ljspeech.sh b/recipes/ljspeech/download_ljspeech.sh
new file mode 100644
index 00000000..14ef058d
--- /dev/null
+++ b/recipes/ljspeech/download_ljspeech.sh
@@ -0,0 +1,14 @@
+#!/bin/bash
+# take the script's parent directory to prefix all the output paths.
+RUN_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
+echo "$RUN_DIR"
+# download LJSpeech dataset (into the current working directory)
+wget http://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2
+# extract
+tar -xjf LJSpeech-1.1.tar.bz2
+# create train-val splits (first 12000 shuffled lines for training, last 1100 for validation)
+shuf LJSpeech-1.1/metadata.csv > LJSpeech-1.1/metadata_shuf.csv
+head -n 12000 LJSpeech-1.1/metadata_shuf.csv > LJSpeech-1.1/metadata_train.csv
+tail -n 1100 LJSpeech-1.1/metadata_shuf.csv > LJSpeech-1.1/metadata_val.csv
+mv LJSpeech-1.1 "$RUN_DIR/"
+rm LJSpeech-1.1.tar.bz2
\ No newline at end of file
diff --git a/recipes/ljspeech/glow_tts/train_glowtts.py b/recipes/ljspeech/glow_tts/train_glowtts.py
new file mode 100644
index 00000000..0a3c3838
--- /dev/null
+++ b/recipes/ljspeech/glow_tts/train_glowtts.py
@@ -0,0 +1,30 @@
+import os
+
+from TTS.tts.configs import GlowTTSConfig
+from TTS.tts.configs import BaseDatasetConfig
+from TTS.trainer import init_training, Trainer, TrainingArgs
+
+
+output_path = os.path.dirname(os.path.abspath(__file__))  # absolute directory of this script
+dataset_config = BaseDatasetConfig(name="ljspeech", meta_file_train="metadata.csv", path=os.path.join(output_path, "../LJSpeech-1.1/"))  # dataset folder is resolved one level above this script's directory
+config = GlowTTSConfig(
+    batch_size=32,
+    eval_batch_size=16,
+    num_loader_workers=4,
+    num_eval_loader_workers=4,
+    run_eval=True,
+    test_delay_epochs=-1,
+    epochs=1000,
+    text_cleaner="english_cleaners",
+    use_phonemes=False,
+    phoneme_language="en-us",
+    phoneme_cache_path=os.path.join(output_path, "phoneme_cache"),
+    print_step=25,
+    print_eval=True,
+    mixed_precision=False,
+    output_path=output_path,  # the script's own directory is handed to the config as output_path
+    datasets=[dataset_config]
+)
+args, config, output_path, _, c_logger, tb_logger = init_training(TrainingArgs(), config)  # rebinds config and output_path from init_training's result; the 4th returned item is discarded
+trainer = Trainer(args, config, output_path, c_logger, tb_logger)
+trainer.fit()
diff --git a/recipes/ljspeech/hifigan/train_hifigan.py b/recipes/ljspeech/hifigan/train_hifigan.py
new file mode 100644
index 00000000..99b39e99
--- /dev/null
+++ b/recipes/ljspeech/hifigan/train_hifigan.py
@@ -0,0 +1,30 @@
+import os
+
+from TTS.vocoder.configs import HifiganConfig
+from TTS.trainer import init_training, Trainer, TrainingArgs
+
+
+output_path = os.path.dirname(os.path.abspath(__file__))  # absolute directory of this script
+config = HifiganConfig(
+    batch_size=32,
+    eval_batch_size=16,
+    num_loader_workers=4,
+    num_eval_loader_workers=4,
+    run_eval=True,
+    test_delay_epochs=-1,
+    epochs=1000,
+    seq_len=8192,
+    pad_short=2000,
+    use_noise_augment=True,
+    eval_split_size=10,
+    print_step=25,
+    print_eval=True,
+    mixed_precision=False,
+    lr_gen=1e-4,
+    lr_disc=1e-4,
+    data_path=os.path.join(output_path, "../LJSpeech-1.1/wavs/"),  # wavs folder of LJSpeech-1.1, resolved one level above this script's directory
+    output_path=output_path,  # the script's own directory is handed to the config as output_path
+)
+args, config, output_path, _, c_logger, tb_logger = init_training(TrainingArgs(), config)  # rebinds config and output_path from init_training's result; the 4th returned item is discarded
+trainer = Trainer(args, config, output_path, c_logger, tb_logger)
+trainer.fit()
diff --git a/recipes/ljspeech/multiband_melgan/train_multiband_melgan.py b/recipes/ljspeech/multiband_melgan/train_multiband_melgan.py
new file mode 100644
index 00000000..6b766ab7
--- /dev/null
+++ b/recipes/ljspeech/multiband_melgan/train_multiband_melgan.py
@@ -0,0 +1,30 @@
+import os
+
+from TTS.vocoder.configs import MultibandMelganConfig
+from TTS.trainer import init_training, Trainer, TrainingArgs
+
+
+output_path = os.path.dirname(os.path.abspath(__file__))  # absolute directory of this script
+config = MultibandMelganConfig(
+    batch_size=32,
+    eval_batch_size=16,
+    num_loader_workers=4,
+    num_eval_loader_workers=4,
+    run_eval=True,
+    test_delay_epochs=-1,
+    epochs=1000,
+    seq_len=8192,
+    pad_short=2000,
+    use_noise_augment=True,
+    eval_split_size=10,
+    print_step=25,
+    print_eval=True,
+    mixed_precision=False,
+    lr_gen=1e-4,
+    lr_disc=1e-4,
+    data_path=os.path.join(output_path, "../LJSpeech-1.1/wavs/"),  # wavs folder of LJSpeech-1.1, resolved one level above this script's directory
+    output_path=output_path,  # the script's own directory is handed to the config as output_path
+)
+args, config, output_path, _, c_logger, tb_logger = init_training(TrainingArgs(), config)  # rebinds config and output_path from init_training's result; the 4th returned item is discarded
+trainer = Trainer(args, config, output_path, c_logger, tb_logger)
+trainer.fit()
diff --git a/recipes/ljspeech/tacotron2-DCA/run.sh b/recipes/ljspeech/tacotron2-DCA/run.sh
new file mode 100644
index 
00000000..8bcd9e3d
--- /dev/null
+++ b/recipes/ljspeech/tacotron2-DCA/run.sh
@@ -0,0 +1,22 @@
+#!/bin/bash
+# take the script's parent directory to prefix all the output paths.
+RUN_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
+echo "$RUN_DIR"
+# # download LJSpeech dataset
+# wget http://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2
+# # extract
+# tar -xjf LJSpeech-1.1.tar.bz2
+# # create train-val splits
+# shuf LJSpeech-1.1/metadata.csv > LJSpeech-1.1/metadata_shuf.csv
+# head -n 12000 LJSpeech-1.1/metadata_shuf.csv > LJSpeech-1.1/metadata_train.csv
+# tail -n 1100 LJSpeech-1.1/metadata_shuf.csv > LJSpeech-1.1/metadata_val.csv
+# mv LJSpeech-1.1 "$RUN_DIR/../"
+# rm LJSpeech-1.1.tar.bz2
+# # compute dataset mean and variance for normalization
+# python TTS/bin/compute_statistics.py "$RUN_DIR/tacotron2-DCA.json" "$RUN_DIR/scale_stats.npy" --data_path "$RUN_DIR/../LJSpeech-1.1/wavs/"
+# training: reads the dataset from $RUN_DIR/../LJSpeech-1.1/; the relative TTS/bin path assumes the current directory is the repository root
+# change the GPU id if needed
+CUDA_VISIBLE_DEVICES="0" python TTS/bin/train_tts.py --config_path "$RUN_DIR/tacotron2-DCA.json" \
+    --coqpit.output_path "$RUN_DIR" \
+    --coqpit.datasets.0.path "$RUN_DIR/../LJSpeech-1.1/" \
+    --coqpit.audio.stats_path "$RUN_DIR/scale_stats.npy"
\ No newline at end of file
diff --git a/recipes/ljspeech/tacotron2-DCA/scale_stats.npy b/recipes/ljspeech/tacotron2-DCA/scale_stats.npy
new file mode 100644
index 00000000..1dc577a6
Binary files /dev/null and b/recipes/ljspeech/tacotron2-DCA/scale_stats.npy differ
diff --git a/recipes/ljspeech/tacotron2-DCA/tacotron2-DCA.json b/recipes/ljspeech/tacotron2-DCA/tacotron2-DCA.json
new file mode 100644
index 00000000..c5b6fa52
--- /dev/null
+++ b/recipes/ljspeech/tacotron2-DCA/tacotron2-DCA.json
@@ -0,0 +1,85 @@
+{
+    "datasets": [
+        {
+            "name": "ljspeech",
+            "path": "DEFINE THIS",
+            "meta_file_train": "metadata.csv",
+            "meta_file_val": null
+        }
+    ],
+    "audio": {
+        "fft_size": 1024,
+        "win_length": 1024,
+        "hop_length": 256,
+        "frame_length_ms": null,
+        "frame_shift_ms": null,
+        "sample_rate": 22050,
+        "preemphasis": 0.0,
+        "ref_level_db": 20,
+        "do_trim_silence": true,
+        "trim_db": 60,
+        "power": 1.5,
+        "griffin_lim_iters": 60,
+        "num_mels": 80,
+        "mel_fmin": 50.0,
+        "mel_fmax": 7600.0,
+        "spec_gain": 1,
+        "signal_norm": true,
+        "min_level_db": -100,
+        "symmetric_norm": true,
+        "max_norm": 4.0,
+        "clip_norm": true,
+        "stats_path": "scale_stats.npy"
+    },
+    "distributed_backend": "nccl",
+    "distributed_url": "tcp:\/\/localhost:54321",
+    "model": "Tacotron2",
+    "run_name": "ljspeech-dca",
+    "run_description": "tacotron2 with dynamic conv attention.",
+    "batch_size": 64,
+    "eval_batch_size": 16,
+    "mixed_precision": true,
+    "loss_masking": true,
+    "decoder_loss_alpha": 0.25,
+    "postnet_loss_alpha": 0.25,
+    "postnet_diff_spec_alpha": 0.25,
+    "decoder_diff_spec_alpha": 0.25,
+    "decoder_ssim_alpha": 0.25,
+    "postnet_ssim_alpha": 0.25,
+    "ga_alpha": 5.0,
+    "stopnet_pos_weight": 15.0,
+    "run_eval": true,
+    "test_delay_epochs": 10,
+    "max_decoder_steps": 50,
+    "noam_schedule": true,
+    "grad_clip": 0.05,
+    "epochs": 1000,
+    "lr": 0.001,
+    "wd": 1e-06,
+    "warmup_steps": 4000,
+    "memory_size": -1,
+    "prenet_type": "original",
+    "prenet_dropout": true,
+    "attention_type": "dynamic_convolution",
+    "location_attn": true,
+    "attention_norm": "sigmoid",
+    "r": 2,
+    "stopnet": true,
+    "separate_stopnet": true,
+    "print_step": 25,
+    "tb_plot_step": 100,
+    "print_eval": false,
+    "save_step": 10000,
+    "checkpoint": true,
+    "text_cleaner": "phoneme_cleaners",
+    "num_loader_workers": 4,
+    "num_val_loader_workers": 4,
+    "batch_group_size": 4,
+    "min_seq_len": 6,
+    "max_seq_len": 180,
+    "compute_input_seq_cache": true,
+    "output_path": "DEFINE THIS",
+    "phoneme_cache_path": "DEFINE THIS",
+    "use_phonemes": false,
+    "phoneme_language": "en-us"
+}
diff --git a/recipes/ljspeech/tacotron2-DDC/scale_stats.npy b/recipes/ljspeech/tacotron2-DDC/scale_stats.npy
new file mode 100644
index 00000000..1dc577a6
Binary files 
/dev/null and b/recipes/ljspeech/tacotron2-DDC/scale_stats.npy differ
diff --git a/recipes/ljspeech/wavegrad/train_wavegrad.py b/recipes/ljspeech/wavegrad/train_wavegrad.py
new file mode 100644
index 00000000..323b2bb7
--- /dev/null
+++ b/recipes/ljspeech/wavegrad/train_wavegrad.py
@@ -0,0 +1,29 @@
+import os
+
+from TTS.trainer import Trainer, init_training
+from TTS.trainer import TrainingArgs
+from TTS.vocoder.configs import WavegradConfig
+
+
+output_path = os.path.dirname(os.path.abspath(__file__))  # absolute directory of this script
+config = WavegradConfig(
+    batch_size=32,
+    eval_batch_size=16,
+    num_loader_workers=4,
+    num_eval_loader_workers=4,
+    run_eval=True,
+    test_delay_epochs=-1,
+    epochs=1000,
+    seq_len=6144,
+    pad_short=2000,
+    use_noise_augment=True,
+    eval_split_size=50,
+    print_step=50,
+    print_eval=True,
+    mixed_precision=False,
+    data_path=os.path.join(output_path, "../LJSpeech-1.1/wavs/"),  # wavs folder of LJSpeech-1.1, resolved one level above this script's directory
+    output_path=output_path,  # the script's own directory is handed to the config as output_path
+)
+args, config, output_path, _, c_logger, tb_logger = init_training(TrainingArgs(), config)  # rebinds config and output_path from init_training's result; the 4th returned item is discarded
+trainer = Trainer(args, config, output_path, c_logger, tb_logger)
+trainer.fit()
diff --git a/recipes/ljspeech/wavernn/train_wavernn.py b/recipes/ljspeech/wavernn/train_wavernn.py
new file mode 100644
index 00000000..76ff722a
--- /dev/null
+++ b/recipes/ljspeech/wavernn/train_wavernn.py
@@ -0,0 +1,30 @@
+import os
+
+from TTS.trainer import Trainer, init_training, TrainingArgs
+from TTS.vocoder.configs import WavernnConfig
+
+
+output_path = os.path.dirname(os.path.abspath(__file__))  # absolute directory of this script
+config = WavernnConfig(
+    batch_size=64,
+    eval_batch_size=16,
+    num_loader_workers=4,
+    num_eval_loader_workers=4,
+    run_eval=True,
+    test_delay_epochs=-1,
+    epochs=10000,
+    seq_len=1280,
+    pad_short=2000,
+    use_noise_augment=False,
+    eval_split_size=10,
+    print_step=25,
+    print_eval=True,
+    mixed_precision=False,
+    lr=1e-4,
+    grad_clip=4,
+    data_path=os.path.join(output_path, "../LJSpeech-1.1/wavs/"),  # wavs folder of LJSpeech-1.1, resolved one level above this script's directory
+    output_path=output_path,  # the script's own directory is handed to the config as output_path
+)
+args, config, output_path, _, c_logger, tb_logger = init_training(TrainingArgs(), config)  # rebinds config and output_path from init_training's result; the 4th returned item is discarded
+trainer = Trainer(args, config, output_path, c_logger, tb_logger, cudnn_benchmark=True)  # this recipe additionally passes cudnn_benchmark=True to Trainer
+trainer.fit()