Create LJSpeech recipes for all the models

2021-06-18 15:09:40 +02:00 · 2021-06-18 15:09:40 +02:00 · 418c7d98d5
parent 0636c91919
commit 418c7d98d5
11 changed files with 289 additions and 0 deletions
--- a/recipes/ljspeech/README.md
+++ b/recipes/ljspeech/README.md
@ -0,0 +1,19 @@
+# 🐸💬 TTS LJspeech Recipes
+
+To run the recipes:
+
+1. Download the LJSpeech dataset, either manually from [its official website](https://keithito.com/LJ-Speech-Dataset/) or by running ```download_ljspeech.sh```.
+2. Go to your desired model folder and run the training.
+
+    Running Python files. (Choose the desired GPU ID for your run and set ```CUDA_VISIBLE_DEVICES```)
+    ```terminal
+    CUDA_VISIBLE_DEVICES="0" python train_modelX.py
+    ```
+
+    Running bash scripts.
+    ```terminal
+    bash run.sh
+    ```
+
+💡 Note that these runs are just templates to help you start training your first model. They are not optimized for the best
+result. Double-check the configurations and feel free to share your experiments to find better parameters together 💪.
--- a/recipes/ljspeech/download_ljspeech.sh
+++ b/recipes/ljspeech/download_ljspeech.sh
@ -0,0 +1,14 @@
+#!/bin/bash
+# Download LJSpeech-1.1 into the current directory, create shuffled train/val
+# metadata splits and place the dataset folder next to this script.
+
+# Stop at the first failing step so a broken download is never extracted,
+# split or deleted as if it had succeeded.
+set -e
+
+# Take this script's directory to prefix all the output paths.
+RUN_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
+echo "$RUN_DIR"
+# download LJSpeech dataset
+wget http://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2
+# extract
+tar -xjf LJSpeech-1.1.tar.bz2
+# create train-val splits
+shuf LJSpeech-1.1/metadata.csv > LJSpeech-1.1/metadata_shuf.csv
+head -n 12000 LJSpeech-1.1/metadata_shuf.csv > LJSpeech-1.1/metadata_train.csv
+tail -n 1100 LJSpeech-1.1/metadata_shuf.csv > LJSpeech-1.1/metadata_val.csv
+# Move the dataset next to this script, unless it was already extracted there
+# (i.e. the script was launched from its own directory, where `mv` would fail).
+if ! [ LJSpeech-1.1 -ef "$RUN_DIR/LJSpeech-1.1" ]; then
+    mv LJSpeech-1.1 "$RUN_DIR/"
+fi
+rm LJSpeech-1.1.tar.bz2
--- a/recipes/ljspeech/glow_tts/train_glowtts.py
+++ b/recipes/ljspeech/glow_tts/train_glowtts.py
@ -0,0 +1,30 @@
+import os
+
+from TTS.tts.configs import GlowTTSConfig
+from TTS.tts.configs import BaseDatasetConfig
+from TTS.trainer import init_training, Trainer, TrainingArgs
+
+
+# Directory of this recipe script; reused as the output path and as the base
+# for the dataset and phoneme-cache locations.
+output_path = os.path.dirname(os.path.abspath(__file__))
+
+# LJSpeech is looked up one directory above this recipe folder.
+dataset_config = BaseDatasetConfig(
+    name="ljspeech",
+    meta_file_train="metadata.csv",
+    path=os.path.join(output_path, "../LJSpeech-1.1/"),
+)
+
+config = GlowTTSConfig(
+    # data and output locations
+    datasets=[dataset_config],
+    output_path=output_path,
+    phoneme_cache_path=os.path.join(output_path, "phoneme_cache"),
+    # text processing
+    text_cleaner="english_cleaners",
+    use_phonemes=False,
+    phoneme_language="en-us",
+    # batching and loader workers
+    batch_size=32,
+    eval_batch_size=16,
+    num_loader_workers=4,
+    num_eval_loader_workers=4,
+    # training
+    epochs=1000,
+    mixed_precision=False,
+    # evaluation and console reporting
+    run_eval=True,
+    test_delay_epochs=-1,
+    print_step=25,
+    print_eval=True,
+)
+
+# `config` and `output_path` are rebound to the values init_training returns.
+training_setup = init_training(TrainingArgs(), config)
+args, config, output_path, _, c_logger, tb_logger = training_setup
+trainer = Trainer(args, config, output_path, c_logger, tb_logger)
+trainer.fit()
--- a/recipes/ljspeech/hifigan/train_hifigan.py
+++ b/recipes/ljspeech/hifigan/train_hifigan.py
@ -0,0 +1,30 @@
+import os
+
+from TTS.vocoder.configs import HifiganConfig
+from TTS.trainer import init_training, Trainer, TrainingArgs
+
+
+# Directory of this recipe script; reused as the output path and as the base
+# for locating the dataset.
+output_path = os.path.dirname(os.path.abspath(__file__))
+
+# LJSpeech wav files are looked up one directory above this recipe folder.
+wav_dir = os.path.join(output_path, "../LJSpeech-1.1/wavs/")
+
+config = HifiganConfig(
+    # data and output locations
+    data_path=wav_dir,
+    output_path=output_path,
+    eval_split_size=10,
+    seq_len=8192,
+    pad_short=2000,
+    use_noise_augment=True,
+    # batching and loader workers
+    batch_size=32,
+    eval_batch_size=16,
+    num_loader_workers=4,
+    num_eval_loader_workers=4,
+    # training
+    epochs=1000,
+    lr_gen=1e-4,
+    lr_disc=1e-4,
+    mixed_precision=False,
+    # evaluation and console reporting
+    run_eval=True,
+    test_delay_epochs=-1,
+    print_step=25,
+    print_eval=True,
+)
+
+# `config` and `output_path` are rebound to the values init_training returns.
+training_setup = init_training(TrainingArgs(), config)
+args, config, output_path, _, c_logger, tb_logger = training_setup
+trainer = Trainer(args, config, output_path, c_logger, tb_logger)
+trainer.fit()
--- a/recipes/ljspeech/multiband_melgan/train_multiband_melgan.py
+++ b/recipes/ljspeech/multiband_melgan/train_multiband_melgan.py
@ -0,0 +1,30 @@
+import os
+
+from TTS.vocoder.configs import MultibandMelganConfig
+from TTS.trainer import init_training, Trainer, TrainingArgs
+
+
+# Directory of this recipe script; reused as the output path and as the base
+# for locating the dataset.
+output_path = os.path.dirname(os.path.abspath(__file__))
+
+# LJSpeech wav files are looked up one directory above this recipe folder.
+wav_dir = os.path.join(output_path, "../LJSpeech-1.1/wavs/")
+
+config = MultibandMelganConfig(
+    # data and output locations
+    data_path=wav_dir,
+    output_path=output_path,
+    eval_split_size=10,
+    seq_len=8192,
+    pad_short=2000,
+    use_noise_augment=True,
+    # batching and loader workers
+    batch_size=32,
+    eval_batch_size=16,
+    num_loader_workers=4,
+    num_eval_loader_workers=4,
+    # training
+    epochs=1000,
+    lr_gen=1e-4,
+    lr_disc=1e-4,
+    mixed_precision=False,
+    # evaluation and console reporting
+    run_eval=True,
+    test_delay_epochs=-1,
+    print_step=25,
+    print_eval=True,
+)
+
+# `config` and `output_path` are rebound to the values init_training returns.
+training_setup = init_training(TrainingArgs(), config)
+args, config, output_path, _, c_logger, tb_logger = training_setup
+trainer = Trainer(args, config, output_path, c_logger, tb_logger)
+trainer.fit()
--- a/recipes/ljspeech/tacotron2-DCA/run.sh
+++ b/recipes/ljspeech/tacotron2-DCA/run.sh
@ -0,0 +1,22 @@
+#!/bin/bash
+# Train Tacotron2 with dynamic convolution attention (DCA) on LJSpeech.
+# Launch it from the directory that contains the TTS/ folder: the
+# TTS/bin/... script paths below are relative to the working directory.
+
+# Take this script's directory to prefix all the output paths.
+RUN_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
+echo "$RUN_DIR"
+# Dataset location. Defaults to the LJSpeech-1.1 folder one level above this
+# recipe; export DATA_DIR to point somewhere else (for example
+# "$RUN_DIR/LJSpeech-1.1/" when the commented-out download steps below are used).
+DATA_DIR="${DATA_DIR:-$RUN_DIR/../LJSpeech-1.1/}"
+# # download LJSpeech dataset
+# wget http://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2
+# # extract
+# tar -xjf LJSpeech-1.1.tar.bz2
+# # create train-val splits
+# shuf LJSpeech-1.1/metadata.csv > LJSpeech-1.1/metadata_shuf.csv
+# head -n 12000 LJSpeech-1.1/metadata_shuf.csv > LJSpeech-1.1/metadata_train.csv
+# tail -n 1100 LJSpeech-1.1/metadata_shuf.csv > LJSpeech-1.1/metadata_val.csv
+# mv LJSpeech-1.1 $RUN_DIR/
+# rm LJSpeech-1.1.tar.bz2
+# # compute dataset mean and variance for normalization
+# python TTS/bin/compute_statistics.py $RUN_DIR/tacotron2-DCA.json $RUN_DIR/scale_stats.npy --data_path $RUN_DIR/LJSpeech-1.1/wavs/
+# training ....
+# change the GPU id if needed
+CUDA_VISIBLE_DEVICES="0" python TTS/bin/train_tts.py --config_path "$RUN_DIR/tacotron2-DCA.json" \
+                                                     --coqpit.output_path "$RUN_DIR" \
+                                                     --coqpit.datasets.0.path "$DATA_DIR" \
+                                                     --coqpit.audio.stats_path "$RUN_DIR/scale_stats.npy"
--- a/recipes/ljspeech/tacotron2-DCA/scale_stats.npy
+++ b/recipes/ljspeech/tacotron2-DCA/scale_stats.npy
--- a/recipes/ljspeech/tacotron2-DCA/tacotron2-DCA.json
+++ b/recipes/ljspeech/tacotron2-DCA/tacotron2-DCA.json
@ -0,0 +1,85 @@
+{
+    "datasets": [
+        {
+            "name": "ljspeech",
+            "path": "DEFINE THIS",
+            "meta_file_train": "metadata.csv",
+            "meta_file_val": null
+        }
+    ],
+    "audio": {
+        "fft_size": 1024,
+        "win_length": 1024,
+        "hop_length": 256,
+        "frame_length_ms": null,
+        "frame_shift_ms": null,
+        "sample_rate": 22050,
+        "preemphasis": 0.0,
+        "ref_level_db": 20,
+        "do_trim_silence": true,
+        "trim_db": 60,
+        "power": 1.5,
+        "griffin_lim_iters": 60,
+        "num_mels": 80,
+        "mel_fmin": 50.0,
+        "mel_fmax": 7600.0,
+        "spec_gain": 1,
+        "signal_norm": true,
+        "min_level_db": -100,
+        "symmetric_norm": true,
+        "max_norm": 4.0,
+        "clip_norm": true,
+        "stats_path": "scale_stats.npy"
+    },
+    "distributed_backend": "nccl",
+    "distributed_url": "tcp:\/\/localhost:54321",
+    "model": "Tacotron2",
+    "run_name": "ljspeech-dca",
+    "run_description": "tacotron2 with dynamic conv attention.",
+    "batch_size": 64,
+    "eval_batch_size": 16,
+    "mixed_precision": true,
+    "loss_masking": true,
+    "decoder_loss_alpha": 0.25,
+    "postnet_loss_alpha": 0.25,
+    "postnet_diff_spec_alpha": 0.25,
+    "decoder_diff_spec_alpha": 0.25,
+    "decoder_ssim_alpha": 0.25,
+    "postnet_ssim_alpha": 0.25,
+    "ga_alpha": 5.0,
+    "stopnet_pos_weight": 15.0,
+    "run_eval": true,
+    "test_delay_epochs": 10,
+    "max_decoder_steps": 50,
+    "noam_schedule": true,
+    "grad_clip": 0.05,
+    "epochs": 1000,
+    "lr": 0.001,
+    "wd": 1e-06,
+    "warmup_steps": 4000,
+    "memory_size": -1,
+    "prenet_type": "original",
+    "prenet_dropout": true,
+    "attention_type": "dynamic_convolution",
+    "location_attn": true,
+    "attention_norm": "sigmoid",
+    "r": 2,
+    "stopnet": true,
+    "separate_stopnet": true,
+    "print_step": 25,
+    "tb_plot_step": 100,
+    "print_eval": false,
+    "save_step": 10000,
+    "checkpoint": true,
+    "text_cleaner": "phoneme_cleaners",
+    "num_loader_workers": 4,
+    "num_val_loader_workers": 4,
+    "batch_group_size": 4,
+    "min_seq_len": 6,
+    "max_seq_len": 180,
+    "compute_input_seq_cache": true,
+    "output_path": "DEFINE THIS",
+    "phoneme_cache_path": "DEFINE THIS",
+    "use_phonemes": false,
+    "phoneme_language": "en-us"
+}
--- a/recipes/ljspeech/tacotron2-DDC/scale_stats.npy
+++ b/recipes/ljspeech/tacotron2-DDC/scale_stats.npy
--- a/recipes/ljspeech/wavegrad/train_wavegrad.py
+++ b/recipes/ljspeech/wavegrad/train_wavegrad.py
@ -0,0 +1,29 @@
+import os
+
+from TTS.trainer import Trainer, init_training
+from TTS.trainer import TrainingArgs
+from TTS.vocoder.configs import WavegradConfig
+
+
+# Directory of this recipe script; reused as the output path and as the base
+# for locating the dataset.
+output_path = os.path.dirname(os.path.abspath(__file__))
+
+# LJSpeech wav files are looked up one directory above this recipe folder.
+wav_dir = os.path.join(output_path, "../LJSpeech-1.1/wavs/")
+
+config = WavegradConfig(
+    # data and output locations
+    data_path=wav_dir,
+    output_path=output_path,
+    eval_split_size=50,
+    seq_len=6144,
+    pad_short=2000,
+    use_noise_augment=True,
+    # batching and loader workers
+    batch_size=32,
+    eval_batch_size=16,
+    num_loader_workers=4,
+    num_eval_loader_workers=4,
+    # training
+    epochs=1000,
+    mixed_precision=False,
+    # evaluation and console reporting
+    run_eval=True,
+    test_delay_epochs=-1,
+    print_step=50,
+    print_eval=True,
+)
+
+# `config` and `output_path` are rebound to the values init_training returns.
+training_setup = init_training(TrainingArgs(), config)
+args, config, output_path, _, c_logger, tb_logger = training_setup
+trainer = Trainer(args, config, output_path, c_logger, tb_logger)
+trainer.fit()
--- a/recipes/ljspeech/wavernn/train_wavernn.py
+++ b/recipes/ljspeech/wavernn/train_wavernn.py
@ -0,0 +1,30 @@
+import os
+
+from TTS.trainer import Trainer, init_training, TrainingArgs
+from TTS.vocoder.configs import WavernnConfig
+
+
+# Directory of this recipe script; reused as the output path and as the base
+# for locating the dataset.
+output_path = os.path.dirname(os.path.abspath(__file__))
+
+# LJSpeech wav files are looked up one directory above this recipe folder.
+wav_dir = os.path.join(output_path, "../LJSpeech-1.1/wavs/")
+
+config = WavernnConfig(
+    # data and output locations
+    data_path=wav_dir,
+    output_path=output_path,
+    eval_split_size=10,
+    seq_len=1280,
+    pad_short=2000,
+    use_noise_augment=False,
+    # batching and loader workers
+    batch_size=64,
+    eval_batch_size=16,
+    num_loader_workers=4,
+    num_eval_loader_workers=4,
+    # training
+    epochs=10000,
+    lr=1e-4,
+    grad_clip=4,
+    mixed_precision=False,
+    # evaluation and console reporting
+    run_eval=True,
+    test_delay_epochs=-1,
+    print_step=25,
+    print_eval=True,
+)
+
+# `config` and `output_path` are rebound to the values init_training returns.
+training_setup = init_training(TrainingArgs(), config)
+args, config, output_path, _, c_logger, tb_logger = training_setup
+# Unlike the other recipes, this one passes cudnn_benchmark=True to the Trainer.
+trainer = Trainer(args, config, output_path, c_logger, tb_logger, cudnn_benchmark=True)
+trainer.fit()