From 29d61741ecdc9c377cf3ff3bda622233304e7127 Mon Sep 17 00:00:00 2001 From: Katsuya Iida Date: Sat, 29 May 2021 19:03:23 +0900 Subject: [PATCH] Copied recipe --- recipes/kokoro/tacotron2-DDC/run.sh | 22 +++++ .../kokoro/tacotron2-DDC/tacotron2-DDC.json | 91 +++++++++++++++++++ 2 files changed, 113 insertions(+) create mode 100644 recipes/kokoro/tacotron2-DDC/run.sh create mode 100644 recipes/kokoro/tacotron2-DDC/tacotron2-DDC.json diff --git a/recipes/kokoro/tacotron2-DDC/run.sh b/recipes/kokoro/tacotron2-DDC/run.sh new file mode 100644 index 00000000..eaa05b60 --- /dev/null +++ b/recipes/kokoro/tacotron2-DDC/run.sh @@ -0,0 +1,22 @@ +#!/bin/bash +# take the scripts's parent's directory to prefix all the output paths. +RUN_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" +echo $RUN_DIR +# download LJSpeech dataset +wget http://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2 +# extract +tar -xjf LJSpeech-1.1.tar.bz2 +# create train-val splits +shuf LJSpeech-1.1/metadata.csv > LJSpeech-1.1/metadata_shuf.csv +head -n 12000 LJSpeech-1.1/metadata_shuf.csv > LJSpeech-1.1/metadata_train.csv +tail -n 1100 LJSpeech-1.1/metadata_shuf.csv > LJSpeech-1.1/metadata_val.csv +mv LJSpeech-1.1 $RUN_DIR/ +rm LJSpeech-1.1.tar.bz2 +# compute dataset mean and variance for normalization +python TTS/bin/compute_statistics.py $RUN_DIR/tacotron2-DDC.json $RUN_DIR/scale_stats.npy --data_path $RUN_DIR/LJSpeech-1.1/wavs/ +# training .... +# change the GPU id if needed +CUDA_VISIBLE_DEVICES="0" python TTS/bin/train_tacotron.py --config_path $RUN_DIR/tacotron2-DDC.json \ + --coqpit.output_path $RUN_DIR \ + --coqpit.datasets.0.path $RUN_DIR/LJSpeech-1.1/ \ + --coqpit.audio.stats_path $RUN_DIR/scale_stats.npy \ \ No newline at end of file diff --git a/recipes/kokoro/tacotron2-DDC/tacotron2-DDC.json b/recipes/kokoro/tacotron2-DDC/tacotron2-DDC.json new file mode 100644 index 00000000..9cdbbd3b --- /dev/null +++ b/recipes/kokoro/tacotron2-DDC/tacotron2-DDC.json @@ -0,0 +1,91 @@ +{ + "datasets": [ + { + "name": "ljspeech", + "path": "DEFINE THIS", + "meta_file_train": "metadata.csv", + "meta_file_val": null + } + ], + "audio": { + "fft_size": 1024, + "win_length": 1024, + "hop_length": 256, + "frame_length_ms": null, + "frame_shift_ms": null, + "sample_rate": 22050, + "preemphasis": 0.0, + "ref_level_db": 20, + "do_trim_silence": true, + "trim_db": 60, + "power": 1.5, + "griffin_lim_iters": 60, + "num_mels": 80, + "mel_fmin": 50.0, + "mel_fmax": 7600.0, + "spec_gain": 1, + "signal_norm": true, + "min_level_db": -100, + "symmetric_norm": true, + "max_norm": 4.0, + "clip_norm": true, + "stats_path": "scale_stats.npy" + }, + "gst":{ + "gst_embedding_dim": 256, + "gst_num_heads": 4, + "gst_num_style_tokens": 10 + }, + "model": "Tacotron2", + "run_name": "ljspeech-ddc", + "run_description": "tacotron2 with double decoder consistency.", + "batch_size": 64, + "eval_batch_size": 16, + "mixed_precision": true, + "loss_masking": true, + "decoder_loss_alpha": 0.25, + "postnet_loss_alpha": 0.25, + "postnet_diff_spec_alpha": 0.25, + "decoder_diff_spec_alpha": 0.25, + "decoder_ssim_alpha": 0.25, + "postnet_ssim_alpha": 0.25, + "ga_alpha": 5.0, + "stopnet_pos_weight": 15.0, + "run_eval": true, + "test_delay_epochs": 10, + "test_sentences_file": null, + "noam_schedule": true, + "grad_clip": 0.05, + "epochs": 1000, + "lr": 0.001, + "wd": 1e-06, + "warmup_steps": 4000, + "memory_size": -1, + "prenet_type": "original", + "prenet_dropout": true, + "attention_type": "original", + "location_attn": true, + "double_decoder_consistency": true, + "ddc_r": 6, + "attention_norm": "sigmoid", + "r": 6, + "gradual_training": [[0, 6, 64], [10000, 4, 32], [50000, 3, 32], [100000, 2, 32]], + "stopnet": true, + "separate_stopnet": true, + "print_step": 25, + "tb_plot_step": 100, + "print_eval": false, + "save_step": 10000, + "checkpoint": true, + "text_cleaner": "phoneme_cleaners", + "num_loader_workers": 4, + "num_val_loader_workers": 4, + "batch_group_size": 4, + "min_seq_len": 6, + "max_seq_len": 180, + "compute_input_seq_cache": true, + "output_path": "DEFINE THIS", + "phoneme_cache_path": "DEFINE THIS", + "use_phonemes": false, + "phoneme_language": "en-us" +} \ No newline at end of file