From 71180c796227b9bcad75a43c1bd9cad53f99d1e2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eren=20G=C3=B6lge?=
Date: Thu, 21 Oct 2021 16:19:19 +0000
Subject: [PATCH] =?UTF-8?q?VCTK=20recipes=20(finally=20=F0=9F=9A=80)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 recipes/ljspeech/align_tts/train_aligntts.py | 2 +-
 .../ljspeech/fast_pitch/train_fast_pitch.py | 4 +-
 .../ljspeech/fast_speech/train_fast_speech.py | 2 +-
 recipes/ljspeech/glow_tts/train_glowtts.py | 3 +-
 .../speedy_speech/train_speedy_speech.py | 2 +-
 recipes/vctk/download_vctk.sh | 12 +++
 recipes/vctk/fast_pitch/train_fast_pitch.py | 80 +++++++++++++++++
 recipes/vctk/fast_speech/train_fast_speech.py | 80 +++++++++++++++++
 recipes/vctk/glow_tts/train_glow_tts.py | 62 +++++++++++++
 .../vctk/speedy_speech/train_speedy_speech.py | 80 +++++++++++++++++
 .../vctk/tacotron-DDC/train_tacotron-DDC.py | 80 +++++++++++++++++
 .../vctk/tacotron2-DDC/train_tacotron2-ddc.py | 87 +++++++++++++++++++
 recipes/vctk/vits/train_vits.py | 86 ++++++++++++++++++
 13 files changed, 574 insertions(+), 6 deletions(-)
 create mode 100644 recipes/vctk/download_vctk.sh
 create mode 100644 recipes/vctk/fast_pitch/train_fast_pitch.py
 create mode 100644 recipes/vctk/fast_speech/train_fast_speech.py
 create mode 100644 recipes/vctk/glow_tts/train_glow_tts.py
 create mode 100644 recipes/vctk/speedy_speech/train_speedy_speech.py
 create mode 100644 recipes/vctk/tacotron-DDC/train_tacotron-DDC.py
 create mode 100644 recipes/vctk/tacotron2-DDC/train_tacotron2-ddc.py
 create mode 100644 recipes/vctk/vits/train_vits.py

diff --git a/recipes/ljspeech/align_tts/train_aligntts.py b/recipes/ljspeech/align_tts/train_aligntts.py
index 76409374..68b67d66 100644
--- a/recipes/ljspeech/align_tts/train_aligntts.py
+++ b/recipes/ljspeech/align_tts/train_aligntts.py
@@ -1,7 +1,7 @@
 import os
 
 from TTS.trainer import Trainer, TrainingArgs
-from TTS.tts.configs import AlignTTSConfig, BaseDatasetConfig
+from TTS.tts.configs.align_tts_config import AlignTTSConfig, BaseDatasetConfig
 from TTS.tts.datasets import load_tts_samples
 from TTS.tts.models.align_tts import AlignTTS
 from TTS.utils.audio import AudioProcessor
diff --git a/recipes/ljspeech/fast_pitch/train_fast_pitch.py b/recipes/ljspeech/fast_pitch/train_fast_pitch.py
index fead67a0..0a4a965b 100644
--- a/recipes/ljspeech/fast_pitch/train_fast_pitch.py
+++ b/recipes/ljspeech/fast_pitch/train_fast_pitch.py
@@ -1,8 +1,8 @@
 import os
 
-from TTS.config import BaseAudioConfig, BaseDatasetConfig
+from TTS.config.shared_configs import BaseAudioConfig, BaseDatasetConfig
 from TTS.trainer import Trainer, TrainingArgs
-from TTS.tts.configs import FastPitchConfig
+from TTS.tts.configs.fast_pitch_config import FastPitchConfig
 from TTS.tts.datasets import load_tts_samples
 from TTS.tts.models.forward_tts import ForwardTTS
 from TTS.utils.audio import AudioProcessor
diff --git a/recipes/ljspeech/fast_speech/train_fast_speech.py b/recipes/ljspeech/fast_speech/train_fast_speech.py
index 56557c26..a71da94b 100644
--- a/recipes/ljspeech/fast_speech/train_fast_speech.py
+++ b/recipes/ljspeech/fast_speech/train_fast_speech.py
@@ -2,7 +2,7 @@ import os
 
 from TTS.config import BaseAudioConfig, BaseDatasetConfig
 from TTS.trainer import Trainer, TrainingArgs
-from TTS.tts.configs import FastSpeechConfig
+from TTS.tts.configs.fast_speech_config import FastSpeechConfig
 from TTS.tts.datasets import load_tts_samples
 from TTS.tts.models.forward_tts import ForwardTTS
 from TTS.utils.audio import AudioProcessor
diff --git a/recipes/ljspeech/glow_tts/train_glowtts.py b/recipes/ljspeech/glow_tts/train_glowtts.py
index 29077eeb..6cfa3878 100644
--- a/recipes/ljspeech/glow_tts/train_glowtts.py
+++ b/recipes/ljspeech/glow_tts/train_glowtts.py
@@ -1,7 +1,8 @@
 import os
 
 from TTS.trainer import Trainer, TrainingArgs
-from TTS.tts.configs import BaseDatasetConfig, GlowTTSConfig
+from TTS.tts.configs.glow_tts_config import GlowTTSConfig
+from TTS.tts.configs.shared_configs import BaseDatasetConfig
 from TTS.tts.datasets import load_tts_samples
 from TTS.tts.models.glow_tts import GlowTTS
 from TTS.utils.audio import AudioProcessor
diff --git a/recipes/ljspeech/speedy_speech/train_speedy_speech.py b/recipes/ljspeech/speedy_speech/train_speedy_speech.py
index 974823ac..6b9683af 100644
--- a/recipes/ljspeech/speedy_speech/train_speedy_speech.py
+++ b/recipes/ljspeech/speedy_speech/train_speedy_speech.py
@@ -2,7 +2,7 @@ import os
 
 from TTS.config import BaseAudioConfig, BaseDatasetConfig
 from TTS.trainer import Trainer, TrainingArgs
-from TTS.tts.configs import SpeedySpeechConfig
+from TTS.tts.configs.speedy_speech_config import SpeedySpeechConfig
 from TTS.tts.datasets import load_tts_samples
 from TTS.tts.models.forward_tts import ForwardTTS
 from TTS.utils.audio import AudioProcessor
diff --git a/recipes/vctk/download_vctk.sh b/recipes/vctk/download_vctk.sh
new file mode 100644
index 00000000..c0cea743
--- /dev/null
+++ b/recipes/vctk/download_vctk.sh
@@ -0,0 +1,12 @@
+#!/usr/bin/env bash
+# use the directory containing this script to prefix all the output paths.
+RUN_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
+echo $RUN_DIR
+# download the VCTK dataset
+wget https://datashare.ed.ac.uk/bitstream/handle/10283/3443/VCTK-Corpus-0.92.zip -O VCTK-Corpus-0.92.zip
+# extract
+mkdir VCTK
+unzip VCTK-Corpus-0.92 -d VCTK
+# move the extracted dataset under the recipes folder
+mv VCTK $RUN_DIR/recipes/vctk/
+rm VCTK-Corpus-0.92.zip
diff --git a/recipes/vctk/fast_pitch/train_fast_pitch.py b/recipes/vctk/fast_pitch/train_fast_pitch.py
new file mode 100644
index 00000000..f40587e0
--- /dev/null
+++ b/recipes/vctk/fast_pitch/train_fast_pitch.py
@@ -0,0 +1,80 @@
+import os
+
+from TTS.config import BaseAudioConfig, BaseDatasetConfig
+from TTS.trainer import Trainer, TrainingArgs
+from TTS.tts.configs.fast_pitch_config import FastPitchConfig
+from TTS.tts.datasets import load_tts_samples
+from TTS.tts.models.forward_tts import ForwardTTS
+from TTS.tts.utils.speakers import SpeakerManager
+from TTS.utils.audio import AudioProcessor
+
+output_path = os.path.dirname(os.path.abspath(__file__))
+dataset_config = BaseDatasetConfig(name="vctk", meta_file_train="", path=os.path.join(output_path, "../VCTK/"))
+
+audio_config = BaseAudioConfig(
+    sample_rate=22050,
+    do_trim_silence=True,
+    trim_db=23.0,
+    signal_norm=False,
+    mel_fmin=0.0,
+    mel_fmax=8000,
+    spec_gain=1.0,
+    log_func="np.log",
+    ref_level_db=20,
+    preemphasis=0.0,
+)
+
+config = FastPitchConfig(
+    run_name="fast_pitch_vctk",
+    audio=audio_config,
+    batch_size=32,
+    eval_batch_size=16,
+    num_loader_workers=8,
+    num_eval_loader_workers=4,
+    compute_input_seq_cache=True,
+    compute_f0=True,
+    f0_cache_path=os.path.join(output_path, "f0_cache"),
+    run_eval=True,
+    test_delay_epochs=-1,
+    epochs=1000,
+    text_cleaner="english_cleaners",
+    use_phonemes=True,
+    use_espeak_phonemes=False,
+    phoneme_language="en-us",
+    phoneme_cache_path=os.path.join(output_path, "phoneme_cache"),
+    print_step=50,
+    print_eval=False,
+    mixed_precision=False,
+    sort_by_audio_len=True,
+    max_seq_len=500000,
+    output_path=output_path,
+    datasets=[dataset_config],
+    use_speaker_embedding=True,
+)
+
+# init audio processor
+ap = AudioProcessor(**config.audio)
+
+# load training samples
+train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True)
+
+# init speaker manager for multi-speaker training
+# it maps speaker-id to speaker-name in the model and data-loader
+speaker_manager = SpeakerManager()
+speaker_manager.set_speaker_ids_from_data(train_samples + eval_samples)
+config.model_args.num_speakers = speaker_manager.num_speakers
+
+# init model
+model = ForwardTTS(config, speaker_manager)
+
+# init the trainer and 🚀
+trainer = Trainer(
+    TrainingArgs(),
+    config,
+    output_path,
+    model=model,
+    train_samples=train_samples,
+    eval_samples=eval_samples,
+    training_assets={"audio_processor": ap},
+)
+trainer.fit()
diff --git a/recipes/vctk/fast_speech/train_fast_speech.py b/recipes/vctk/fast_speech/train_fast_speech.py
new file mode 100644
index 00000000..b2988809
--- /dev/null
+++ b/recipes/vctk/fast_speech/train_fast_speech.py
@@ -0,0 +1,80 @@
+import os
+
+from TTS.config import BaseAudioConfig, BaseDatasetConfig
+from TTS.trainer import Trainer, TrainingArgs
+from TTS.tts.configs.fast_speech_config import FastSpeechConfig
+from TTS.tts.datasets import load_tts_samples
+from TTS.tts.models.forward_tts import ForwardTTS
+from TTS.tts.utils.speakers import SpeakerManager
+from TTS.utils.audio import AudioProcessor
+
+output_path = os.path.dirname(os.path.abspath(__file__))
+dataset_config = BaseDatasetConfig(name="vctk", meta_file_train="", path=os.path.join(output_path, "../VCTK/"))
+
+audio_config = BaseAudioConfig(
+    sample_rate=22050,
+    do_trim_silence=True,
+    trim_db=23.0,
+    signal_norm=False,
+    mel_fmin=0.0,
+    mel_fmax=8000,
+    spec_gain=1.0,
+    log_func="np.log",
+    ref_level_db=20,
+    preemphasis=0.0,
+)
+
+config = FastSpeechConfig(
+    run_name="fast_speech_vctk",
+    audio=audio_config,
+    batch_size=32,
+    eval_batch_size=16,
+    num_loader_workers=8,
+    num_eval_loader_workers=4,
+    compute_input_seq_cache=True,
+    compute_f0=True,
+    f0_cache_path=os.path.join(output_path, "f0_cache"),
+    run_eval=True,
+    test_delay_epochs=-1,
+    epochs=1000,
+    text_cleaner="english_cleaners",
+    use_phonemes=True,
+    use_espeak_phonemes=False,
+    phoneme_language="en-us",
+    phoneme_cache_path=os.path.join(output_path, "phoneme_cache"),
+    print_step=50,
+    print_eval=False,
+    mixed_precision=False,
+    sort_by_audio_len=True,
+    max_seq_len=500000,
+    output_path=output_path,
+    datasets=[dataset_config],
+    use_speaker_embedding=True,
+)
+
+# init audio processor
+ap = AudioProcessor(**config.audio)
+
+# load training samples
+train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True)
+
+# init speaker manager for multi-speaker training
+# it maps speaker-id to speaker-name in the model and data-loader
+speaker_manager = SpeakerManager()
+speaker_manager.set_speaker_ids_from_data(train_samples + eval_samples)
+config.model_args.num_speakers = speaker_manager.num_speakers
+
+# init model
+model = ForwardTTS(config, speaker_manager)
+
+# init the trainer and 🚀
+trainer = Trainer(
+    TrainingArgs(),
+    config,
+    output_path,
+    model=model,
+    train_samples=train_samples,
+    eval_samples=eval_samples,
+    training_assets={"audio_processor": ap},
+)
+trainer.fit()
diff --git a/recipes/vctk/glow_tts/train_glow_tts.py b/recipes/vctk/glow_tts/train_glow_tts.py
new file mode 100644
index 00000000..da8872db
--- /dev/null
+++ b/recipes/vctk/glow_tts/train_glow_tts.py
@@ -0,0 +1,62 @@
+import os
+
+from TTS.config.shared_configs import BaseAudioConfig
+from TTS.trainer import Trainer, TrainingArgs
+from TTS.tts.configs.glow_tts_config import GlowTTSConfig
+from TTS.tts.configs.shared_configs import BaseDatasetConfig
+from TTS.tts.datasets import load_tts_samples
+from TTS.tts.models.glow_tts import GlowTTS
+from TTS.tts.utils.speakers import SpeakerManager
+from TTS.utils.audio import AudioProcessor
+
+output_path = os.path.dirname(os.path.abspath(__file__))
+dataset_config = BaseDatasetConfig(name="vctk", meta_file_train="", path=os.path.join(output_path, "../VCTK/"))
+
+audio_config = BaseAudioConfig(sample_rate=22050, do_trim_silence=True, trim_db=23.0)
+
+config = GlowTTSConfig(
+    batch_size=64,
+    eval_batch_size=16,
+    num_loader_workers=4,
+    num_eval_loader_workers=4,
+    run_eval=True,
+    test_delay_epochs=-1,
+    epochs=1000,
+    text_cleaner="phoneme_cleaners",
+    use_phonemes=True,
+    phoneme_language="en-us",
+    phoneme_cache_path=os.path.join(output_path, "phoneme_cache"),
+    print_step=25,
+    print_eval=False,
+    mixed_precision=True,
+    output_path=output_path,
+    datasets=[dataset_config],
+    use_speaker_embedding=True,
+)
+
+# init audio processor
+ap = AudioProcessor(**config.audio.to_dict())
+
+# load training samples
+train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True)
+
+# init speaker manager for multi-speaker training
+# it maps speaker-id to speaker-name in the model and data-loader
+speaker_manager = SpeakerManager()
+speaker_manager.set_speaker_ids_from_data(train_samples + eval_samples)
+config.num_speakers = speaker_manager.num_speakers
+
+# init model
+model = GlowTTS(config, speaker_manager)
+
+# init the trainer and 🚀
+trainer = Trainer(
+    TrainingArgs(),
+    config,
+    output_path,
+    model=model,
+    train_samples=train_samples,
+    eval_samples=eval_samples,
+    training_assets={"audio_processor": ap},
+)
+trainer.fit()
diff --git a/recipes/vctk/speedy_speech/train_speedy_speech.py b/recipes/vctk/speedy_speech/train_speedy_speech.py
new file mode 100644
index 00000000..81f78d26
--- /dev/null
+++ b/recipes/vctk/speedy_speech/train_speedy_speech.py
@@ -0,0 +1,80 @@
+import os
+
+from TTS.config import BaseAudioConfig, BaseDatasetConfig
+from TTS.trainer import Trainer, TrainingArgs
+from TTS.tts.configs.speedy_speech_config import SpeedySpeechConfig
+from TTS.tts.datasets import load_tts_samples
+from TTS.tts.models.forward_tts import ForwardTTS
+from TTS.tts.utils.speakers import SpeakerManager
+from TTS.utils.audio import AudioProcessor
+
+output_path = os.path.dirname(os.path.abspath(__file__))
+dataset_config = BaseDatasetConfig(name="vctk", meta_file_train="", path=os.path.join(output_path, "../VCTK/"))
+
+audio_config = BaseAudioConfig(
+    sample_rate=22050,
+    do_trim_silence=True,
+    trim_db=23.0,
+    signal_norm=False,
+    mel_fmin=0.0,
+    mel_fmax=8000,
+    spec_gain=1.0,
+    log_func="np.log",
+    ref_level_db=20,
+    preemphasis=0.0,
+)
+
+config = SpeedySpeechConfig(
+    run_name="speedy_speech_vctk",
+    audio=audio_config,
+    batch_size=32,
+    eval_batch_size=16,
+    num_loader_workers=8,
+    num_eval_loader_workers=4,
+    compute_input_seq_cache=True,
+    compute_f0=True,
+    f0_cache_path=os.path.join(output_path, "f0_cache"),
+    run_eval=True,
+    test_delay_epochs=-1,
+    epochs=1000,
+    text_cleaner="english_cleaners",
+    use_phonemes=True,
+    use_espeak_phonemes=False,
+    phoneme_language="en-us",
+    phoneme_cache_path=os.path.join(output_path, "phoneme_cache"),
+    print_step=50,
+    print_eval=False,
+    mixed_precision=False,
+    sort_by_audio_len=True,
+    max_seq_len=500000,
+    output_path=output_path,
+    datasets=[dataset_config],
+    use_speaker_embedding=True,
+)
+
+# init audio processor
+ap = AudioProcessor(**config.audio)
+
+# load training samples
+train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True)
+
+# init speaker manager for multi-speaker training
+# it maps speaker-id to speaker-name in the model and data-loader
+speaker_manager = SpeakerManager()
+speaker_manager.set_speaker_ids_from_data(train_samples + eval_samples)
+config.model_args.num_speakers = speaker_manager.num_speakers
+
+# init model
+model = ForwardTTS(config, speaker_manager)
+
+# init the trainer and 🚀
+trainer = Trainer(
+    TrainingArgs(),
+    config,
+    output_path,
+    model=model,
+    train_samples=train_samples,
+    eval_samples=eval_samples,
+    training_assets={"audio_processor": ap},
+)
+trainer.fit()
diff --git a/recipes/vctk/tacotron-DDC/train_tacotron-DDC.py b/recipes/vctk/tacotron-DDC/train_tacotron-DDC.py
new file mode 100644
index 00000000..b0030f17
--- /dev/null
+++ b/recipes/vctk/tacotron-DDC/train_tacotron-DDC.py
@@ -0,0 +1,80 @@
+import os
+
+from TTS.config.shared_configs import BaseAudioConfig
+from TTS.trainer import Trainer, TrainingArgs
+from TTS.tts.configs.shared_configs import BaseDatasetConfig
+from TTS.tts.configs.tacotron_config import TacotronConfig
+from TTS.tts.datasets import load_tts_samples
+from TTS.tts.models.tacotron import Tacotron
+from TTS.tts.utils.speakers import SpeakerManager
+from TTS.utils.audio import AudioProcessor
+
+output_path = os.path.dirname(os.path.abspath(__file__))
+dataset_config = BaseDatasetConfig(name="vctk", meta_file_train="", path=os.path.join(output_path, "../VCTK/"))
+
+audio_config = BaseAudioConfig(
+    sample_rate=22050,
+    resample=True, # Resample to 22050 Hz. It slows down training. Use `TTS/bin/resample.py` to pre-resample and set this to False for faster training.
+    do_trim_silence=True,
+    trim_db=23.0,
+    signal_norm=False,
+    mel_fmin=0.0,
+    mel_fmax=8000,
+    spec_gain=1.0,
+    log_func="np.log",
+    ref_level_db=20,
+    preemphasis=0.0,
+)
+
+config = TacotronConfig( # This is the config that is saved for future use
+    audio=audio_config,
+    batch_size=48,
+    eval_batch_size=16,
+    num_loader_workers=4,
+    num_eval_loader_workers=4,
+    run_eval=True,
+    test_delay_epochs=-1,
+    r=6,
+    gradual_training=[[0, 6, 48], [10000, 4, 32], [50000, 3, 32], [100000, 2, 32]],
+    double_decoder_consistency=True,
+    epochs=1000,
+    text_cleaner="phoneme_cleaners",
+    use_phonemes=True,
+    phoneme_language="en-us",
+    phoneme_cache_path=os.path.join(output_path, "phoneme_cache"),
+    print_step=25,
+    print_eval=False,
+    mixed_precision=True,
+    sort_by_audio_len=True,
+    min_seq_len=0,
+    max_seq_len=44000 * 10, # 44k is the original sampling rate before resampling, corresponds to 10 seconds of audio
+    output_path=output_path,
+    datasets=[dataset_config],
+    use_speaker_embedding=True, # set this to enable multi-speaker training
+)
+
+# init audio processor
+ap = AudioProcessor(**config.audio.to_dict())
+
+# load training samples
+train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True)
+
+# init speaker manager for multi-speaker training
+# it mainly handles speaker-id to speaker-name for the model and the data-loader
+speaker_manager = SpeakerManager()
+speaker_manager.set_speaker_ids_from_data(train_samples + eval_samples)
+
+# init model
+model = Tacotron(config, speaker_manager)
+
+# init the trainer and 🚀
+trainer = Trainer(
+    TrainingArgs(),
+    config,
+    output_path,
+    model=model,
+    train_samples=train_samples,
+    eval_samples=eval_samples,
+    training_assets={"audio_processor": ap},
+)
+trainer.fit()
diff --git a/recipes/vctk/tacotron2-DDC/train_tacotron2-ddc.py b/recipes/vctk/tacotron2-DDC/train_tacotron2-ddc.py
new file mode 100644
index 00000000..346d650b
--- /dev/null
+++ b/recipes/vctk/tacotron2-DDC/train_tacotron2-ddc.py
@@ -0,0 +1,87 @@
+import os
+
+from TTS.config.shared_configs import BaseAudioConfig
+from TTS.trainer import Trainer, TrainingArgs
+from TTS.tts.configs.shared_configs import BaseDatasetConfig
+from TTS.tts.configs.tacotron2_config import Tacotron2Config
+from TTS.tts.datasets import load_tts_samples
+from TTS.tts.models.tacotron2 import Tacotron2
+from TTS.tts.utils.speakers import SpeakerManager
+from TTS.utils.audio import AudioProcessor
+
+output_path = os.path.dirname(os.path.abspath(__file__))
+dataset_config = BaseDatasetConfig(name="vctk", meta_file_train="", path=os.path.join(output_path, "../VCTK/"))
+
+audio_config = BaseAudioConfig(
+    sample_rate=22050,
+    resample=False, # Resample to 22050 Hz. It slows down training. Use `TTS/bin/resample.py` to pre-resample and set this to False for faster training.
+    do_trim_silence=True,
+    trim_db=23.0,
+    signal_norm=False,
+    mel_fmin=0.0,
+    mel_fmax=8000,
+    spec_gain=1.0,
+    log_func="np.log",
+    preemphasis=0.0,
+)
+
+config = Tacotron2Config( # This is the config that is saved for future use
+    audio=audio_config,
+    batch_size=32,
+    eval_batch_size=16,
+    num_loader_workers=4,
+    num_eval_loader_workers=4,
+    run_eval=True,
+    test_delay_epochs=-1,
+    r=2,
+    # gradual_training=[[0, 6, 48], [10000, 4, 32], [50000, 3, 32], [100000, 2, 32]],
+    double_decoder_consistency=False,
+    epochs=1000,
+    text_cleaner="phoneme_cleaners",
+    use_phonemes=True,
+    phoneme_language="en-us",
+    phoneme_cache_path=os.path.join(output_path, "phoneme_cache"),
+    print_step=150,
+    print_eval=False,
+    mixed_precision=True,
+    sort_by_audio_len=True,
+    min_seq_len=14800,
+    max_seq_len=22050 * 10, # corresponds to 10 seconds of audio at the 22050 Hz sampling rate
+    output_path=output_path,
+    datasets=[dataset_config],
+    use_speaker_embedding=True, # set this to enable multi-speaker training
+    decoder_ssim_alpha=0.0, # disable ssim losses that cause NaN for some runs.
+    postnet_ssim_alpha=0.0,
+    postnet_diff_spec_alpha=0.0,
+    decoder_diff_spec_alpha=0.0,
+    attention_norm="softmax",
+    optimizer="Adam",
+    lr_scheduler=None,
+    lr=3e-5,
+)
+
+# init audio processor
+ap = AudioProcessor(**config.audio.to_dict())
+
+# load training samples
+train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True)
+
+# init speaker manager for multi-speaker training
+# it mainly handles speaker-id to speaker-name for the model and the data-loader
+speaker_manager = SpeakerManager()
+speaker_manager.set_speaker_ids_from_data(train_samples + eval_samples)
+
+# init model
+model = Tacotron2(config, speaker_manager)
+
+# init the trainer and 🚀
+trainer = Trainer(
+    TrainingArgs(),
+    config,
+    output_path,
+    model=model,
+    train_samples=train_samples,
+    eval_samples=eval_samples,
+    training_assets={"audio_processor": ap},
+)
+trainer.fit()
diff --git a/recipes/vctk/vits/train_vits.py b/recipes/vctk/vits/train_vits.py
new file mode 100644
index 00000000..19074ce3
--- /dev/null
+++ b/recipes/vctk/vits/train_vits.py
@@ -0,0 +1,86 @@
+import os
+
+from TTS.config.shared_configs import BaseAudioConfig
+from TTS.trainer import Trainer, TrainingArgs
+from TTS.tts.configs.shared_configs import BaseDatasetConfig
+from TTS.tts.configs.vits_config import VitsConfig
+from TTS.tts.datasets import load_tts_samples
+from TTS.tts.models.vits import Vits
+from TTS.tts.utils.speakers import SpeakerManager
+from TTS.utils.audio import AudioProcessor
+
+output_path = os.path.dirname(os.path.abspath(__file__))
+dataset_config = BaseDatasetConfig(name="vctk", meta_file_train="", path=os.path.join(output_path, "../VCTK/"))
+
+
+audio_config = BaseAudioConfig(
+    sample_rate=22050,
+    win_length=1024,
+    hop_length=256,
+    num_mels=80,
+    preemphasis=0.0,
+    ref_level_db=20,
+    log_func="np.log",
+    do_trim_silence=True,
+    trim_db=23.0,
+    mel_fmin=0,
+    mel_fmax=None,
+    spec_gain=1.0,
+    signal_norm=False,
+    do_amp_to_db_linear=False,
+    resample=True,
+)
+
+config = VitsConfig(
+    audio=audio_config,
+    run_name="vits_vctk",
+    use_speaker_embedding=True,
+    batch_size=32,
+    eval_batch_size=16,
+    batch_group_size=5,
+    num_loader_workers=4,
+    num_eval_loader_workers=4,
+    run_eval=True,
+    test_delay_epochs=-1,
+    epochs=1000,
+    text_cleaner="english_cleaners",
+    use_phonemes=True,
+    phoneme_language="en-us",
+    phoneme_cache_path=os.path.join(output_path, "phoneme_cache"),
+    compute_input_seq_cache=True,
+    print_step=25,
+    print_eval=False,
+    mixed_precision=True,
+    sort_by_audio_len=True,
+    min_seq_len=32 * 256 * 4,
+    max_seq_len=1500000,
+    output_path=output_path,
+    datasets=[dataset_config],
+)
+
+# init audio processor
+ap = AudioProcessor(**config.audio.to_dict())
+
+# load training samples
+train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True)
+
+# init speaker manager for multi-speaker training
+# it maps speaker-id to speaker-name in the model and data-loader
+speaker_manager = SpeakerManager()
+speaker_manager.set_speaker_ids_from_data(train_samples + eval_samples)
+config.model_args.num_speakers = speaker_manager.num_speakers
+
+# init model
+model = Vits(config, speaker_manager)
+
+# init the trainer and 🚀
+trainer = Trainer(
+    TrainingArgs(),
+    config,
+    output_path,
+    model=model,
+    train_samples=train_samples,
+    eval_samples=eval_samples,
+    training_assets={"audio_processor": ap},
+)
+trainer.fit()
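
Usage note: each recipe is a self-contained training script. A minimal sketch of how they are meant to be run, assuming a working TTS installation and that the extracted corpus ends up under recipes/vctk/VCTK/, which is where every recipe's dataset_config points:

    # fetch and extract VCTK 0.92; check that the data lands in recipes/vctk/VCTK/
    bash recipes/vctk/download_vctk.sh
    # then launch any of the recipes directly, e.g. multi-speaker VITS or GlowTTS
    python recipes/vctk/vits/train_vits.py
    python recipes/vctk/glow_tts/train_glow_tts.py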