From d7c2a8e834deb80897ccf746728c22712fab60d1 Mon Sep 17 00:00:00 2001
From: Edresson Casanova
Date: Fri, 9 Dec 2022 18:19:31 -0300
Subject: [PATCH] Add automatic download and speaker embedding computation for
 YourTTS VCTK recipe

---
 recipes/vctk/yourtts/train_yourtts.py | 79 ++++++++++++++++++++++-----
 1 file changed, 64 insertions(+), 15 deletions(-)

diff --git a/recipes/vctk/yourtts/train_yourtts.py b/recipes/vctk/yourtts/train_yourtts.py
index 32a46024..b226880e 100644
--- a/recipes/vctk/yourtts/train_yourtts.py
+++ b/recipes/vctk/yourtts/train_yourtts.py
@@ -3,10 +3,13 @@ import os
 import torch
 from trainer import Trainer, TrainerArgs
 
+from TTS.bin.compute_embeddings import compute_embeddings
+from TTS.bin.resample import resample_files
 from TTS.config.shared_configs import BaseDatasetConfig
 from TTS.tts.configs.vits_config import VitsConfig
 from TTS.tts.datasets import load_tts_samples
 from TTS.tts.models.vits import Vits, VitsArgs, VitsAudioConfig
+from TTS.utils.downloaders import download_vctk
 
 torch.set_num_threads(24)
 
@@ -15,8 +18,10 @@ torch.set_num_threads(24)
     This recipe replicates the first experiment proposed in the YourTTS paper (https://arxiv.org/abs/2112.02418).
     YourTTS model is based on the VITS model however it uses external speaker embeddings extracted from a pre-trained speaker encoder and has small architecture changes.
     In addition, YourTTS can be trained in multilingual data, however, this recipe replicates the single language training using the VCTK dataset.
-    The VitsArgs instance has commented parameters used to enable the multilingual training.
+    If you are interested in multilingual training, the VitsArgs instance below has commented-out parameters that should be enabled for multilingual training.
+    In addition, you will need to add the extra datasets, following the VCTK dataset config below as an example.
 """
+CURRENT_PATH = os.path.dirname(os.path.abspath(__file__))
 
 # Name of the run for the Trainer
 RUN_NAME = "YourTTS-EN-VCTK"
@@ -25,7 +30,7 @@ RUN_NAME = "YourTTS-EN-VCTK"
 OUT_PATH = os.path.dirname(os.path.abspath(__file__))  # "/raid/coqui/Checkpoints/original-YourTTS/"
 
 # If you want to do transfer learning and speedup your training you can set here the path to the original YourTTS model
-RESTORE_PATH = None  # "/raid/coqui/Checkpoints/YourTTS/checkpoint.pth"
+RESTORE_PATH = None  # "/root/.local/share/tts/tts_models--multilingual--multi-dataset--your_tts/model_file.pth"
 
 # This paramter is usefull to debug, it skips the training epochs and just do the evaluation and produce the test sentences
 SKIP_TRAIN_EPOCH = False
@@ -33,24 +38,61 @@ SKIP_TRAIN_EPOCH = False
 # Set here the batch size to be used in training and evaluation
 BATCH_SIZE = 32
 
-# To get the speakers.json or speakers.pth you need to follow the steps described at: https://github.com/Edresson/YourTTS#reproducibility
-# or you can check the extract embedding script guidelines here: https://github.com/coqui-ai/TTS/blob/dev/TTS/bin/compute_embeddings.py#L20
-D_VECTOR_FILES = [
-    "/raid/datasets/VCTK/speakers.json",
-]
+# Training sampling rate and the target sampling rate used to resample the downloaded dataset (Note: if you change this you might need to redownload the dataset !!)
+# Note: if you add new datasets, please make sure that the dataset sampling rate and this parameter match, otherwise resample your audios
+SAMPLE_RATE = 16000
+
+### Download VCTK dataset
+VCTK_DOWNLOAD_PATH = os.path.join(CURRENT_PATH, "VCTK")
+# Define the number of threads used during the audio resampling
+NUM_RESAMPLE_THREADS = 10
+# Check if the VCTK dataset is already downloaded; if not, download and resample it
+if not os.path.exists(VCTK_DOWNLOAD_PATH):
+    print(">>> Downloading VCTK dataset:")
+    download_vctk(VCTK_DOWNLOAD_PATH)
+    resample_files(VCTK_DOWNLOAD_PATH, SAMPLE_RATE, file_ext="flac", n_jobs=NUM_RESAMPLE_THREADS)
 
-# Change our dataset paths to the VCTK dataset or replace it for others
 # init configs
 vctk_config = BaseDatasetConfig(
-    formatter="vctk", dataset_name="vctk", meta_file_train="metadata.csv", path="/raid/datasets/VCTK/", language="en"
+    formatter="vctk", dataset_name="vctk", meta_file_train="", path=VCTK_DOWNLOAD_PATH, language="en"
 )
 
-# add here all datasets configs, in our case we just want to train with the VCTK dataset then we need to add just VCTK
-datasets_list = [vctk_config]
+# Add here all dataset configs; in our case we just want to train with the VCTK dataset, so we only add VCTK. Note: if you want to add new datasets, just add them here and the speaker embeddings (d-vectors) will be computed automatically for each new dataset :)
+DATASETS_CONFIG_LIST = [vctk_config]
 
-# Audio config used in training. Please: Check if your dataset sampling rate and the parameter sample_rate here are matching, otherwise resample your audios
+### Extract speaker embeddings
+SPEAKER_ENCODER_CHECKPOINT_PATH = (
+    "https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/model_se.pth.tar"
+)
+SPEAKER_ENCODER_CONFIG_PATH = "https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/config_se.json"
+
+D_VECTOR_FILES = []  # List of speaker embedding/d-vector files to be used during training
+
+# Iterate over all the dataset configs and check if the speaker embeddings are already computed; if not, compute them
+for dataset_conf in DATASETS_CONFIG_LIST:
+    # Check if the embeddings were already computed; if not, compute them
+    embeddings_file = os.path.join(dataset_conf.path, "speakers.pth")
+    if not os.path.isfile(embeddings_file):
+        print(f">>> Computing the speaker embeddings for the {dataset_conf.dataset_name} dataset")
+        compute_embeddings(
+            SPEAKER_ENCODER_CHECKPOINT_PATH,
+            SPEAKER_ENCODER_CONFIG_PATH,
+            embeddings_file,
+            old_spakers_file=None,
+            config_dataset_path=None,
+            formatter_name=dataset_conf.formatter,
+            dataset_name=dataset_conf.dataset_name,
+            dataset_path=dataset_conf.path,
+            meta_file_train=dataset_conf.meta_file_train,
+            disable_cuda=False,
+            no_eval=False,
+        )
+    D_VECTOR_FILES.append(embeddings_file)
+
+
+# Audio config used in training.
 audio_config = VitsAudioConfig(
-    sample_rate=22050,
+    sample_rate=SAMPLE_RATE,
     hop_length=256,
     win_length=1024,
     fft_size=1024,
@@ -65,7 +107,12 @@ model_args = VitsArgs(
     use_d_vector_file=True,
     d_vector_dim=512,
     num_layers_text_encoder=10,
-    # usefull parameters to the enable multilingual training
+    resblock_type_decoder="2",  # In the paper, we accidentally trained YourTTS using ResNet blocks of type 2; if you prefer, you can use ResNet blocks of type 1 as in the VITS model
+    # Useful parameters to enable the Speaker Consistency Loss (SCL) described in the paper
+    # use_speaker_encoder_as_loss=True,
+    # speaker_encoder_model_path=SPEAKER_ENCODER_CHECKPOINT_PATH,
+    # speaker_encoder_config_path=SPEAKER_ENCODER_CONFIG_PATH,
+    # Useful parameters to enable multilingual training
     # use_language_embedding=True,
     # embedded_language_dim=4,
 )
@@ -104,7 +151,7 @@ config = VitsConfig(
     phoneme_cache_path=None,
     precompute_num_workers=12,
     start_by_longest=True,
-    datasets=datasets_list,
+    datasets=DATASETS_CONFIG_LIST,
     cudnn_benchmark=False,
     max_audio_len=220500,  # it should be: sampling rate * max audio in sec. So it is 22050 * 10 = 220500
     mixed_precision=False,
@@ -144,6 +191,8 @@ config = VitsConfig(
     use_weighted_sampler=True,
     # Ensures that all speakers are seen in the training batch equally no matter how many samples each speaker has
     weighted_sampler_attrs={"speaker_name": 1.0},
+    # Set the Speaker Consistency Loss (SCL) α to 9 as in the paper
+    speaker_encoder_loss_alpha=9.0,
 )
 
 # Load all the datasets samples and split traning and evaluation sets
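
Note (not part of the patch): the embedding loop introduced above writes one speakers.pth per entry in DATASETS_CONFIG_LIST, so extending the recipe to more data only requires another BaseDatasetConfig inside train_yourtts.py. A minimal sketch follows; the formatter name, dataset path, and metadata file are placeholders to adapt to your own corpus, and the audios are assumed to already be at SAMPLE_RATE (16 kHz).

# Hypothetical second dataset, added next to vctk_config in train_yourtts.py.
# Placeholder values: adjust formatter/dataset_name/path/meta_file_train to your data,
# and resample the audios to SAMPLE_RATE first if needed.
second_dataset_config = BaseDatasetConfig(
    formatter="libri_tts",      # placeholder; must match a formatter known to TTS.tts.datasets
    dataset_name="libri_tts",
    meta_file_train="",
    path="/path/to/LibriTTS/",  # placeholder path
    language="en",
)
DATASETS_CONFIG_LIST = [vctk_config, second_dataset_config]
# The embedding loop will then also write /path/to/LibriTTS/speakers.pth and append it to D_VECTOR_FILES.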
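
For reference, enabling the Speaker Consistency Loss (SCL) that this patch wires in amounts to uncommenting the three speaker-encoder parameters in VitsArgs; the sketch below only restates options already present in the patch, and any other VitsArgs fields the recipe sets should be kept unchanged.

# Sketch: SCL-enabled VitsArgs for train_yourtts.py (keep the recipe's remaining VitsArgs fields as they are).
model_args = VitsArgs(
    use_d_vector_file=True,
    d_vector_dim=512,
    num_layers_text_encoder=10,
    resblock_type_decoder="2",
    use_speaker_encoder_as_loss=True,                            # turn on SCL
    speaker_encoder_model_path=SPEAKER_ENCODER_CHECKPOINT_PATH,  # pre-trained speaker encoder checkpoint
    speaker_encoder_config_path=SPEAKER_ENCODER_CONFIG_PATH,     # and its config
)
# speaker_encoder_loss_alpha=9.0 in VitsConfig then weights the SCL term as in the paper.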
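
Optionally, the generated embedding file can be sanity checked after the compute step. This is not part of the recipe; the snippet only assumes that speakers.pth deserializes to a Python mapping, since the exact layout may differ across TTS versions.

# Standalone check for the d-vector file produced by the recipe (assumed location: <recipe dir>/VCTK/speakers.pth).
import os
import torch

embeddings_file = os.path.join(os.path.dirname(os.path.abspath(__file__)), "VCTK", "speakers.pth")
data = torch.load(embeddings_file, map_location="cpu")  # newer torch versions may need weights_only=False
print(f"Loaded {type(data).__name__} with {len(data)} entries from {embeddings_file}")
if isinstance(data, dict):
    sample_key = next(iter(data))
    entry = data[sample_key]
    # If entries are dicts carrying an "embedding" field (a common d-vector layout), report its size.
    if isinstance(entry, dict) and "embedding" in entry:
        print(f"Sample key: {sample_key}, d-vector dim: {len(entry['embedding'])}")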