From a066e14fb1cf815268214fa08c42768c7eee3884 Mon Sep 17 00:00:00 2001 From: Edresson Casanova Date: Sat, 10 Dec 2022 13:49:57 -0300 Subject: [PATCH] Add parameter for eval metadata file on compute embeddings function --- TTS/bin/compute_embeddings.py | 21 ++++++++++++++++----- recipes/vctk/yourtts/train_yourtts.py | 8 ++++++-- 2 files changed, 22 insertions(+), 7 deletions(-) diff --git a/TTS/bin/compute_embeddings.py b/TTS/bin/compute_embeddings.py index ace6deef..7e0932cc 100644 --- a/TTS/bin/compute_embeddings.py +++ b/TTS/bin/compute_embeddings.py @@ -22,6 +22,7 @@ def compute_embeddings( dataset_name=None, dataset_path=None, meta_file_train=None, + meta_file_val=None, disable_cuda=False, no_eval=False, ): @@ -35,7 +36,10 @@ def compute_embeddings( c_dataset.formatter = formatter_name c_dataset.dataset_name = dataset_name c_dataset.path = dataset_path - c_dataset.meta_file_train = meta_file_train if meta_file_train else None + if meta_file_train is not None: + c_dataset.meta_file_train = meta_file_train + if meta_file_val is not None: + c_dataset.meta_file_val = meta_file_val meta_data_train, meta_data_eval = load_tts_samples(c_dataset, eval_split=not no_eval) if meta_data_eval is None: @@ -92,7 +96,7 @@ if __name__ == "__main__": Example runs: python TTS/bin/compute_embeddings.py --model_path speaker_encoder_model.pth --config_path speaker_encoder_config.json --config_dataset_path dataset_config.json - python TTS/bin/compute_embeddings.py --model_path speaker_encoder_model.pth --config_path speaker_encoder_config.json --fomatter vctk --dataset_path /path/to/vctk/dataset --dataset_name my_vctk --metafile /path/to/vctk/metafile.csv + python TTS/bin/compute_embeddings.py --model_path speaker_encoder_model.pth --config_path speaker_encoder_config.json --formatter_name coqui --dataset_path /path/to/vctk/dataset --dataset_name my_vctk --meta_file_train /path/to/vctk/metafile_train.csv --meta_file_val /path/to/vctk/metafile_eval.csv """, 
formatter_class=RawTextHelpFormatter, ) @@ -139,9 +143,15 @@ if __name__ == "__main__": default=None, ) parser.add_argument( - "--metafile", + "--meta_file_train", type=str, - help="Path to the meta file. If not set, dataset formatter uses the default metafile if it is defined in the formatter. You either need to provide this or `config_dataset_path`", + help="Path to the train meta file. If not set, dataset formatter uses the default metafile if it is defined in the formatter. You either need to provide this or `config_dataset_path`", + default=None, + ) + parser.add_argument( + "--meta_file_val", + type=str, + help="Path to the evaluation meta file. If not set, dataset formatter uses the default metafile if it is defined in the formatter. You either need to provide this or `config_dataset_path`", default=None, ) args = parser.parse_args() @@ -155,7 +165,8 @@ if __name__ == "__main__": formatter_name=args.formatter_name, dataset_name=args.dataset_name, dataset_path=args.dataset_path, - meta_file_train=args.metafile, + meta_file_train=args.meta_file_train, + meta_file_val=args.meta_file_val, disable_cuda=args.disable_cuda, no_eval=args.no_eval, ) diff --git a/recipes/vctk/yourtts/train_yourtts.py b/recipes/vctk/yourtts/train_yourtts.py index b226880e..1487a9fc 100644 --- a/recipes/vctk/yourtts/train_yourtts.py +++ b/recipes/vctk/yourtts/train_yourtts.py @@ -42,6 +42,9 @@ BATCH_SIZE = 32 # Note: If you add new datasets, please make sure that the dataset sampling rate and this parameter are matching, otherwise resample your audios SAMPLE_RATE = 16000 +# Max audio length in seconds to be used in training (any audio longer than this will be ignored) +MAX_AUDIO_LEN_IN_SECONDS = 10 + ### Download VCTK dataset VCTK_DOWNLOAD_PATH = os.path.join(CURRENT_PATH, "VCTK") # Define the number of threads used during the audio resampling @@ -54,7 +57,7 @@ if not os.path.exists(VCTK_DOWNLOAD_PATH): # init configs vctk_config = BaseDatasetConfig( - formatter="vctk", 
dataset_name="vctk", meta_file_train="", path=VCTK_DOWNLOAD_PATH, language="en" + formatter="vctk", dataset_name="vctk", meta_file_train="", meta_file_val="", path=VCTK_DOWNLOAD_PATH, language="en" ) # Add here all datasets configs, in our case we just want to train with the VCTK dataset then we need to add just VCTK. Note: If you want to added new datasets just added they here and it will automatically compute the speaker embeddings (d-vectors) for this new dataset :) @@ -84,6 +87,7 @@ for dataset_conf in DATASETS_CONFIG_LIST: dataset_name=dataset_conf.dataset_name, dataset_path=dataset_conf.path, meta_file_train=dataset_conf.meta_file_train, + meta_file_val=dataset_conf.meta_file_val, disable_cuda=False, no_eval=False, ) @@ -153,7 +157,7 @@ config = VitsConfig( start_by_longest=True, datasets=DATASETS_CONFIG_LIST, cudnn_benchmark=False, - max_audio_len=220500, # it should be: sampling rate * max audio in sec. So it is 22050 * 10 = 220500 + max_audio_len=SAMPLE_RATE * MAX_AUDIO_LEN_IN_SECONDS, mixed_precision=False, test_sentences=[ [