Add parameter for eval metadata file to the compute_embeddings function

Edresson Casanova 2022-12-10 13:49:57 -03:00
parent d7c2a8e834
commit a066e14fb1
2 changed files with 22 additions and 7 deletions

TTS/bin/compute_embeddings.py View File

@@ -22,6 +22,7 @@ def compute_embeddings(
dataset_name=None,
dataset_path=None,
meta_file_train=None,
meta_file_val=None,
disable_cuda=False,
no_eval=False,
):
@@ -35,7 +36,10 @@
c_dataset.formatter = formatter_name
c_dataset.dataset_name = dataset_name
c_dataset.path = dataset_path
c_dataset.meta_file_train = meta_file_train if meta_file_train else None
if meta_file_train is not None:
c_dataset.meta_file_train = meta_file_train
if meta_file_val is not None:
c_dataset.meta_file_val = meta_file_val
meta_data_train, meta_data_eval = load_tts_samples(c_dataset, eval_split=not no_eval)
if meta_data_eval is None:
@@ -92,7 +96,7 @@ if __name__ == "__main__":
Example runs:
python TTS/bin/compute_embeddings.py --model_path speaker_encoder_model.pth --config_path speaker_encoder_config.json --config_dataset_path dataset_config.json
python TTS/bin/compute_embeddings.py --model_path speaker_encoder_model.pth --config_path speaker_encoder_config.json --fomatter vctk --dataset_path /path/to/vctk/dataset --dataset_name my_vctk --metafile /path/to/vctk/metafile.csv
python TTS/bin/compute_embeddings.py --model_path speaker_encoder_model.pth --config_path speaker_encoder_config.json --formatter_name coqui --dataset_path /path/to/vctk/dataset --dataset_name my_vctk --meta_file_train /path/to/vctk/metafile_train.csv --meta_file_val /path/to/vctk/metafile_eval.csv
""",
formatter_class=RawTextHelpFormatter,
)
@@ -139,9 +143,15 @@ if __name__ == "__main__":
default=None,
)
parser.add_argument(
"--metafile",
"--meta_file_train",
type=str,
help="Path to the meta file. If not set, dataset formatter uses the default metafile if it is defined in the formatter. You either need to provide this or `config_dataset_path`",
help="Path to the train meta file. If not set, dataset formatter uses the default metafile if it is defined in the formatter. You either need to provide this or `config_dataset_path`",
default=None,
)
parser.add_argument(
"--meta_file_val",
type=str,
help="Path to the evaluation meta file. If not set, dataset formatter uses the default metafile if it is defined in the formatter. You either need to provide this or `config_dataset_path`",
default=None,
)
args = parser.parse_args()
@@ -155,7 +165,8 @@ if __name__ == "__main__":
formatter_name=args.formatter_name,
dataset_name=args.dataset_name,
dataset_path=args.dataset_path,
meta_file_train=args.metafile,
meta_file_train=args.meta_file_train,
meta_file_val=args.meta_file_val,
disable_cuda=args.disable_cuda,
no_eval=args.no_eval,
)
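For orientation, below is a minimal standalone sketch of the pattern the function now follows: the train and eval metafiles are only written into the dataset config when they are provided, and load_tts_samples then builds the eval split from meta_file_val. The import paths and the placeholder file paths are assumptions of this sketch, not part of the diff.

from TTS.config.shared_configs import BaseDatasetConfig  # assumed import path
from TTS.tts.datasets import load_tts_samples  # assumed import path

# Placeholder dataset description; the paths are illustrative only.
c_dataset = BaseDatasetConfig()
c_dataset.formatter = "coqui"
c_dataset.dataset_name = "my_vctk"
c_dataset.path = "/path/to/vctk/dataset"

meta_file_train = "/path/to/vctk/metafile_train.csv"
meta_file_val = "/path/to/vctk/metafile_eval.csv"

# Mirror of the new logic: the config is only overridden when a metafile is
# actually given, so formatters with a built-in default metafile keep working.
if meta_file_train is not None:
    c_dataset.meta_file_train = meta_file_train
if meta_file_val is not None:
    c_dataset.meta_file_val = meta_file_val

# With eval_split=True, load_tts_samples returns (train_samples, eval_samples);
# the eval samples now come from meta_file_val when it is set.
meta_data_train, meta_data_eval = load_tts_samples(c_dataset, eval_split=True)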

View File

@@ -42,6 +42,9 @@ BATCH_SIZE = 32
# Note: If you add new datasets, please make sure that the dataset sampling rate matches this parameter; otherwise, resample your audio
SAMPLE_RATE = 16000
# Max audio length in seconds used in training (any audio longer than this will be ignored)
MAX_AUDIO_LEN_IN_SECONDS = 10
### Download VCTK dataset
VCTK_DOWNLOAD_PATH = os.path.join(CURRENT_PATH, "VCTK")
# Define the number of threads used during the audio resampling
@@ -54,7 +57,7 @@ if not os.path.exists(VCTK_DOWNLOAD_PATH):
# init configs
vctk_config = BaseDatasetConfig(
formatter="vctk", dataset_name="vctk", meta_file_train="", path=VCTK_DOWNLOAD_PATH, language="en"
formatter="vctk", dataset_name="vctk", meta_file_train="", meta_file_val="", path=VCTK_DOWNLOAD_PATH, language="en"
)
# Add here all dataset configs. In our case we just want to train with the VCTK dataset, so we only need to add VCTK. Note: if you want to add new datasets, just add them here and the speaker embeddings (d-vectors) will be computed automatically for each new dataset :)
@@ -84,6 +87,7 @@ for dataset_conf in DATASETS_CONFIG_LIST:
dataset_name=dataset_conf.dataset_name,
dataset_path=dataset_conf.path,
meta_file_train=dataset_conf.meta_file_train,
meta_file_val=dataset_conf.meta_file_val,
disable_cuda=False,
no_eval=False,
)
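As a usage note, a recipe can now give each dataset its own evaluation metafile and the loop above forwards it to compute_embeddings. A hedged sketch follows, with a placeholder dataset name and paths that are not part of this commit.

from TTS.config.shared_configs import BaseDatasetConfig  # assumed import path

# Hypothetical extra dataset config; appending it to DATASETS_CONFIG_LIST would
# make the loop above compute its speaker embeddings using both metafiles.
my_dataset_config = BaseDatasetConfig(
    formatter="coqui",
    dataset_name="my_dataset",
    meta_file_train="/path/to/my_dataset/metadata_train.csv",
    meta_file_val="/path/to/my_dataset/metadata_eval.csv",
    path="/path/to/my_dataset",
    language="en",
)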
@@ -153,7 +157,7 @@ config = VitsConfig(
start_by_longest=True,
datasets=DATASETS_CONFIG_LIST,
cudnn_benchmark=False,
max_audio_len=220500, # it should be: sampling rate * max audio in sec. So it is 22050 * 10 = 220500
max_audio_len=SAMPLE_RATE * MAX_AUDIO_LEN_IN_SECONDS,
mixed_precision=False,
test_sentences=[
[
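For reference: with the constants defined earlier in this recipe, the new expression evaluates to 16000 * 10 = 160000 samples, whereas the previously hard-coded 220500 corresponds to the 22050 Hz rate mentioned in the old comment rather than the 16000 Hz SAMPLE_RATE actually used here.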