Add parameter for eval metadata file to the compute_embeddings function

Edresson Casanova 2022-12-10 13:49:57 -03:00
parent d7c2a8e834
commit a066e14fb1
2 changed files with 22 additions and 7 deletions

TTS/bin/compute_embeddings.py View File

@@ -22,6 +22,7 @@ def compute_embeddings(
dataset_name=None,
dataset_path=None,
meta_file_train=None,
meta_file_val=None,
disable_cuda=False,
no_eval=False,
):
@@ -35,7 +36,10 @@
c_dataset.formatter = formatter_name
c_dataset.dataset_name = dataset_name
c_dataset.path = dataset_path
c_dataset.meta_file_train = meta_file_train if meta_file_train else None
if meta_file_train is not None:
c_dataset.meta_file_train = meta_file_train
if meta_file_val is not None:
c_dataset.meta_file_val = meta_file_val
meta_data_train, meta_data_eval = load_tts_samples(c_dataset, eval_split=not no_eval)
if meta_data_eval is None:
@@ -92,7 +96,7 @@ if __name__ == "__main__":
Example runs:
python TTS/bin/compute_embeddings.py --model_path speaker_encoder_model.pth --config_path speaker_encoder_config.json --config_dataset_path dataset_config.json
python TTS/bin/compute_embeddings.py --model_path speaker_encoder_model.pth --config_path speaker_encoder_config.json --fomatter vctk --dataset_path /path/to/vctk/dataset --dataset_name my_vctk --metafile /path/to/vctk/metafile.csv
python TTS/bin/compute_embeddings.py --model_path speaker_encoder_model.pth --config_path speaker_encoder_config.json --formatter_name coqui --dataset_path /path/to/vctk/dataset --dataset_name my_vctk --meta_file_train /path/to/vctk/metafile_train.csv --meta_file_val /path/to/vctk/metafile_eval.csv
""",
formatter_class=RawTextHelpFormatter,
)
@@ -139,9 +143,15 @@ if __name__ == "__main__":
default=None,
)
parser.add_argument(
"--metafile",
"--meta_file_train",
type=str,
help="Path to the meta file. If not set, dataset formatter uses the default metafile if it is defined in the formatter. You either need to provide this or `config_dataset_path`",
help="Path to the train meta file. If not set, dataset formatter uses the default metafile if it is defined in the formatter. You either need to provide this or `config_dataset_path`",
default=None,
)
parser.add_argument(
"--meta_file_val",
type=str,
help="Path to the evaluation meta file. If not set, dataset formatter uses the default metafile if it is defined in the formatter. You either need to provide this or `config_dataset_path`",
default=None,
)
args = parser.parse_args()
@@ -155,7 +165,8 @@ if __name__ == "__main__":
formatter_name=args.formatter_name,
dataset_name=args.dataset_name,
dataset_path=args.dataset_path,
meta_file_train=args.metafile,
meta_file_train=args.meta_file_train,
meta_file_val=args.meta_file_val,
disable_cuda=args.disable_cuda,
no_eval=args.no_eval,
)
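For orientation, below is a minimal standalone sketch of the pattern the function now follows: the train and eval metafiles are only written into the dataset config when they are provided, and load_tts_samples then builds the eval split from meta_file_val. The import paths and the placeholder file paths are assumptions of this sketch, not part of the diff.

from TTS.config.shared_configs import BaseDatasetConfig  # assumed import path
from TTS.tts.datasets import load_tts_samples  # assumed import path

# Placeholder dataset description; the paths are illustrative only.
c_dataset = BaseDatasetConfig()
c_dataset.formatter = "coqui"
c_dataset.dataset_name = "my_vctk"
c_dataset.path = "/path/to/vctk/dataset"

meta_file_train = "/path/to/vctk/metafile_train.csv"
meta_file_val = "/path/to/vctk/metafile_eval.csv"

# Mirror of the new logic: the config is only overridden when a metafile is
# actually given, so formatters with a built-in default metafile keep working.
if meta_file_train is not None:
    c_dataset.meta_file_train = meta_file_train
if meta_file_val is not None:
    c_dataset.meta_file_val = meta_file_val

# With eval_split=True, load_tts_samples returns (train_samples, eval_samples);
# the eval samples now come from meta_file_val when it is set.
meta_data_train, meta_data_eval = load_tts_samples(c_dataset, eval_split=True)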

View File

@@ -42,6 +42,9 @@ BATCH_SIZE = 32
# Note: If you add new datasets, please make sure that the dataset sampling rate matches this parameter; otherwise, resample your audio
SAMPLE_RATE = 16000
# Max audio length in seconds used in training (any audio longer than this will be ignored)
MAX_AUDIO_LEN_IN_SECONDS = 10
### Download VCTK dataset
VCTK_DOWNLOAD_PATH = os.path.join(CURRENT_PATH, "VCTK")
# Define the number of threads used during the audio resampling
@@ -54,7 +57,7 @@ if not os.path.exists(VCTK_DOWNLOAD_PATH):
# init configs
vctk_config = BaseDatasetConfig(
formatter="vctk", dataset_name="vctk", meta_file_train="", path=VCTK_DOWNLOAD_PATH, language="en"
formatter="vctk", dataset_name="vctk", meta_file_train="", meta_file_val="", path=VCTK_DOWNLOAD_PATH, language="en"
)
# Add here all dataset configs. In our case we just want to train with the VCTK dataset, so we only need to add VCTK. Note: if you want to add new datasets, just add them here and the speaker embeddings (d-vectors) will be computed automatically for each new dataset :)
@@ -84,6 +87,7 @@ for dataset_conf in DATASETS_CONFIG_LIST:
dataset_name=dataset_conf.dataset_name,
dataset_path=dataset_conf.path,
meta_file_train=dataset_conf.meta_file_train,
meta_file_val=dataset_conf.meta_file_val,
disable_cuda=False,
no_eval=False,
)
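As a usage note, a recipe can now give each dataset its own evaluation metafile and the loop above forwards it to compute_embeddings. A hedged sketch follows, with a placeholder dataset name and paths that are not part of this commit.

from TTS.config.shared_configs import BaseDatasetConfig  # assumed import path

# Hypothetical extra dataset config; appending it to DATASETS_CONFIG_LIST would
# make the loop above compute its speaker embeddings using both metafiles.
my_dataset_config = BaseDatasetConfig(
    formatter="coqui",
    dataset_name="my_dataset",
    meta_file_train="/path/to/my_dataset/metadata_train.csv",
    meta_file_val="/path/to/my_dataset/metadata_eval.csv",
    path="/path/to/my_dataset",
    language="en",
)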
@@ -153,7 +157,7 @@ config = VitsConfig(
start_by_longest=True,
datasets=DATASETS_CONFIG_LIST,
cudnn_benchmark=False,
max_audio_len=220500, # it should be: sampling rate * max audio in sec. So it is 22050 * 10 = 220500
max_audio_len=SAMPLE_RATE * MAX_AUDIO_LEN_IN_SECONDS,
mixed_precision=False,
test_sentences=[
[
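For reference: with the constants defined earlier in this recipe, the new expression evaluates to 16000 * 10 = 160000 samples, whereas the previously hard-coded 220500 corresponds to the 22050 Hz rate mentioned in the old comment rather than the 16000 Hz SAMPLE_RATE actually used here.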