From a066e14fb1cf815268214fa08c42768c7eee3884 Mon Sep 17 00:00:00 2001
From: Edresson Casanova <edresson1@gmail.com>
Date: Sat, 10 Dec 2022 13:49:57 -0300
Subject: [PATCH] Add parameter for eval metadata file to the
 compute_embeddings function

---
 TTS/bin/compute_embeddings.py         | 21 ++++++++++++++++-----
 recipes/vctk/yourtts/train_yourtts.py |  8 ++++++--
 2 files changed, 22 insertions(+), 7 deletions(-)

diff --git a/TTS/bin/compute_embeddings.py b/TTS/bin/compute_embeddings.py
index ace6deef..7e0932cc 100644
--- a/TTS/bin/compute_embeddings.py
+++ b/TTS/bin/compute_embeddings.py
@@ -22,6 +22,7 @@ def compute_embeddings(
     dataset_name=None,
     dataset_path=None,
     meta_file_train=None,
+    meta_file_val=None,
     disable_cuda=False,
     no_eval=False,
 ):
@@ -35,7 +36,10 @@ def compute_embeddings(
         c_dataset.formatter = formatter_name
         c_dataset.dataset_name = dataset_name
         c_dataset.path = dataset_path
-        c_dataset.meta_file_train = meta_file_train if meta_file_train else None
+        if meta_file_train is not None:
+            c_dataset.meta_file_train = meta_file_train
+        if meta_file_val is not None:
+            c_dataset.meta_file_val = meta_file_val
         meta_data_train, meta_data_eval = load_tts_samples(c_dataset, eval_split=not no_eval)
 
     if meta_data_eval is None:
@@ -92,7 +96,7 @@ if __name__ == "__main__":
         Example runs:
         python TTS/bin/compute_embeddings.py --model_path speaker_encoder_model.pth --config_path speaker_encoder_config.json  --config_dataset_path dataset_config.json
 
-        python TTS/bin/compute_embeddings.py --model_path speaker_encoder_model.pth --config_path speaker_encoder_config.json  --fomatter vctk --dataset_path /path/to/vctk/dataset --dataset_name my_vctk --metafile /path/to/vctk/metafile.csv
+        python TTS/bin/compute_embeddings.py --model_path speaker_encoder_model.pth --config_path speaker_encoder_config.json  --formatter_name coqui --dataset_path /path/to/vctk/dataset --dataset_name my_vctk --meta_file_train /path/to/vctk/metafile_train.csv --meta_file_val /path/to/vctk/metafile_eval.csv
         """,
         formatter_class=RawTextHelpFormatter,
     )
@@ -139,9 +143,15 @@ if __name__ == "__main__":
         default=None,
     )
     parser.add_argument(
-        "--metafile",
+        "--meta_file_train",
         type=str,
-        help="Path to the meta file. If not set, dataset formatter uses the default metafile if it is defined in the formatter. You either need to provide this or `config_dataset_path`",
+        help="Path to the train meta file. If not set, dataset formatter uses the default metafile if it is defined in the formatter. You either need to provide this or `config_dataset_path`",
+        default=None,
+    )
+    parser.add_argument(
+        "--meta_file_val",
+        type=str,
+        help="Path to the evaluation meta file. If not set, dataset formatter uses the default metafile if it is defined in the formatter. You either need to provide this or `config_dataset_path`",
         default=None,
     )
     args = parser.parse_args()
@@ -155,7 +165,8 @@ if __name__ == "__main__":
         formatter_name=args.formatter_name,
         dataset_name=args.dataset_name,
         dataset_path=args.dataset_path,
-        meta_file_train=args.metafile,
+        meta_file_train=args.meta_file_train,
+        meta_file_val=args.meta_file_val,
         disable_cuda=args.disable_cuda,
         no_eval=args.no_eval,
     )
diff --git a/recipes/vctk/yourtts/train_yourtts.py b/recipes/vctk/yourtts/train_yourtts.py
index b226880e..1487a9fc 100644
--- a/recipes/vctk/yourtts/train_yourtts.py
+++ b/recipes/vctk/yourtts/train_yourtts.py
@@ -42,6 +42,9 @@ BATCH_SIZE = 32
 # Note: If you add new datasets, please make sure that the dataset sampling rate and this parameter are matching, otherwise resample your audios
 SAMPLE_RATE = 16000
 
+# Max audio length in seconds to be used in training (audio clips longer than this are ignored)
+MAX_AUDIO_LEN_IN_SECONDS = 10
+
 ### Download VCTK dataset
 VCTK_DOWNLOAD_PATH = os.path.join(CURRENT_PATH, "VCTK")
 # Define the number of threads used during the audio resampling
@@ -54,7 +57,7 @@ if not os.path.exists(VCTK_DOWNLOAD_PATH):
 
 # init configs
 vctk_config = BaseDatasetConfig(
-    formatter="vctk", dataset_name="vctk", meta_file_train="", path=VCTK_DOWNLOAD_PATH, language="en"
+    formatter="vctk", dataset_name="vctk", meta_file_train="", meta_file_val="", path=VCTK_DOWNLOAD_PATH, language="en"
 )
 
 # Add here all datasets configs, in our case we just want to train with the VCTK dataset then we need to add just VCTK. Note: If you want to added new datasets just added they here and it will automatically compute the speaker embeddings (d-vectors) for this new dataset :)
@@ -84,6 +87,7 @@ for dataset_conf in DATASETS_CONFIG_LIST:
             dataset_name=dataset_conf.dataset_name,
             dataset_path=dataset_conf.path,
             meta_file_train=dataset_conf.meta_file_train,
+            meta_file_val=dataset_conf.meta_file_val,
             disable_cuda=False,
             no_eval=False,
         )
@@ -153,7 +157,7 @@ config = VitsConfig(
     start_by_longest=True,
     datasets=DATASETS_CONFIG_LIST,
     cudnn_benchmark=False,
-    max_audio_len=220500,  # it should be: sampling rate * max audio in sec. So it is 22050 * 10 = 220500
+    max_audio_len=SAMPLE_RATE * MAX_AUDIO_LEN_IN_SECONDS,
     mixed_precision=False,
     test_sentences=[
         [