From 28a746497560dc3f1f3415827ef38d7d9d72dbbf Mon Sep 17 00:00:00 2001
From: Edresson Casanova
Date: Mon, 21 Feb 2022 05:59:36 -0300
Subject: [PATCH] Fix the bug in split dataset function (#1251)

* Fix the bug in split_dataset
* Make eval_split_size configurable
* Change test_loader to use the load_tts_samples function
* Change eval_split_portion to eval_split_size and permit setting the absolute number of samples in eval
* Fix samplers unit test
* Add data unit test on GitHub workflow
---
 .github/workflows/data_tests.yml    | 46 +++++++++++++++++++++++++++++
 Makefile                            |  3 ++
 TTS/bin/extract_tts_spectrograms.py |  2 +-
 TTS/bin/find_unique_chars.py        |  2 +-
 TTS/bin/find_unique_phonemes.py     |  2 +-
 TTS/bin/train_tts.py                |  2 +-
 TTS/tts/configs/shared_configs.py   | 10 +++++++
 TTS/tts/datasets/__init__.py        | 41 +++++++++++++++++++------
 TTS/tts/datasets/formatters.py      |  6 +++-
 tests/data_tests/test_loader.py     | 27 ++++++++++++-----
 tests/data_tests/test_samplers.py   |  4 +--
 11 files changed, 121 insertions(+), 24 deletions(-)
 create mode 100644 .github/workflows/data_tests.yml

diff --git a/.github/workflows/data_tests.yml b/.github/workflows/data_tests.yml
new file mode 100644
index 00000000..296aa570
--- /dev/null
+++ b/.github/workflows/data_tests.yml
@@ -0,0 +1,46 @@
+name: data-tests
+
+on:
+  push:
+    branches:
+      - main
+  pull_request:
+    types: [opened, synchronize, reopened]
+jobs:
+  check_skip:
+    runs-on: ubuntu-latest
+    if: "! contains(github.event.head_commit.message, '[ci skip]')"
+    steps:
+      - run: echo "${{ github.event.head_commit.message }}"
+
+  test:
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        python-version: [3.6, 3.7, 3.8, 3.9]
+        experimental: [false]
+    steps:
+      - uses: actions/checkout@v2
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: coqui-ai/setup-python@pip-cache-key-py-ver
+        with:
+          python-version: ${{ matrix.python-version }}
+          architecture: x64
+          cache: 'pip'
+          cache-dependency-path: 'requirements*'
+      - name: check OS
+        run: cat /etc/os-release
+      - name: Install dependencies
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y --no-install-recommends git make gcc
+          make system-deps
+      - name: Install/upgrade Python setup deps
+        run: python3 -m pip install --upgrade pip setuptools wheel
+      - name: Install TTS
+        run: |
+          python3 -m pip install .[all]
+          python3 setup.py egg_info
+      - name: Unit tests
+        run: make data_tests

diff --git a/Makefile b/Makefile
index 2632dbab..6752fa04 100644
--- a/Makefile
+++ b/Makefile
@@ -26,6 +26,9 @@ test_aux:	## run aux tests.
 test_zoo:	## run zoo tests.
 	nosetests tests.zoo_tests -x --with-cov -cov --cover-erase --cover-package TTS tests.zoo_tests --nologcapture --with-id
 
+data_tests:	## run data tests.
+	nosetests tests.data_tests -x --with-cov -cov --cover-erase --cover-package TTS tests.data_tests --nologcapture --with-id
+
 test_failed:  ## only run tests failed the last time.
 	nosetests -x --with-cov -cov --cover-erase --cover-package TTS tests --nologcapture --failed

diff --git a/TTS/bin/extract_tts_spectrograms.py b/TTS/bin/extract_tts_spectrograms.py
index 7b489fd6..e21f57c9 100755
--- a/TTS/bin/extract_tts_spectrograms.py
+++ b/TTS/bin/extract_tts_spectrograms.py
@@ -229,7 +229,7 @@ def main(args):  # pylint: disable=redefined-outer-name
     ap = AudioProcessor(**c.audio)
 
     # load data instances
-    meta_data_train, meta_data_eval = load_tts_samples(c.datasets, eval_split=args.eval)
+    meta_data_train, meta_data_eval = load_tts_samples(c.datasets, eval_split=args.eval, eval_split_max_size=c.eval_split_max_size, eval_split_size=c.eval_split_size)
 
     # use eval and training partitions
     meta_data = meta_data_train + meta_data_eval

diff --git a/TTS/bin/find_unique_chars.py b/TTS/bin/find_unique_chars.py
index fb98bab5..541e971b 100644
--- a/TTS/bin/find_unique_chars.py
+++ b/TTS/bin/find_unique_chars.py
@@ -23,7 +23,7 @@ def main():
     c = load_config(args.config_path)
 
     # load all datasets
-    train_items, eval_items = load_tts_samples(c.datasets, eval_split=True)
+    train_items, eval_items = load_tts_samples(c.datasets, eval_split=True, eval_split_max_size=c.eval_split_max_size, eval_split_size=c.eval_split_size)
 
     items = train_items + eval_items

diff --git a/TTS/bin/find_unique_phonemes.py b/TTS/bin/find_unique_phonemes.py
index 02a783c7..ad567434 100644
--- a/TTS/bin/find_unique_phonemes.py
+++ b/TTS/bin/find_unique_phonemes.py
@@ -39,7 +39,7 @@ def main():
     c = load_config(args.config_path)
 
     # load all datasets
-    train_items, eval_items = load_tts_samples(c.datasets, eval_split=True)
+    train_items, eval_items = load_tts_samples(c.datasets, eval_split=True, eval_split_max_size=c.eval_split_max_size, eval_split_size=c.eval_split_size)
     items = train_items + eval_items
     print("Num items:", len(items))

diff --git a/TTS/bin/train_tts.py b/TTS/bin/train_tts.py
index a7ce8ef3..16251fdd 100644
--- a/TTS/bin/train_tts.py
+++ b/TTS/bin/train_tts.py
@@ -42,7 +42,7 @@ def main():
     config = register_config(config_base.model)()
 
     # load training samples
-    train_samples, eval_samples = load_tts_samples(config.datasets, eval_split=True)
+    train_samples, eval_samples = load_tts_samples(config.datasets, eval_split=True, eval_split_max_size=config.eval_split_max_size, eval_split_size=config.eval_split_size)
 
     # setup audio processor
     ap = AudioProcessor(**config.audio)

diff --git a/TTS/tts/configs/shared_configs.py b/TTS/tts/configs/shared_configs.py
index 60ef7276..65ed21de 100644
--- a/TTS/tts/configs/shared_configs.py
+++ b/TTS/tts/configs/shared_configs.py
@@ -183,6 +183,13 @@ class BaseTTSConfig(BaseTrainingConfig):
 
     test_sentences (List[str]):
         List of sentences to be used at testing. Defaults to '[]'
+
+    eval_split_max_size (int):
+        Maximum number of samples to be used for evaluation when the split is computed as a proportion. Defaults to None (disabled).
+
+    eval_split_size (float):
+        If between 0.0 and 1.0, represents the proportion of the dataset to include in the evaluation set.
+        If > 1, represents the absolute number of evaluation samples. Defaults to 0.01 (1%).
     """
 
     audio: BaseAudioConfig = field(default_factory=BaseAudioConfig)
@@ -218,3 +225,6 @@ class BaseTTSConfig(BaseTrainingConfig):
     lr_scheduler_params: dict = field(default_factory=lambda: {})
     # testing
     test_sentences: List[str] = field(default_factory=lambda: [])
+    # evaluation
+    eval_split_max_size: int = None
+    eval_split_size: float = 0.01
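For context, a minimal sketch of how these two new config fields are meant to be combined. This is illustrative only and not part of the patch; the numeric values are made up:

    from TTS.tts.configs.shared_configs import BaseTTSConfig

    # take 1% of the dataset for eval, but never more than 256 samples
    config = BaseTTSConfig(eval_split_size=0.01, eval_split_max_size=256)

    # or request an absolute number of eval samples directly (values > 1)
    config = BaseTTSConfig(eval_split_size=100)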
""" audio: BaseAudioConfig = field(default_factory=BaseAudioConfig) @@ -218,3 +225,6 @@ class BaseTTSConfig(BaseTrainingConfig): lr_scheduler_params: dict = field(default_factory=lambda: {}) # testing test_sentences: List[str] = field(default_factory=lambda: []) + # evaluation + eval_split_max_size: int = None + eval_split_size: float = 0.01 diff --git a/TTS/tts/datasets/__init__.py b/TTS/tts/datasets/__init__.py index 455413fa..d80e92c9 100644 --- a/TTS/tts/datasets/__init__.py +++ b/TTS/tts/datasets/__init__.py @@ -9,25 +9,40 @@ from TTS.tts.datasets.dataset import * from TTS.tts.datasets.formatters import * -def split_dataset(items): +def split_dataset(items, eval_split_max_size=None, eval_split_size=0.01): """Split a dataset into train and eval. Consider speaker distribution in multi-speaker training. Args: - items (List[List]): A list of samples. Each sample is a list of `[audio_path, text, speaker_id]`. + items (List[List]): + A list of samples. Each sample is a list of `[audio_path, text, speaker_id]`. + + eval_split_max_size (int): + Number maximum of samples to be used for evaluation in proportion split. Defaults to None (Disabled). + + eval_split_size (float): + If between 0.0 and 1.0 represents the proportion of the dataset to include in the evaluation set. + If > 1, represents the absolute number of evaluation samples. Defaults to 0.01 (1%). """ - speakers = [item[-1] for item in items] + speakers = [item["speaker_name"] for item in items] is_multi_speaker = len(set(speakers)) > 1 - eval_split_size = min(500, int(len(items) * 0.01)) - assert eval_split_size > 0, " [!] You do not have enough samples to train. You need at least 100 samples." + if eval_split_size > 1: + eval_split_size = int(eval_split_size) + else: + if eval_split_max_size: + eval_split_size = min(eval_split_max_size, int(len(items) * eval_split_size)) + else: + eval_split_size = int(len(items) * eval_split_size) + + assert eval_split_size > 0, " [!] You do not have enough samples for the evaluation set. You can work around this setting the 'eval_split_size' parameter to a minimum of {}".format(1/len(items)) np.random.seed(0) np.random.shuffle(items) if is_multi_speaker: items_eval = [] - speakers = [item[-1] for item in items] + speakers = [item["speaker_name"] for item in items] speaker_counter = Counter(speakers) while len(items_eval) < eval_split_size: item_idx = np.random.randint(0, len(items)) - speaker_to_be_removed = items[item_idx][-1] + speaker_to_be_removed = items[item_idx]["speaker_name"] if speaker_counter[speaker_to_be_removed] > 1: items_eval.append(items[item_idx]) speaker_counter[speaker_to_be_removed] -= 1 @@ -37,7 +52,8 @@ def split_dataset(items): def load_tts_samples( - datasets: Union[List[Dict], Dict], eval_split=True, formatter: Callable = None + datasets: Union[List[Dict], Dict], eval_split=True, formatter: Callable = None, + eval_split_max_size=None, eval_split_size=0.01 ) -> Tuple[List[List], List[List]]: """Parse the dataset from the datasets config, load the samples as a List and load the attention alignments if provided. If `formatter` is not None, apply the formatter to the samples else pick the formatter from the available ones based @@ -55,6 +71,13 @@ def load_tts_samples( `[[audio_path, text, speaker_id], ...]]`. See the available formatters in `TTS.tts.dataset.formatter` as example. Defaults to None. + eval_split_max_size (int): + Number maximum of samples to be used for evaluation in proportion split. Defaults to None (Disabled). 
diff --git a/TTS/tts/datasets/formatters.py b/TTS/tts/datasets/formatters.py
index 28eb0e0f..5cbc93db 100644
--- a/TTS/tts/datasets/formatters.py
+++ b/TTS/tts/datasets/formatters.py
@@ -129,11 +129,15 @@ def ljspeech_test(root_path, meta_file, **kwargs):  # pylint: disable=unused-argument
     txt_file = os.path.join(root_path, meta_file)
     items = []
     with open(txt_file, "r", encoding="utf-8") as ttf:
+        speaker_id = 0
         for idx, line in enumerate(ttf):
+            # 2 samples per speaker to avoid eval split issues
+            if idx % 2 == 0:
+                speaker_id += 1
             cols = line.split("|")
             wav_file = os.path.join(root_path, "wavs", cols[0] + ".wav")
             text = cols[2]
-            items.append({"text": text, "audio_file": wav_file, "speaker_name": f"ljspeech-{idx}"})
+            items.append({"text": text, "audio_file": wav_file, "speaker_name": f"ljspeech-{speaker_id}"})
     return items
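The two-samples-per-speaker grouping matters because the multi-speaker branch of `split_dataset` only moves a sample to eval when its speaker still has more than one sample; under the old `ljspeech-{idx}` naming every speaker was unique, so the eval set could never fill. A toy illustration with hypothetical data:

    from collections import Counter

    items = [
        {"text": "a", "audio_file": "0.wav", "speaker_name": "ljspeech-1"},
        {"text": "b", "audio_file": "1.wav", "speaker_name": "ljspeech-1"},
        {"text": "c", "audio_file": "2.wav", "speaker_name": "ljspeech-2"},
        {"text": "d", "audio_file": "3.wav", "speaker_name": "ljspeech-2"},
    ]
    speaker_counter = Counter(item["speaker_name"] for item in items)
    # every speaker has 2 samples, so one sample per speaker can move to
    # eval while the training split still covers all speakers
    assert all(count > 1 for count in speaker_counter.values())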
diff --git a/tests/data_tests/test_loader.py b/tests/data_tests/test_loader.py
index 19c2e8f7..d210995d 100644
--- a/tests/data_tests/test_loader.py
+++ b/tests/data_tests/test_loader.py
@@ -8,8 +8,8 @@ from torch.utils.data import DataLoader
 
 from tests import get_tests_output_path
 from TTS.tts.configs.shared_configs import BaseTTSConfig
-from TTS.tts.datasets import TTSDataset
-from TTS.tts.datasets.formatters import ljspeech
+from TTS.tts.datasets import TTSDataset, load_tts_samples
+from TTS.config.shared_configs import BaseDatasetConfig
 from TTS.utils.audio import AudioProcessor
 
 # pylint: disable=unused-variable
 
@@ -18,11 +18,19 @@
 OUTPATH = os.path.join(get_tests_output_path(), "loader_tests/")
 os.makedirs(OUTPATH, exist_ok=True)
 
 # create a dummy config for testing data loaders.
-c = BaseTTSConfig(text_cleaner="english_cleaners", num_loader_workers=0, batch_size=2)
+c = BaseTTSConfig(text_cleaner="english_cleaners", num_loader_workers=0, batch_size=2, use_noise_augment=False)
 c.r = 5
 c.data_path = "tests/data/ljspeech/"
 ok_ljspeech = os.path.exists(c.data_path)
 
+dataset_config = BaseDatasetConfig(
+    name="ljspeech_test",  # the ljspeech_test formatter produces a multi-speaker dataset
+    meta_file_train="metadata.csv",
+    meta_file_val=None,
+    path=c.data_path,
+    language="en",
+)
+
 DATA_EXIST = True
 if not os.path.exists(c.data_path):
     DATA_EXIST = False
@@ -37,11 +45,10 @@ class TestTTSDataset(unittest.TestCase):
         self.ap = AudioProcessor(**c.audio)
 
     def _create_dataloader(self, batch_size, r, bgs):
-        items = ljspeech(c.data_path, "metadata.csv")
-
-        # add a default language because now the TTSDataset expect a language
-        language = ""
-        items = [[*item, language] for item in items]
+        # load dataset
+        meta_data_train, meta_data_eval = load_tts_samples(dataset_config, eval_split=True, eval_split_size=0.2)
+        items = meta_data_train + meta_data_eval
 
         dataset = TTSDataset(
             r,
@@ -97,8 +104,12 @@ class TestTTSDataset(unittest.TestCase):
 
             # make sure that the computed mels and the waveform match and correctly computed
             mel_new = self.ap.melspectrogram(wavs[0].squeeze().numpy())
+            # remove padding in mel-spectrogram
+            mel_dataloader = mel_input[0].T.numpy()[:, : mel_lengths[0]]
+            # trim mel_new to the same length so the waveform padding is dropped as well
+            mel_new = mel_new[:, : mel_lengths[0]]
             ignore_seg = -(1 + c.audio.win_length // c.audio.hop_length)
-            mel_diff = (mel_new[:, : mel_input.shape[1]] - mel_input[0].T.numpy())[:, 0:ignore_seg]
+            mel_diff = (mel_new - mel_dataloader)[:, 0:ignore_seg]
             assert abs(mel_diff.sum()) < 1e-5
 
             # check normalization ranges

diff --git a/tests/data_tests/test_samplers.py b/tests/data_tests/test_samplers.py
index 3d8d6c75..497a3fb5 100644
--- a/tests/data_tests/test_samplers.py
+++ b/tests/data_tests/test_samplers.py
@@ -39,7 +39,7 @@ random_sampler = torch.utils.data.RandomSampler(train_samples)
 ids = functools.reduce(lambda a, b: a + b, [list(random_sampler) for i in range(100)])
 en, pt = 0, 0
 for index in ids:
-    if train_samples[index][3] == "en":
+    if train_samples[index]["language"] == "en":
         en += 1
     else:
         pt += 1
@@ -50,7 +50,7 @@ weighted_sampler = get_language_weighted_sampler(train_samples)
 ids = functools.reduce(lambda a, b: a + b, [list(weighted_sampler) for i in range(100)])
 en, pt = 0, 0
 for index in ids:
-    if train_samples[index][3] == "en":
+    if train_samples[index]["language"] == "en":
         en += 1
     else:
         pt += 1
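The padding-trim change in the loader test can be illustrated in isolation. A sketch with made-up shapes, assuming `mel_lengths` holds the unpadded frame count per batch item, as in the test above:

    import numpy as np

    # batch of mels padded to the longest item: (batch, max_frames, n_mels)
    mel_input = np.zeros((2, 100, 80), dtype=np.float32)
    mel_lengths = np.array([87, 100])

    # mel recomputed from the (padded) waveform: (n_mels, frames)
    mel_new = np.zeros((80, 103), dtype=np.float32)

    # trim both to the true length before differencing, so neither the
    # batch padding nor the waveform padding leaks into the comparison
    mel_dataloader = mel_input[0].T[:, : mel_lengths[0]]  # (80, 87)
    mel_new = mel_new[:, : mel_lengths[0]]                # (80, 87)
    assert mel_new.shape == mel_dataloader.shape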