Add dataloader test with all supported audio formats

2023-11-07 14:07:33 -03:00 · 2023-11-07 14:07:33 -03:00 · 775a81838c
parent d551a597e0
commit 775a81838c
68 changed files with 136 additions and 96 deletions
--- a/tests/data/ljspeech/metadata_flac.csv
+++ b/tests/data/ljspeech/metadata_flac.csv
@ -0,0 +1,9 @@
+audio_file|text|transcription|speaker_name
+wavs/LJ001-0001.flac|Printing, in the only sense with which we are at present concerned, differs from most if not from all the arts and crafts represented in the Exhibition|Printing, in the only sense with which we are at present concerned, differs from most if not from all the arts and crafts represented in the Exhibition|ljspeech-0
+wavs/LJ001-0002.flac|in being comparatively modern.|in being comparatively modern.|ljspeech-0
+wavs/LJ001-0003.flac|For although the Chinese took impressions from wood blocks engraved in relief for centuries before the woodcutters of the Netherlands, by a similar process|For although the Chinese took impressions from wood blocks engraved in relief for centuries before the woodcutters of the Netherlands, by a similar process|ljspeech-1
+wavs/LJ001-0004.flac|produced the block books, which were the immediate predecessors of the true printed book,|produced the block books, which were the immediate predecessors of the true printed book,|ljspeech-1
+wavs/LJ001-0005.flac|the invention of movable metal letters in the middle of the fifteenth century may justly be considered as the invention of the art of printing.|the invention of movable metal letters in the middle of the fifteenth century may justly be considered as the invention of the art of printing.|ljspeech-2
+wavs/LJ001-0006.flac|And it is worth mention in passing that, as an example of fine typography,|And it is worth mention in passing that, as an example of fine typography,|ljspeech-2
+wavs/LJ001-0007.flac|the earliest book printed with movable types, the Gutenberg, or "forty-two line Bible" of about 1455,|the earliest book printed with movable types, the Gutenberg, or "forty-two line Bible" of about fourteen fifty-five,|ljspeech-3
+wavs/LJ001-0008.flac|has never been surpassed.|has never been surpassed.|ljspeech-3
--- a/tests/data/ljspeech/metadata_mp3.csv
+++ b/tests/data/ljspeech/metadata_mp3.csv
@ -0,0 +1,9 @@
+audio_file|text|transcription|speaker_name
+wavs/LJ001-0001.mp3|Printing, in the only sense with which we are at present concerned, differs from most if not from all the arts and crafts represented in the Exhibition|Printing, in the only sense with which we are at present concerned, differs from most if not from all the arts and crafts represented in the Exhibition|ljspeech-0
+wavs/LJ001-0002.mp3|in being comparatively modern.|in being comparatively modern.|ljspeech-0
+wavs/LJ001-0003.mp3|For although the Chinese took impressions from wood blocks engraved in relief for centuries before the woodcutters of the Netherlands, by a similar process|For although the Chinese took impressions from wood blocks engraved in relief for centuries before the woodcutters of the Netherlands, by a similar process|ljspeech-1
+wavs/LJ001-0004.mp3|produced the block books, which were the immediate predecessors of the true printed book,|produced the block books, which were the immediate predecessors of the true printed book,|ljspeech-1
+wavs/LJ001-0005.mp3|the invention of movable metal letters in the middle of the fifteenth century may justly be considered as the invention of the art of printing.|the invention of movable metal letters in the middle of the fifteenth century may justly be considered as the invention of the art of printing.|ljspeech-2
+wavs/LJ001-0006.mp3|And it is worth mention in passing that, as an example of fine typography,|And it is worth mention in passing that, as an example of fine typography,|ljspeech-2
+wavs/LJ001-0007.mp3|the earliest book printed with movable types, the Gutenberg, or "forty-two line Bible" of about 1455,|the earliest book printed with movable types, the Gutenberg, or "forty-two line Bible" of about fourteen fifty-five,|ljspeech-3
+wavs/LJ001-0008.mp3|has never been surpassed.|has never been surpassed.|ljspeech-3
--- a/tests/data/ljspeech/metadata_wav.csv
+++ b/tests/data/ljspeech/metadata_wav.csv
@ -0,0 +1,9 @@
+audio_file|text|transcription|speaker_name
+wavs/LJ001-0001.wav|Printing, in the only sense with which we are at present concerned, differs from most if not from all the arts and crafts represented in the Exhibition|Printing, in the only sense with which we are at present concerned, differs from most if not from all the arts and crafts represented in the Exhibition|ljspeech-0
+wavs/LJ001-0002.wav|in being comparatively modern.|in being comparatively modern.|ljspeech-0
+wavs/LJ001-0003.wav|For although the Chinese took impressions from wood blocks engraved in relief for centuries before the woodcutters of the Netherlands, by a similar process|For although the Chinese took impressions from wood blocks engraved in relief for centuries before the woodcutters of the Netherlands, by a similar process|ljspeech-1
+wavs/LJ001-0004.wav|produced the block books, which were the immediate predecessors of the true printed book,|produced the block books, which were the immediate predecessors of the true printed book,|ljspeech-1
+wavs/LJ001-0005.wav|the invention of movable metal letters in the middle of the fifteenth century may justly be considered as the invention of the art of printing.|the invention of movable metal letters in the middle of the fifteenth century may justly be considered as the invention of the art of printing.|ljspeech-2
+wavs/LJ001-0006.wav|And it is worth mention in passing that, as an example of fine typography,|And it is worth mention in passing that, as an example of fine typography,|ljspeech-2
+wavs/LJ001-0007.wav|the earliest book printed with movable types, the Gutenberg, or "forty-two line Bible" of about 1455,|the earliest book printed with movable types, the Gutenberg, or "forty-two line Bible" of about fourteen fifty-five,|ljspeech-3
+wavs/LJ001-0008.wav|has never been surpassed.|has never been surpassed.|ljspeech-3
--- a/tests/data/ljspeech/wavs/LJ001-0001.flac
+++ b/tests/data/ljspeech/wavs/LJ001-0001.flac
--- a/tests/data/ljspeech/wavs/LJ001-0001.mp3
+++ b/tests/data/ljspeech/wavs/LJ001-0001.mp3
--- a/tests/data/ljspeech/wavs/LJ001-0002.flac
+++ b/tests/data/ljspeech/wavs/LJ001-0002.flac
--- a/tests/data/ljspeech/wavs/LJ001-0002.mp3
+++ b/tests/data/ljspeech/wavs/LJ001-0002.mp3
--- a/tests/data/ljspeech/wavs/LJ001-0003.flac
+++ b/tests/data/ljspeech/wavs/LJ001-0003.flac
--- a/tests/data/ljspeech/wavs/LJ001-0003.mp3
+++ b/tests/data/ljspeech/wavs/LJ001-0003.mp3
--- a/tests/data/ljspeech/wavs/LJ001-0004.flac
+++ b/tests/data/ljspeech/wavs/LJ001-0004.flac
--- a/tests/data/ljspeech/wavs/LJ001-0004.mp3
+++ b/tests/data/ljspeech/wavs/LJ001-0004.mp3
--- a/tests/data/ljspeech/wavs/LJ001-0005.flac
+++ b/tests/data/ljspeech/wavs/LJ001-0005.flac
--- a/tests/data/ljspeech/wavs/LJ001-0005.mp3
+++ b/tests/data/ljspeech/wavs/LJ001-0005.mp3
--- a/tests/data/ljspeech/wavs/LJ001-0006.flac
+++ b/tests/data/ljspeech/wavs/LJ001-0006.flac
--- a/tests/data/ljspeech/wavs/LJ001-0006.mp3
+++ b/tests/data/ljspeech/wavs/LJ001-0006.mp3
--- a/tests/data/ljspeech/wavs/LJ001-0007.flac
+++ b/tests/data/ljspeech/wavs/LJ001-0007.flac
--- a/tests/data/ljspeech/wavs/LJ001-0007.mp3
+++ b/tests/data/ljspeech/wavs/LJ001-0007.mp3
--- a/tests/data/ljspeech/wavs/LJ001-0008.flac
+++ b/tests/data/ljspeech/wavs/LJ001-0008.flac
--- a/tests/data/ljspeech/wavs/LJ001-0008.mp3
+++ b/tests/data/ljspeech/wavs/LJ001-0008.mp3
--- a/tests/data/ljspeech/wavs/LJ001-0009.flac
+++ b/tests/data/ljspeech/wavs/LJ001-0009.flac
--- a/tests/data/ljspeech/wavs/LJ001-0009.mp3
+++ b/tests/data/ljspeech/wavs/LJ001-0009.mp3
--- a/tests/data/ljspeech/wavs/LJ001-0010.flac
+++ b/tests/data/ljspeech/wavs/LJ001-0010.flac
--- a/tests/data/ljspeech/wavs/LJ001-0010.mp3
+++ b/tests/data/ljspeech/wavs/LJ001-0010.mp3
--- a/tests/data/ljspeech/wavs/LJ001-0011.flac
+++ b/tests/data/ljspeech/wavs/LJ001-0011.flac
--- a/tests/data/ljspeech/wavs/LJ001-0011.mp3
+++ b/tests/data/ljspeech/wavs/LJ001-0011.mp3
--- a/tests/data/ljspeech/wavs/LJ001-0012.flac
+++ b/tests/data/ljspeech/wavs/LJ001-0012.flac
--- a/tests/data/ljspeech/wavs/LJ001-0012.mp3
+++ b/tests/data/ljspeech/wavs/LJ001-0012.mp3
--- a/tests/data/ljspeech/wavs/LJ001-0013.flac
+++ b/tests/data/ljspeech/wavs/LJ001-0013.flac
--- a/tests/data/ljspeech/wavs/LJ001-0013.mp3
+++ b/tests/data/ljspeech/wavs/LJ001-0013.mp3
--- a/tests/data/ljspeech/wavs/LJ001-0014.flac
+++ b/tests/data/ljspeech/wavs/LJ001-0014.flac
--- a/tests/data/ljspeech/wavs/LJ001-0014.mp3
+++ b/tests/data/ljspeech/wavs/LJ001-0014.mp3
--- a/tests/data/ljspeech/wavs/LJ001-0015.flac
+++ b/tests/data/ljspeech/wavs/LJ001-0015.flac
--- a/tests/data/ljspeech/wavs/LJ001-0015.mp3
+++ b/tests/data/ljspeech/wavs/LJ001-0015.mp3
--- a/tests/data/ljspeech/wavs/LJ001-0016.flac
+++ b/tests/data/ljspeech/wavs/LJ001-0016.flac
--- a/tests/data/ljspeech/wavs/LJ001-0016.mp3
+++ b/tests/data/ljspeech/wavs/LJ001-0016.mp3
--- a/tests/data/ljspeech/wavs/LJ001-0017.flac
+++ b/tests/data/ljspeech/wavs/LJ001-0017.flac
--- a/tests/data/ljspeech/wavs/LJ001-0017.mp3
+++ b/tests/data/ljspeech/wavs/LJ001-0017.mp3
--- a/tests/data/ljspeech/wavs/LJ001-0018.flac
+++ b/tests/data/ljspeech/wavs/LJ001-0018.flac
--- a/tests/data/ljspeech/wavs/LJ001-0018.mp3
+++ b/tests/data/ljspeech/wavs/LJ001-0018.mp3
--- a/tests/data/ljspeech/wavs/LJ001-0019.flac
+++ b/tests/data/ljspeech/wavs/LJ001-0019.flac
--- a/tests/data/ljspeech/wavs/LJ001-0019.mp3
+++ b/tests/data/ljspeech/wavs/LJ001-0019.mp3
--- a/tests/data/ljspeech/wavs/LJ001-0020.flac
+++ b/tests/data/ljspeech/wavs/LJ001-0020.flac
--- a/tests/data/ljspeech/wavs/LJ001-0020.mp3
+++ b/tests/data/ljspeech/wavs/LJ001-0020.mp3
--- a/tests/data/ljspeech/wavs/LJ001-0021.flac
+++ b/tests/data/ljspeech/wavs/LJ001-0021.flac
--- a/tests/data/ljspeech/wavs/LJ001-0021.mp3
+++ b/tests/data/ljspeech/wavs/LJ001-0021.mp3
--- a/tests/data/ljspeech/wavs/LJ001-0022.flac
+++ b/tests/data/ljspeech/wavs/LJ001-0022.flac
--- a/tests/data/ljspeech/wavs/LJ001-0022.mp3
+++ b/tests/data/ljspeech/wavs/LJ001-0022.mp3
--- a/tests/data/ljspeech/wavs/LJ001-0023.flac
+++ b/tests/data/ljspeech/wavs/LJ001-0023.flac
--- a/tests/data/ljspeech/wavs/LJ001-0023.mp3
+++ b/tests/data/ljspeech/wavs/LJ001-0023.mp3
--- a/tests/data/ljspeech/wavs/LJ001-0024.flac
+++ b/tests/data/ljspeech/wavs/LJ001-0024.flac
--- a/tests/data/ljspeech/wavs/LJ001-0024.mp3
+++ b/tests/data/ljspeech/wavs/LJ001-0024.mp3
--- a/tests/data/ljspeech/wavs/LJ001-0025.flac
+++ b/tests/data/ljspeech/wavs/LJ001-0025.flac
--- a/tests/data/ljspeech/wavs/LJ001-0025.mp3
+++ b/tests/data/ljspeech/wavs/LJ001-0025.mp3
--- a/tests/data/ljspeech/wavs/LJ001-0026.flac
+++ b/tests/data/ljspeech/wavs/LJ001-0026.flac
--- a/tests/data/ljspeech/wavs/LJ001-0026.mp3
+++ b/tests/data/ljspeech/wavs/LJ001-0026.mp3
--- a/tests/data/ljspeech/wavs/LJ001-0027.flac
+++ b/tests/data/ljspeech/wavs/LJ001-0027.flac
--- a/tests/data/ljspeech/wavs/LJ001-0027.mp3
+++ b/tests/data/ljspeech/wavs/LJ001-0027.mp3
--- a/tests/data/ljspeech/wavs/LJ001-0028.flac
+++ b/tests/data/ljspeech/wavs/LJ001-0028.flac
--- a/tests/data/ljspeech/wavs/LJ001-0028.mp3
+++ b/tests/data/ljspeech/wavs/LJ001-0028.mp3
--- a/tests/data/ljspeech/wavs/LJ001-0029.flac
+++ b/tests/data/ljspeech/wavs/LJ001-0029.flac
--- a/tests/data/ljspeech/wavs/LJ001-0029.mp3
+++ b/tests/data/ljspeech/wavs/LJ001-0029.mp3
--- a/tests/data/ljspeech/wavs/LJ001-0030.flac
+++ b/tests/data/ljspeech/wavs/LJ001-0030.flac
--- a/tests/data/ljspeech/wavs/LJ001-0030.mp3
+++ b/tests/data/ljspeech/wavs/LJ001-0030.mp3
--- a/tests/data/ljspeech/wavs/LJ001-0031.flac
+++ b/tests/data/ljspeech/wavs/LJ001-0031.flac
--- a/tests/data/ljspeech/wavs/LJ001-0031.mp3
+++ b/tests/data/ljspeech/wavs/LJ001-0031.mp3
--- a/tests/data/ljspeech/wavs/LJ001-0032.flac
+++ b/tests/data/ljspeech/wavs/LJ001-0032.flac
--- a/tests/data/ljspeech/wavs/LJ001-0032.mp3
+++ b/tests/data/ljspeech/wavs/LJ001-0032.mp3
--- a/tests/data_tests/test_loader.py
+++ b/tests/data_tests/test_loader.py
@ -21,15 +21,30 @@ os.makedirs(OUTPATH, exist_ok=True)
 c = BaseTTSConfig(text_cleaner="english_cleaners", num_loader_workers=0, batch_size=2, use_noise_augment=False)
 c.r = 5
 c.data_path = os.path.join(get_tests_data_path(), "ljspeech/")
-ok_ljspeech = os.path.exists(c.data_path)

-dataset_config = BaseDatasetConfig(
-    formatter="ljspeech_test",  # ljspeech_test to multi-speaker
-    meta_file_train="metadata.csv",
+dataset_config_wav = BaseDatasetConfig(
+    formatter="coqui",  # ljspeech_test to multi-speaker
+    meta_file_train="metadata_wav.csv",
    meta_file_val=None,
    path=c.data_path,
    language="en",
 )
+dataset_config_mp3 = BaseDatasetConfig(
+    formatter="coqui",  # ljspeech_test to multi-speaker
+    meta_file_train="metadata_mp3.csv",
+    meta_file_val=None,
+    path=c.data_path,
+    language="en",
+)
+dataset_config_flac = BaseDatasetConfig(
+    formatter="coqui",  # ljspeech_test to multi-speaker
+    meta_file_train="metadata_flac.csv",
+    meta_file_val=None,
+    path=c.data_path,
+    language="en",
+)
+
+dataset_configs = [dataset_config_wav, dataset_config_mp3, dataset_config_flac]

 DATA_EXIST = True
 if not os.path.exists(c.data_path):
@ -44,11 +59,10 @@ class TestTTSDataset(unittest.TestCase):
        self.max_loader_iter = 4
        self.ap = AudioProcessor(**c.audio)

-    def _create_dataloader(self, batch_size, r, bgs, start_by_longest=False):
+    def _create_dataloader(self, batch_size, r, bgs, dataset_config, start_by_longest=False, preprocess_samples=False):
        # load dataset
        meta_data_train, meta_data_eval = load_tts_samples(dataset_config, eval_split=True, eval_split_size=0.2)
        items = meta_data_train + meta_data_eval
-
        tokenizer, _ = TTSTokenizer.init_from_config(c)
        dataset = TTSDataset(
            outputs_per_step=r,
@ -64,6 +78,11 @@ class TestTTSDataset(unittest.TestCase):
            max_audio_len=c.max_audio_len,
            start_by_longest=start_by_longest,
        )
+
+        # add preprocess to force the length computation
+        if preprocess_samples:
+            dataset.preprocess_samples()
+
        dataloader = DataLoader(
            dataset,
            batch_size=batch_size,
@ -75,9 +94,8 @@ class TestTTSDataset(unittest.TestCase):
        return dataloader, dataset

    def test_loader(self):
-        if ok_ljspeech:
-            dataloader, dataset = self._create_dataloader(1, 1, 0)
-
+        for dataset_config in dataset_configs:
+            dataloader, _ = self._create_dataloader(1, 1, 0, dataset_config, preprocess_samples=True)
            for i, data in enumerate(dataloader):
                if i == self.max_loader_iter:
                    break
@ -104,8 +122,6 @@ class TestTTSDataset(unittest.TestCase):

                # make sure that the computed mels and the waveform match and correctly computed
                mel_new = self.ap.melspectrogram(wavs[0].squeeze().numpy())
-                # remove padding in mel-spectrogram
-                mel_dataloader = mel_input[0].T.numpy()[:, : mel_lengths[0]]
                # guarantee that both mel-spectrograms have the same size and that we will remove waveform padding
                mel_new = mel_new[:, : mel_lengths[0]]
                ignore_seg = -(1 + c.audio.win_length // c.audio.hop_length)
@ -124,8 +140,7 @@ class TestTTSDataset(unittest.TestCase):
                    self.assertGreaterEqual(mel_input.min(), 0)

    def test_batch_group_shuffle(self):
-        if ok_ljspeech:
-            dataloader, dataset = self._create_dataloader(2, c.r, 16)
+        dataloader, dataset = self._create_dataloader(2, c.r, 16, dataset_config_wav)
        last_length = 0
        frames = dataset.samples
        for i, data in enumerate(dataloader):
@ -147,8 +162,7 @@ class TestTTSDataset(unittest.TestCase):

        Ther first item of the fist batch must be longer than all the other items.
        """
-        if ok_ljspeech:
-            dataloader, _ = self._create_dataloader(2, c.r, 0, True)
+        dataloader, _ = self._create_dataloader(2, c.r, 0, dataset_config_wav, start_by_longest=True)
        dataloader.dataset.preprocess_samples()
        for i, data in enumerate(dataloader):
            if i == self.max_loader_iter:
@ -172,8 +186,7 @@ class TestTTSDataset(unittest.TestCase):
            self.assertEqual(mel_lengths[idx], linear_input[idx].shape[0])
            self.assertEqual(mel_lengths[idx], mel_input[idx].shape[0])

-        if ok_ljspeech:
-            dataloader, _ = self._create_dataloader(1, 1, 0)
+        dataloader, _ = self._create_dataloader(1, 1, 0, dataset_config_wav)

        for i, data in enumerate(dataloader):
            if i == self.max_loader_iter:
@ -210,7 +223,7 @@ class TestTTSDataset(unittest.TestCase):
            check_conditions(0, linear_input, mel_input, stop_target, mel_lengths)

        # Test for batch size 2
-            dataloader, _ = self._create_dataloader(2, 1, 0)
+        dataloader, _ = self._create_dataloader(2, 1, 0, dataset_config_wav)

        for i, data in enumerate(dataloader):
            if i == self.max_loader_iter: