Add dataloader test with all supported audio formats

This commit is contained in:
Edresson Casanova 2023-11-07 14:07:33 -03:00
parent d551a597e0
commit 775a81838c
68 changed files with 136 additions and 96 deletions

View File

@ -0,0 +1,9 @@
audio_file|text|transcription|speaker_name
wavs/LJ001-0001.flac|Printing, in the only sense with which we are at present concerned, differs from most if not from all the arts and crafts represented in the Exhibition|Printing, in the only sense with which we are at present concerned, differs from most if not from all the arts and crafts represented in the Exhibition|ljspeech-0
wavs/LJ001-0002.flac|in being comparatively modern.|in being comparatively modern.|ljspeech-0
wavs/LJ001-0003.flac|For although the Chinese took impressions from wood blocks engraved in relief for centuries before the woodcutters of the Netherlands, by a similar process|For although the Chinese took impressions from wood blocks engraved in relief for centuries before the woodcutters of the Netherlands, by a similar process|ljspeech-1
wavs/LJ001-0004.flac|produced the block books, which were the immediate predecessors of the true printed book,|produced the block books, which were the immediate predecessors of the true printed book,|ljspeech-1
wavs/LJ001-0005.flac|the invention of movable metal letters in the middle of the fifteenth century may justly be considered as the invention of the art of printing.|the invention of movable metal letters in the middle of the fifteenth century may justly be considered as the invention of the art of printing.|ljspeech-2
wavs/LJ001-0006.flac|And it is worth mention in passing that, as an example of fine typography,|And it is worth mention in passing that, as an example of fine typography,|ljspeech-2
wavs/LJ001-0007.flac|the earliest book printed with movable types, the Gutenberg, or "forty-two line Bible" of about 1455,|the earliest book printed with movable types, the Gutenberg, or "forty-two line Bible" of about fourteen fifty-five,|ljspeech-3
wavs/LJ001-0008.flac|has never been surpassed.|has never been surpassed.|ljspeech-3
Can't render this file because it contains an unexpected character in line 8 and column 86.

View File

@ -0,0 +1,9 @@
audio_file|text|transcription|speaker_name
wavs/LJ001-0001.mp3|Printing, in the only sense with which we are at present concerned, differs from most if not from all the arts and crafts represented in the Exhibition|Printing, in the only sense with which we are at present concerned, differs from most if not from all the arts and crafts represented in the Exhibition|ljspeech-0
wavs/LJ001-0002.mp3|in being comparatively modern.|in being comparatively modern.|ljspeech-0
wavs/LJ001-0003.mp3|For although the Chinese took impressions from wood blocks engraved in relief for centuries before the woodcutters of the Netherlands, by a similar process|For although the Chinese took impressions from wood blocks engraved in relief for centuries before the woodcutters of the Netherlands, by a similar process|ljspeech-1
wavs/LJ001-0004.mp3|produced the block books, which were the immediate predecessors of the true printed book,|produced the block books, which were the immediate predecessors of the true printed book,|ljspeech-1
wavs/LJ001-0005.mp3|the invention of movable metal letters in the middle of the fifteenth century may justly be considered as the invention of the art of printing.|the invention of movable metal letters in the middle of the fifteenth century may justly be considered as the invention of the art of printing.|ljspeech-2
wavs/LJ001-0006.mp3|And it is worth mention in passing that, as an example of fine typography,|And it is worth mention in passing that, as an example of fine typography,|ljspeech-2
wavs/LJ001-0007.mp3|the earliest book printed with movable types, the Gutenberg, or "forty-two line Bible" of about 1455,|the earliest book printed with movable types, the Gutenberg, or "forty-two line Bible" of about fourteen fifty-five,|ljspeech-3
wavs/LJ001-0008.mp3|has never been surpassed.|has never been surpassed.|ljspeech-3
Can't render this file because it contains an unexpected character in line 8 and column 85.

View File

@ -0,0 +1,9 @@
audio_file|text|transcription|speaker_name
wavs/LJ001-0001.wav|Printing, in the only sense with which we are at present concerned, differs from most if not from all the arts and crafts represented in the Exhibition|Printing, in the only sense with which we are at present concerned, differs from most if not from all the arts and crafts represented in the Exhibition|ljspeech-0
wavs/LJ001-0002.wav|in being comparatively modern.|in being comparatively modern.|ljspeech-0
wavs/LJ001-0003.wav|For although the Chinese took impressions from wood blocks engraved in relief for centuries before the woodcutters of the Netherlands, by a similar process|For although the Chinese took impressions from wood blocks engraved in relief for centuries before the woodcutters of the Netherlands, by a similar process|ljspeech-1
wavs/LJ001-0004.wav|produced the block books, which were the immediate predecessors of the true printed book,|produced the block books, which were the immediate predecessors of the true printed book,|ljspeech-1
wavs/LJ001-0005.wav|the invention of movable metal letters in the middle of the fifteenth century may justly be considered as the invention of the art of printing.|the invention of movable metal letters in the middle of the fifteenth century may justly be considered as the invention of the art of printing.|ljspeech-2
wavs/LJ001-0006.wav|And it is worth mention in passing that, as an example of fine typography,|And it is worth mention in passing that, as an example of fine typography,|ljspeech-2
wavs/LJ001-0007.wav|the earliest book printed with movable types, the Gutenberg, or "forty-two line Bible" of about 1455,|the earliest book printed with movable types, the Gutenberg, or "forty-two line Bible" of about fourteen fifty-five,|ljspeech-3
wavs/LJ001-0008.wav|has never been surpassed.|has never been surpassed.|ljspeech-3
Can't render this file because it contains an unexpected character in line 8 and column 85.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@ -21,15 +21,30 @@ os.makedirs(OUTPATH, exist_ok=True)
c = BaseTTSConfig(text_cleaner="english_cleaners", num_loader_workers=0, batch_size=2, use_noise_augment=False)
c.r = 5
c.data_path = os.path.join(get_tests_data_path(), "ljspeech/")
ok_ljspeech = os.path.exists(c.data_path)
dataset_config = BaseDatasetConfig(
formatter="ljspeech_test", # ljspeech_test to multi-speaker
meta_file_train="metadata.csv",
# Dataset config that reads the "metadata_wav.csv" metadata file located under c.data_path.
dataset_config_wav = BaseDatasetConfig(
formatter="coqui", # NOTE(review): previous comment mentioned "ljspeech_test", but the formatter here is "coqui"
meta_file_train="metadata_wav.csv",
meta_file_val=None,
path=c.data_path,
language="en",
)
# Dataset config that reads the "metadata_mp3.csv" metadata file located under c.data_path.
dataset_config_mp3 = BaseDatasetConfig(
formatter="coqui", # NOTE(review): previous comment mentioned "ljspeech_test", but the formatter here is "coqui"
meta_file_train="metadata_mp3.csv",
meta_file_val=None,
path=c.data_path,
language="en",
)
# Dataset config that reads the "metadata_flac.csv" metadata file located under c.data_path.
dataset_config_flac = BaseDatasetConfig(
formatter="coqui", # NOTE(review): previous comment mentioned "ljspeech_test", but the formatter here is "coqui"
meta_file_train="metadata_flac.csv",
meta_file_val=None,
path=c.data_path,
language="en",
)
dataset_configs = [dataset_config_wav, dataset_config_mp3, dataset_config_flac]
DATA_EXIST = True
if not os.path.exists(c.data_path):
@ -44,11 +59,10 @@ class TestTTSDataset(unittest.TestCase):
self.max_loader_iter = 4
self.ap = AudioProcessor(**c.audio)
def _create_dataloader(self, batch_size, r, bgs, start_by_longest=False):
def _create_dataloader(self, batch_size, r, bgs, dataset_config, start_by_longest=False, preprocess_samples=False):
# load dataset
meta_data_train, meta_data_eval = load_tts_samples(dataset_config, eval_split=True, eval_split_size=0.2)
items = meta_data_train + meta_data_eval
tokenizer, _ = TTSTokenizer.init_from_config(c)
dataset = TTSDataset(
outputs_per_step=r,
@ -64,6 +78,11 @@ class TestTTSDataset(unittest.TestCase):
max_audio_len=c.max_audio_len,
start_by_longest=start_by_longest,
)
# add preprocess to force the length computation
if preprocess_samples:
dataset.preprocess_samples()
dataloader = DataLoader(
dataset,
batch_size=batch_size,
@ -75,9 +94,8 @@ class TestTTSDataset(unittest.TestCase):
return dataloader, dataset
def test_loader(self):
if ok_ljspeech:
dataloader, dataset = self._create_dataloader(1, 1, 0)
for dataset_config in dataset_configs:
dataloader, _ = self._create_dataloader(1, 1, 0, dataset_config, preprocess_samples=True)
for i, data in enumerate(dataloader):
if i == self.max_loader_iter:
break
@ -104,8 +122,6 @@ class TestTTSDataset(unittest.TestCase):
# make sure that the computed mels and the waveform match and correctly computed
mel_new = self.ap.melspectrogram(wavs[0].squeeze().numpy())
# remove padding in mel-spectrogram
mel_dataloader = mel_input[0].T.numpy()[:, : mel_lengths[0]]
# guarantee that both mel-spectrograms have the same size and that we will remove waveform padding
mel_new = mel_new[:, : mel_lengths[0]]
ignore_seg = -(1 + c.audio.win_length // c.audio.hop_length)
@ -124,8 +140,7 @@ class TestTTSDataset(unittest.TestCase):
self.assertGreaterEqual(mel_input.min(), 0)
def test_batch_group_shuffle(self):
if ok_ljspeech:
dataloader, dataset = self._create_dataloader(2, c.r, 16)
dataloader, dataset = self._create_dataloader(2, c.r, 16, dataset_config_wav)
last_length = 0
frames = dataset.samples
for i, data in enumerate(dataloader):
@ -147,8 +162,7 @@ class TestTTSDataset(unittest.TestCase):
The first item of the first batch must be longer than all the other items.
"""
if ok_ljspeech:
dataloader, _ = self._create_dataloader(2, c.r, 0, True)
dataloader, _ = self._create_dataloader(2, c.r, 0, dataset_config_wav, start_by_longest=True)
dataloader.dataset.preprocess_samples()
for i, data in enumerate(dataloader):
if i == self.max_loader_iter:
@ -172,8 +186,7 @@ class TestTTSDataset(unittest.TestCase):
self.assertEqual(mel_lengths[idx], linear_input[idx].shape[0])
self.assertEqual(mel_lengths[idx], mel_input[idx].shape[0])
if ok_ljspeech:
dataloader, _ = self._create_dataloader(1, 1, 0)
dataloader, _ = self._create_dataloader(1, 1, 0, dataset_config_wav)
for i, data in enumerate(dataloader):
if i == self.max_loader_iter:
@ -210,7 +223,7 @@ class TestTTSDataset(unittest.TestCase):
check_conditions(0, linear_input, mel_input, stop_target, mel_lengths)
# Test for batch size 2
dataloader, _ = self._create_dataloader(2, 1, 0)
dataloader, _ = self._create_dataloader(2, 1, 0, dataset_config_wav)
for i, data in enumerate(dataloader):
if i == self.max_loader_iter: