74 changed files with 109 additions and 158 deletions
--- a/README.md
+++ b/README.md
@@ -7,6 +7,9 @@
 - 📣 [🐶Bark](https://github.com/suno-ai/bark) is now available for inference with unconstrained voice cloning. [Docs](https://tts.readthedocs.io/en/dev/models/bark.html)
 - 📣 You can use [~1100 Fairseq models](https://github.com/facebookresearch/fairseq/tree/main/examples/mms) with 🐸TTS.
 - 📣 🐸TTS now supports 🐢Tortoise with faster inference. [Docs](https://tts.readthedocs.io/en/dev/models/tortoise.html)
+- 📣 Voice generation with prompts - **Prompt to Voice** - is live on [**Coqui Studio**](https://app.coqui.ai/auth/signin)!! - [Blog Post](https://coqui.ai/blog/tts/prompt-to-voice)
+- 📣 Voice generation with fusion - **Voice fusion** - is live on [**Coqui Studio**](https://app.coqui.ai/auth/signin).
+- 📣 Voice cloning is live on [**Coqui Studio**](https://app.coqui.ai/auth/signin).

 <div align="center">
 <img src="https://static.scarf.sh/a.png?x-pxid=cf317fe7-2188-4721-bc01-124bb5d5dbb2" />
--- a/TTS/bin/train_encoder.py
+++ b/TTS/bin/train_encoder.py
@@ -125,7 +125,7 @@ def evaluation(model, criterion, data_loader, global_step):

 def train(model, optimizer, scheduler, criterion, data_loader, eval_data_loader, global_step):
    model.train()
-    best_loss = {"train_loss": None, "eval_loss": float("inf")}
+    best_loss = float("inf")
    avg_loader_time = 0
    end_time = time.time()
    for epoch in range(c.epochs):
@@ -248,7 +248,7 @@ def train(model, optimizer, scheduler, criterion, data_loader, eval_data_loader,
            )
            # save the best checkpoint
            best_loss = save_best_model(
-                {"train_loss": None, "eval_loss": eval_loss},
+                eval_loss,
                best_loss,
                c,
                model,
--- a/TTS/tts/datasets/dataset.py
+++ b/TTS/tts/datasets/dataset.py
@@ -13,8 +13,6 @@ from TTS.tts.utils.data import prepare_data, prepare_stop_target, prepare_tensor
 from TTS.utils.audio import AudioProcessor
 from TTS.utils.audio.numpy_transforms import compute_energy as calculate_energy

-import mutagen
-
 # to prevent too many open files error as suggested here
 # https://github.com/pytorch/pytorch/issues/11201#issuecomment-421146936
 torch.multiprocessing.set_sharing_strategy("file_system")
@@ -44,15 +42,6 @@ def string2filename(string):
    return filename


-def get_audio_size(audiopath):
-    extension = audiopath.rpartition(".")[-1].lower()
-    if extension not in {"mp3", "wav", "flac"}:
-        raise RuntimeError(f"The audio format {extension} is not supported, please convert the audio files to mp3, flac, or wav format!")
-
-    audio_info = mutagen.File(audiopath).info
-    return int(audio_info.length * audio_info.sample_rate)
-
-
 class TTSDataset(Dataset):
    def __init__(
        self,
@@ -187,7 +176,7 @@ class TTSDataset(Dataset):
        lens = []
        for item in self.samples:
            _, wav_file, *_ = _parse_sample(item)
-            audio_len = get_audio_size(wav_file)
+            audio_len = os.path.getsize(wav_file) / 16 * 8  # assuming 16bit audio
            lens.append(audio_len)
        return lens

@@ -306,7 +295,7 @@ class TTSDataset(Dataset):
    def _compute_lengths(samples):
        new_samples = []
        for item in samples:
-            audio_length = get_audio_size(item["audio_file"])
+            audio_length = os.path.getsize(item["audio_file"]) / 16 * 8  # assuming 16bit audio
            text_lenght = len(item["text"])
            item["audio_length"] = audio_length
            item["text_length"] = text_lenght
--- a/TTS/tts/models/xtts.py
+++ b/TTS/tts/models/xtts.py
@@ -756,13 +756,11 @@ class Xtts(BaseTTS):

        model_path = checkpoint_path or os.path.join(checkpoint_dir, "model.pth")
        vocab_path = vocab_path or os.path.join(checkpoint_dir, "vocab.json")
-
-        if speaker_file_path is None and checkpoint_dir is not None:
-            speaker_file_path = os.path.join(checkpoint_dir, "speakers_xtts.pth")
+        speaker_file_path = speaker_file_path or os.path.join(checkpoint_dir, "speakers_xtts.pth")

        self.language_manager = LanguageManager(config)
        self.speaker_manager = None
-        if speaker_file_path is not None and os.path.exists(speaker_file_path):
+        if os.path.exists(speaker_file_path):
            self.speaker_manager = SpeakerManager(speaker_file_path)

        if os.path.exists(vocab_path):
--- a/TTS/utils/generic_utils.py
+++ b/TTS/utils/generic_utils.py
@@ -36,7 +36,9 @@ def get_git_branch():
        current.replace("* ", "")
    except subprocess.CalledProcessError:
        current = "inside_docker"
-    except (FileNotFoundError, StopIteration) as e:
+    except FileNotFoundError:
+        current = "unknown"
+    except StopIteration:
        current = "unknown"
    return current

--- a/requirements.txt
+++ b/requirements.txt
@@ -17,7 +17,6 @@ pyyaml>=6.0
 fsspec>=2023.6.0 # <= 2023.9.1 makes aux tests fail
 aiohttp>=3.8.1
 packaging>=23.1
-mutagen==1.47.0
 # deps for examples
 flask>=2.0.1
 # deps for inference
@@ -28,7 +27,7 @@ pandas>=1.4,<2.0
 # deps for training
 matplotlib>=3.7.0
 # coqui stack
-trainer>=0.0.36
+trainer>=0.0.32
 # config management
 coqpit>=0.0.16
 # chinese g2p deps
--- a/tests/data/ljspeech/metadata_flac.csv
+++ b/tests/data/ljspeech/metadata_flac.csv
@@ -1,9 +0,0 @@
-audio_file|text|transcription|speaker_name
-wavs/LJ001-0001.flac|Printing, in the only sense with which we are at present concerned, differs from most if not from all the arts and crafts represented in the Exhibition|Printing, in the only sense with which we are at present concerned, differs from most if not from all the arts and crafts represented in the Exhibition|ljspeech-0
-wavs/LJ001-0002.flac|in being comparatively modern.|in being comparatively modern.|ljspeech-0
-wavs/LJ001-0003.flac|For although the Chinese took impressions from wood blocks engraved in relief for centuries before the woodcutters of the Netherlands, by a similar process|For although the Chinese took impressions from wood blocks engraved in relief for centuries before the woodcutters of the Netherlands, by a similar process|ljspeech-1
-wavs/LJ001-0004.flac|produced the block books, which were the immediate predecessors of the true printed book,|produced the block books, which were the immediate predecessors of the true printed book,|ljspeech-1
-wavs/LJ001-0005.flac|the invention of movable metal letters in the middle of the fifteenth century may justly be considered as the invention of the art of printing.|the invention of movable metal letters in the middle of the fifteenth century may justly be considered as the invention of the art of printing.|ljspeech-2
-wavs/LJ001-0006.flac|And it is worth mention in passing that, as an example of fine typography,|And it is worth mention in passing that, as an example of fine typography,|ljspeech-2
-wavs/LJ001-0007.flac|the earliest book printed with movable types, the Gutenberg, or "forty-two line Bible" of about 1455,|the earliest book printed with movable types, the Gutenberg, or "forty-two line Bible" of about fourteen fifty-five,|ljspeech-3
-wavs/LJ001-0008.flac|has never been surpassed.|has never been surpassed.|ljspeech-3
--- a/tests/data/ljspeech/metadata_mp3.csv
+++ b/tests/data/ljspeech/metadata_mp3.csv
@@ -1,9 +0,0 @@
-audio_file|text|transcription|speaker_name
-wavs/LJ001-0001.mp3|Printing, in the only sense with which we are at present concerned, differs from most if not from all the arts and crafts represented in the Exhibition|Printing, in the only sense with which we are at present concerned, differs from most if not from all the arts and crafts represented in the Exhibition|ljspeech-0
-wavs/LJ001-0002.mp3|in being comparatively modern.|in being comparatively modern.|ljspeech-0
-wavs/LJ001-0003.mp3|For although the Chinese took impressions from wood blocks engraved in relief for centuries before the woodcutters of the Netherlands, by a similar process|For although the Chinese took impressions from wood blocks engraved in relief for centuries before the woodcutters of the Netherlands, by a similar process|ljspeech-1
-wavs/LJ001-0004.mp3|produced the block books, which were the immediate predecessors of the true printed book,|produced the block books, which were the immediate predecessors of the true printed book,|ljspeech-1
-wavs/LJ001-0005.mp3|the invention of movable metal letters in the middle of the fifteenth century may justly be considered as the invention of the art of printing.|the invention of movable metal letters in the middle of the fifteenth century may justly be considered as the invention of the art of printing.|ljspeech-2
-wavs/LJ001-0006.mp3|And it is worth mention in passing that, as an example of fine typography,|And it is worth mention in passing that, as an example of fine typography,|ljspeech-2
-wavs/LJ001-0007.mp3|the earliest book printed with movable types, the Gutenberg, or "forty-two line Bible" of about 1455,|the earliest book printed with movable types, the Gutenberg, or "forty-two line Bible" of about fourteen fifty-five,|ljspeech-3
-wavs/LJ001-0008.mp3|has never been surpassed.|has never been surpassed.|ljspeech-3
--- a/tests/data/ljspeech/metadata_wav.csv
+++ b/tests/data/ljspeech/metadata_wav.csv
@@ -1,9 +0,0 @@
-audio_file|text|transcription|speaker_name
-wavs/LJ001-0001.wav|Printing, in the only sense with which we are at present concerned, differs from most if not from all the arts and crafts represented in the Exhibition|Printing, in the only sense with which we are at present concerned, differs from most if not from all the arts and crafts represented in the Exhibition|ljspeech-0
-wavs/LJ001-0002.wav|in being comparatively modern.|in being comparatively modern.|ljspeech-0
-wavs/LJ001-0003.wav|For although the Chinese took impressions from wood blocks engraved in relief for centuries before the woodcutters of the Netherlands, by a similar process|For although the Chinese took impressions from wood blocks engraved in relief for centuries before the woodcutters of the Netherlands, by a similar process|ljspeech-1
-wavs/LJ001-0004.wav|produced the block books, which were the immediate predecessors of the true printed book,|produced the block books, which were the immediate predecessors of the true printed book,|ljspeech-1
-wavs/LJ001-0005.wav|the invention of movable metal letters in the middle of the fifteenth century may justly be considered as the invention of the art of printing.|the invention of movable metal letters in the middle of the fifteenth century may justly be considered as the invention of the art of printing.|ljspeech-2
-wavs/LJ001-0006.wav|And it is worth mention in passing that, as an example of fine typography,|And it is worth mention in passing that, as an example of fine typography,|ljspeech-2
-wavs/LJ001-0007.wav|the earliest book printed with movable types, the Gutenberg, or "forty-two line Bible" of about 1455,|the earliest book printed with movable types, the Gutenberg, or "forty-two line Bible" of about fourteen fifty-five,|ljspeech-3
-wavs/LJ001-0008.wav|has never been surpassed.|has never been surpassed.|ljspeech-3
--- a/tests/data/ljspeech/wavs/LJ001-0001.flac
+++ b/tests/data/ljspeech/wavs/LJ001-0001.flac
--- a/tests/data/ljspeech/wavs/LJ001-0001.mp3
+++ b/tests/data/ljspeech/wavs/LJ001-0001.mp3
--- a/tests/data/ljspeech/wavs/LJ001-0002.flac
+++ b/tests/data/ljspeech/wavs/LJ001-0002.flac
--- a/tests/data/ljspeech/wavs/LJ001-0002.mp3
+++ b/tests/data/ljspeech/wavs/LJ001-0002.mp3
--- a/tests/data/ljspeech/wavs/LJ001-0003.flac
+++ b/tests/data/ljspeech/wavs/LJ001-0003.flac
--- a/tests/data/ljspeech/wavs/LJ001-0003.mp3
+++ b/tests/data/ljspeech/wavs/LJ001-0003.mp3
--- a/tests/data/ljspeech/wavs/LJ001-0004.flac
+++ b/tests/data/ljspeech/wavs/LJ001-0004.flac
--- a/tests/data/ljspeech/wavs/LJ001-0004.mp3
+++ b/tests/data/ljspeech/wavs/LJ001-0004.mp3
--- a/tests/data/ljspeech/wavs/LJ001-0005.flac
+++ b/tests/data/ljspeech/wavs/LJ001-0005.flac
--- a/tests/data/ljspeech/wavs/LJ001-0005.mp3
+++ b/tests/data/ljspeech/wavs/LJ001-0005.mp3
--- a/tests/data/ljspeech/wavs/LJ001-0006.flac
+++ b/tests/data/ljspeech/wavs/LJ001-0006.flac
--- a/tests/data/ljspeech/wavs/LJ001-0006.mp3
+++ b/tests/data/ljspeech/wavs/LJ001-0006.mp3
--- a/tests/data/ljspeech/wavs/LJ001-0007.flac
+++ b/tests/data/ljspeech/wavs/LJ001-0007.flac
--- a/tests/data/ljspeech/wavs/LJ001-0007.mp3
+++ b/tests/data/ljspeech/wavs/LJ001-0007.mp3
--- a/tests/data/ljspeech/wavs/LJ001-0008.flac
+++ b/tests/data/ljspeech/wavs/LJ001-0008.flac
--- a/tests/data/ljspeech/wavs/LJ001-0008.mp3
+++ b/tests/data/ljspeech/wavs/LJ001-0008.mp3
--- a/tests/data/ljspeech/wavs/LJ001-0009.flac
+++ b/tests/data/ljspeech/wavs/LJ001-0009.flac
--- a/tests/data/ljspeech/wavs/LJ001-0009.mp3
+++ b/tests/data/ljspeech/wavs/LJ001-0009.mp3
--- a/tests/data/ljspeech/wavs/LJ001-0010.flac
+++ b/tests/data/ljspeech/wavs/LJ001-0010.flac
--- a/tests/data/ljspeech/wavs/LJ001-0010.mp3
+++ b/tests/data/ljspeech/wavs/LJ001-0010.mp3
--- a/tests/data/ljspeech/wavs/LJ001-0011.flac
+++ b/tests/data/ljspeech/wavs/LJ001-0011.flac
--- a/tests/data/ljspeech/wavs/LJ001-0011.mp3
+++ b/tests/data/ljspeech/wavs/LJ001-0011.mp3
--- a/tests/data/ljspeech/wavs/LJ001-0012.flac
+++ b/tests/data/ljspeech/wavs/LJ001-0012.flac
--- a/tests/data/ljspeech/wavs/LJ001-0012.mp3
+++ b/tests/data/ljspeech/wavs/LJ001-0012.mp3
--- a/tests/data/ljspeech/wavs/LJ001-0013.flac
+++ b/tests/data/ljspeech/wavs/LJ001-0013.flac
--- a/tests/data/ljspeech/wavs/LJ001-0013.mp3
+++ b/tests/data/ljspeech/wavs/LJ001-0013.mp3
--- a/tests/data/ljspeech/wavs/LJ001-0014.flac
+++ b/tests/data/ljspeech/wavs/LJ001-0014.flac
--- a/tests/data/ljspeech/wavs/LJ001-0014.mp3
+++ b/tests/data/ljspeech/wavs/LJ001-0014.mp3
--- a/tests/data/ljspeech/wavs/LJ001-0015.flac
+++ b/tests/data/ljspeech/wavs/LJ001-0015.flac
--- a/tests/data/ljspeech/wavs/LJ001-0015.mp3
+++ b/tests/data/ljspeech/wavs/LJ001-0015.mp3
--- a/tests/data/ljspeech/wavs/LJ001-0016.flac
+++ b/tests/data/ljspeech/wavs/LJ001-0016.flac
--- a/tests/data/ljspeech/wavs/LJ001-0016.mp3
+++ b/tests/data/ljspeech/wavs/LJ001-0016.mp3
--- a/tests/data/ljspeech/wavs/LJ001-0017.flac
+++ b/tests/data/ljspeech/wavs/LJ001-0017.flac
--- a/tests/data/ljspeech/wavs/LJ001-0017.mp3
+++ b/tests/data/ljspeech/wavs/LJ001-0017.mp3
--- a/tests/data/ljspeech/wavs/LJ001-0018.flac
+++ b/tests/data/ljspeech/wavs/LJ001-0018.flac
--- a/tests/data/ljspeech/wavs/LJ001-0018.mp3
+++ b/tests/data/ljspeech/wavs/LJ001-0018.mp3
--- a/tests/data/ljspeech/wavs/LJ001-0019.flac
+++ b/tests/data/ljspeech/wavs/LJ001-0019.flac
--- a/tests/data/ljspeech/wavs/LJ001-0019.mp3
+++ b/tests/data/ljspeech/wavs/LJ001-0019.mp3
--- a/tests/data/ljspeech/wavs/LJ001-0020.flac
+++ b/tests/data/ljspeech/wavs/LJ001-0020.flac
--- a/tests/data/ljspeech/wavs/LJ001-0020.mp3
+++ b/tests/data/ljspeech/wavs/LJ001-0020.mp3
--- a/tests/data/ljspeech/wavs/LJ001-0021.flac
+++ b/tests/data/ljspeech/wavs/LJ001-0021.flac
--- a/tests/data/ljspeech/wavs/LJ001-0021.mp3
+++ b/tests/data/ljspeech/wavs/LJ001-0021.mp3
--- a/tests/data/ljspeech/wavs/LJ001-0022.flac
+++ b/tests/data/ljspeech/wavs/LJ001-0022.flac
--- a/tests/data/ljspeech/wavs/LJ001-0022.mp3
+++ b/tests/data/ljspeech/wavs/LJ001-0022.mp3
--- a/tests/data/ljspeech/wavs/LJ001-0023.flac
+++ b/tests/data/ljspeech/wavs/LJ001-0023.flac
--- a/tests/data/ljspeech/wavs/LJ001-0023.mp3
+++ b/tests/data/ljspeech/wavs/LJ001-0023.mp3
--- a/tests/data/ljspeech/wavs/LJ001-0024.flac
+++ b/tests/data/ljspeech/wavs/LJ001-0024.flac
--- a/tests/data/ljspeech/wavs/LJ001-0024.mp3
+++ b/tests/data/ljspeech/wavs/LJ001-0024.mp3
--- a/tests/data/ljspeech/wavs/LJ001-0025.flac
+++ b/tests/data/ljspeech/wavs/LJ001-0025.flac
--- a/tests/data/ljspeech/wavs/LJ001-0025.mp3
+++ b/tests/data/ljspeech/wavs/LJ001-0025.mp3
--- a/tests/data/ljspeech/wavs/LJ001-0026.flac
+++ b/tests/data/ljspeech/wavs/LJ001-0026.flac
--- a/tests/data/ljspeech/wavs/LJ001-0026.mp3
+++ b/tests/data/ljspeech/wavs/LJ001-0026.mp3
--- a/tests/data/ljspeech/wavs/LJ001-0027.flac
+++ b/tests/data/ljspeech/wavs/LJ001-0027.flac
--- a/tests/data/ljspeech/wavs/LJ001-0027.mp3
+++ b/tests/data/ljspeech/wavs/LJ001-0027.mp3
--- a/tests/data/ljspeech/wavs/LJ001-0028.flac
+++ b/tests/data/ljspeech/wavs/LJ001-0028.flac
--- a/tests/data/ljspeech/wavs/LJ001-0028.mp3
+++ b/tests/data/ljspeech/wavs/LJ001-0028.mp3
--- a/tests/data/ljspeech/wavs/LJ001-0029.flac
+++ b/tests/data/ljspeech/wavs/LJ001-0029.flac
--- a/tests/data/ljspeech/wavs/LJ001-0029.mp3
+++ b/tests/data/ljspeech/wavs/LJ001-0029.mp3
--- a/tests/data/ljspeech/wavs/LJ001-0030.flac
+++ b/tests/data/ljspeech/wavs/LJ001-0030.flac
--- a/tests/data/ljspeech/wavs/LJ001-0030.mp3
+++ b/tests/data/ljspeech/wavs/LJ001-0030.mp3
--- a/tests/data/ljspeech/wavs/LJ001-0031.flac
+++ b/tests/data/ljspeech/wavs/LJ001-0031.flac
--- a/tests/data/ljspeech/wavs/LJ001-0031.mp3
+++ b/tests/data/ljspeech/wavs/LJ001-0031.mp3
--- a/tests/data/ljspeech/wavs/LJ001-0032.flac
+++ b/tests/data/ljspeech/wavs/LJ001-0032.flac
--- a/tests/data/ljspeech/wavs/LJ001-0032.mp3
+++ b/tests/data/ljspeech/wavs/LJ001-0032.mp3
--- a/tests/data_tests/test_loader.py
+++ b/tests/data_tests/test_loader.py
@@ -21,30 +21,15 @@ os.makedirs(OUTPATH, exist_ok=True)
 c = BaseTTSConfig(text_cleaner="english_cleaners", num_loader_workers=0, batch_size=2, use_noise_augment=False)
 c.r = 5
 c.data_path = os.path.join(get_tests_data_path(), "ljspeech/")
+ok_ljspeech = os.path.exists(c.data_path)

-dataset_config_wav = BaseDatasetConfig(
-    formatter="coqui",  # ljspeech_test to multi-speaker
-    meta_file_train="metadata_wav.csv",
+dataset_config = BaseDatasetConfig(
+    formatter="ljspeech_test",  # ljspeech_test to multi-speaker
+    meta_file_train="metadata.csv",
    meta_file_val=None,
    path=c.data_path,
    language="en",
 )
-dataset_config_mp3 = BaseDatasetConfig(
-    formatter="coqui",  # ljspeech_test to multi-speaker
-    meta_file_train="metadata_mp3.csv",
-    meta_file_val=None,
-    path=c.data_path,
-    language="en",
-)
-dataset_config_flac = BaseDatasetConfig(
-    formatter="coqui",  # ljspeech_test to multi-speaker
-    meta_file_train="metadata_flac.csv",
-    meta_file_val=None,
-    path=c.data_path,
-    language="en",
-)
-
-dataset_configs = [dataset_config_wav, dataset_config_mp3, dataset_config_flac]

 DATA_EXIST = True
 if not os.path.exists(c.data_path):
@@ -59,10 +44,11 @@ class TestTTSDataset(unittest.TestCase):
        self.max_loader_iter = 4
        self.ap = AudioProcessor(**c.audio)

-    def _create_dataloader(self, batch_size, r, bgs, dataset_config, start_by_longest=False, preprocess_samples=False):
+    def _create_dataloader(self, batch_size, r, bgs, start_by_longest=False):
        # load dataset
        meta_data_train, meta_data_eval = load_tts_samples(dataset_config, eval_split=True, eval_split_size=0.2)
        items = meta_data_train + meta_data_eval
+
        tokenizer, _ = TTSTokenizer.init_from_config(c)
        dataset = TTSDataset(
            outputs_per_step=r,
@@ -78,11 +64,6 @@ class TestTTSDataset(unittest.TestCase):
            max_audio_len=c.max_audio_len,
            start_by_longest=start_by_longest,
        )
-
-        # add preprocess to force the length computation
-        if preprocess_samples:
-            dataset.preprocess_samples()
-
        dataloader = DataLoader(
            dataset,
            batch_size=batch_size,
@@ -94,8 +75,9 @@ class TestTTSDataset(unittest.TestCase):
        return dataloader, dataset

    def test_loader(self):
-        for dataset_config in dataset_configs:
-            dataloader, _ = self._create_dataloader(1, 1, 0, dataset_config, preprocess_samples=True)
+        if ok_ljspeech:
+            dataloader, dataset = self._create_dataloader(1, 1, 0)
+
            for i, data in enumerate(dataloader):
                if i == self.max_loader_iter:
                    break
@@ -122,6 +104,8 @@ class TestTTSDataset(unittest.TestCase):

                # make sure that the computed mels and the waveform match and correctly computed
                mel_new = self.ap.melspectrogram(wavs[0].squeeze().numpy())
+                # remove padding in mel-spectrogram
+                mel_dataloader = mel_input[0].T.numpy()[:, : mel_lengths[0]]
                # guarantee that both mel-spectrograms have the same size and that we will remove waveform padding
                mel_new = mel_new[:, : mel_lengths[0]]
                ignore_seg = -(1 + c.audio.win_length // c.audio.hop_length)
@@ -140,38 +124,40 @@ class TestTTSDataset(unittest.TestCase):
                    self.assertGreaterEqual(mel_input.min(), 0)

    def test_batch_group_shuffle(self):
-        dataloader, dataset = self._create_dataloader(2, c.r, 16, dataset_config_wav)
-        last_length = 0
-        frames = dataset.samples
-        for i, data in enumerate(dataloader):
-            if i == self.max_loader_iter:
-                break
-            mel_lengths = data["mel_lengths"]
-            avg_length = mel_lengths.numpy().mean()
-        dataloader.dataset.preprocess_samples()
-        is_items_reordered = False
-        for idx, item in enumerate(dataloader.dataset.samples):
-            if item != frames[idx]:
-                is_items_reordered = True
-                break
-        self.assertGreaterEqual(avg_length, last_length)
-        self.assertTrue(is_items_reordered)
+        if ok_ljspeech:
+            dataloader, dataset = self._create_dataloader(2, c.r, 16)
+            last_length = 0
+            frames = dataset.samples
+            for i, data in enumerate(dataloader):
+                if i == self.max_loader_iter:
+                    break
+                mel_lengths = data["mel_lengths"]
+                avg_length = mel_lengths.numpy().mean()
+            dataloader.dataset.preprocess_samples()
+            is_items_reordered = False
+            for idx, item in enumerate(dataloader.dataset.samples):
+                if item != frames[idx]:
+                    is_items_reordered = True
+                    break
+            self.assertGreaterEqual(avg_length, last_length)
+            self.assertTrue(is_items_reordered)

    def test_start_by_longest(self):
        """Test start_by_longest option.

        The first item of the first batch must be at least as long as all the other items.
        """
-        dataloader, _ = self._create_dataloader(2, c.r, 0, dataset_config_wav, start_by_longest=True)
-        dataloader.dataset.preprocess_samples()
-        for i, data in enumerate(dataloader):
-            if i == self.max_loader_iter:
-                break
-            mel_lengths = data["mel_lengths"]
-            if i == 0:
-                max_len = mel_lengths[0]
-            print(mel_lengths)
-            self.assertTrue(all(max_len >= mel_lengths))
+        if ok_ljspeech:
+            dataloader, _ = self._create_dataloader(2, c.r, 0, True)
+            dataloader.dataset.preprocess_samples()
+            for i, data in enumerate(dataloader):
+                if i == self.max_loader_iter:
+                    break
+                mel_lengths = data["mel_lengths"]
+                if i == 0:
+                    max_len = mel_lengths[0]
+                print(mel_lengths)
+                self.assertTrue(all(max_len >= mel_lengths))

    def test_padding_and_spectrograms(self):
        def check_conditions(idx, linear_input, mel_input, stop_target, mel_lengths):
@@ -186,70 +172,71 @@ class TestTTSDataset(unittest.TestCase):
            self.assertEqual(mel_lengths[idx], linear_input[idx].shape[0])
            self.assertEqual(mel_lengths[idx], mel_input[idx].shape[0])

-        dataloader, _ = self._create_dataloader(1, 1, 0, dataset_config_wav)
+        if ok_ljspeech:
+            dataloader, _ = self._create_dataloader(1, 1, 0)

-        for i, data in enumerate(dataloader):
-            if i == self.max_loader_iter:
-                break
-            linear_input = data["linear"]
-            mel_input = data["mel"]
-            mel_lengths = data["mel_lengths"]
-            stop_target = data["stop_targets"]
-            item_idx = data["item_idxs"]
+            for i, data in enumerate(dataloader):
+                if i == self.max_loader_iter:
+                    break
+                linear_input = data["linear"]
+                mel_input = data["mel"]
+                mel_lengths = data["mel_lengths"]
+                stop_target = data["stop_targets"]
+                item_idx = data["item_idxs"]

-            # check mel_spec consistency
-            wav = np.asarray(self.ap.load_wav(item_idx[0]), dtype=np.float32)
-            mel = self.ap.melspectrogram(wav).astype("float32")
-            mel = torch.FloatTensor(mel).contiguous()
-            mel_dl = mel_input[0]
-            # NOTE: Below needs to check == 0 but due to an unknown reason
-            # there is a slight difference between two matrices.
-            # TODO: Check this assert cond more in detail.
-            self.assertLess(abs(mel.T - mel_dl).max(), 1e-5)
+                # check mel_spec consistency
+                wav = np.asarray(self.ap.load_wav(item_idx[0]), dtype=np.float32)
+                mel = self.ap.melspectrogram(wav).astype("float32")
+                mel = torch.FloatTensor(mel).contiguous()
+                mel_dl = mel_input[0]
+                # NOTE: Below needs to check == 0 but due to an unknown reason
+                # there is a slight difference between two matrices.
+                # TODO: Check this assert cond more in detail.
+                self.assertLess(abs(mel.T - mel_dl).max(), 1e-5)

-            # check mel-spec correctness
-            mel_spec = mel_input[0].cpu().numpy()
-            wav = self.ap.inv_melspectrogram(mel_spec.T)
-            self.ap.save_wav(wav, OUTPATH + "/mel_inv_dataloader.wav")
-            shutil.copy(item_idx[0], OUTPATH + "/mel_target_dataloader.wav")
+                # check mel-spec correctness
+                mel_spec = mel_input[0].cpu().numpy()
+                wav = self.ap.inv_melspectrogram(mel_spec.T)
+                self.ap.save_wav(wav, OUTPATH + "/mel_inv_dataloader.wav")
+                shutil.copy(item_idx[0], OUTPATH + "/mel_target_dataloader.wav")

-            # check linear-spec
-            linear_spec = linear_input[0].cpu().numpy()
-            wav = self.ap.inv_spectrogram(linear_spec.T)
-            self.ap.save_wav(wav, OUTPATH + "/linear_inv_dataloader.wav")
-            shutil.copy(item_idx[0], OUTPATH + "/linear_target_dataloader.wav")
+                # check linear-spec
+                linear_spec = linear_input[0].cpu().numpy()
+                wav = self.ap.inv_spectrogram(linear_spec.T)
+                self.ap.save_wav(wav, OUTPATH + "/linear_inv_dataloader.wav")
+                shutil.copy(item_idx[0], OUTPATH + "/linear_target_dataloader.wav")

-            # check the outputs
-            check_conditions(0, linear_input, mel_input, stop_target, mel_lengths)
+                # check the outputs
+                check_conditions(0, linear_input, mel_input, stop_target, mel_lengths)

-        # Test for batch size 2
-        dataloader, _ = self._create_dataloader(2, 1, 0, dataset_config_wav)
+            # Test for batch size 2
+            dataloader, _ = self._create_dataloader(2, 1, 0)

-        for i, data in enumerate(dataloader):
-            if i == self.max_loader_iter:
-                break
-            linear_input = data["linear"]
-            mel_input = data["mel"]
-            mel_lengths = data["mel_lengths"]
-            stop_target = data["stop_targets"]
-            item_idx = data["item_idxs"]
+            for i, data in enumerate(dataloader):
+                if i == self.max_loader_iter:
+                    break
+                linear_input = data["linear"]
+                mel_input = data["mel"]
+                mel_lengths = data["mel_lengths"]
+                stop_target = data["stop_targets"]
+                item_idx = data["item_idxs"]

-            # set id to the longest sequence in the batch
-            if mel_lengths[0] > mel_lengths[1]:
-                idx = 0
-            else:
-                idx = 1
+                # set id to the longest sequence in the batch
+                if mel_lengths[0] > mel_lengths[1]:
+                    idx = 0
+                else:
+                    idx = 1

-            # check the longer item in the batch
-            check_conditions(idx, linear_input, mel_input, stop_target, mel_lengths)
+                # check the longer item in the batch
+                check_conditions(idx, linear_input, mel_input, stop_target, mel_lengths)

-            # check the other item in the batch
-            self.assertEqual(linear_input[1 - idx, -1].sum(), 0)
-            self.assertEqual(mel_input[1 - idx, -1].sum(), 0)
-            self.assertEqual(stop_target[1, mel_lengths[1] - 1], 1)
-            self.assertEqual(stop_target[1, mel_lengths[1] :].sum(), stop_target.shape[1] - mel_lengths[1])
-            self.assertEqual(len(mel_lengths.shape), 1)
+                # check the other item in the batch
+                self.assertEqual(linear_input[1 - idx, -1].sum(), 0)
+                self.assertEqual(mel_input[1 - idx, -1].sum(), 0)
+                self.assertEqual(stop_target[1, mel_lengths[1] - 1], 1)
+                self.assertEqual(stop_target[1, mel_lengths[1] :].sum(), stop_target.shape[1] - mel_lengths[1])
+                self.assertEqual(len(mel_lengths.shape), 1)

-            # check batch zero-frame conditions (zero-frame disabled)
-            # assert (linear_input * stop_target.unsqueeze(2)).sum() == 0
-            # assert (mel_input * stop_target.unsqueeze(2)).sum() == 0
+                # check batch zero-frame conditions (zero-frame disabled)
+                # assert (linear_input * stop_target.unsqueeze(2)).sum() == 0
+                # assert (mel_input * stop_target.unsqueeze(2)).sum() == 0