From 2db67e3356a9514d8bb1f2372ac9121f2bf1c50c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eren=20G=C3=B6lge?=
Date: Mon, 14 Feb 2022 10:49:25 +0000
Subject: [PATCH] Update dataset formatting docs

---
 docs/source/formatting_your_dataset.md | 57 +++++++++++++++++++++++---
 docs/source/index.md                   |  1 -
 2 files changed, 51 insertions(+), 7 deletions(-)

diff --git a/docs/source/formatting_your_dataset.md b/docs/source/formatting_your_dataset.md
index 3db38af0..5b1d9801 100644
--- a/docs/source/formatting_your_dataset.md
+++ b/docs/source/formatting_your_dataset.md
@@ -58,23 +58,68 @@ If you use a different dataset format than the LJSpeech or the other public data
 
 If your dataset is in a new language or it needs special normalization steps, then you need a new `text_cleaner`.
 
-What you get out of a `formatter` is a `List[List[]]` in the following format.
+What you get out of a `formatter` is a `List[Dict]` in the following format.
 
 ```
 >>> formatter(metafile_path)
-[["audio1.wav", "This is my sentence.", "MyDataset"],
-["audio1.wav", "This is maybe a sentence.", "MyDataset"],
-...
+[
+    {"audio_file": "audio1.wav", "text": "This is my sentence.", "speaker_name": "MyDataset", "language": "lang_code"},
+    {"audio_file": "audio2.wav", "text": "This is maybe a sentence.", "speaker_name": "MyDataset", "language": "lang_code"},
+    ...
 ]
 ```
 
-Each sub-list is parsed as ```["<filename>", "<transcription>", "<speaker_name>"]```.
+Each dictionary is parsed as ```{"audio_file": "<filename>", "text": "<transcription>", "speaker_name": "<speaker_name>"}```.
 
 ```<speaker_name>``` is the dataset name for single-speaker datasets, and it is mainly used in multi-speaker models to map each sample to its speaker. For now, we focus only on single-speaker datasets.
 
-The purpose of a `formatter` is to parse your metafile and load the audio file paths and transcriptions. Then, its output passes to a `Dataset` object. It computes features from the audio signals, calls text normalization routines, and converts raw text to
+The purpose of a `formatter` is to parse your manifest file and load the audio file paths and transcriptions.
+Then, the output is passed to the `Dataset` object, which computes features from the audio signals, calls text normalization routines, and converts raw text to
 phonemes if needed.
+
+## Loading your dataset
+
+Load one of the datasets supported by 🐸TTS.
+
+```python
+from TTS.tts.configs.shared_configs import BaseDatasetConfig
+from TTS.tts.datasets import load_tts_samples
+
+
+# dataset config for one of the pre-defined datasets
+dataset_config = BaseDatasetConfig(
+    name="vctk", meta_file_train="", language="en-us", path="dataset-path"
+)
+
+# load training and evaluation samples
+train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True)
+```
+
+Load a custom dataset with a custom formatter.
+
+```python
+import os
+
+from TTS.tts.configs.shared_configs import BaseDatasetConfig
+from TTS.tts.datasets import load_tts_samples
+
+
+# custom formatter implementation
+def formatter(root_path, manifest_file, **kwargs):  # pylint: disable=unused-argument
+    """Assumes each line as ```<filename>|<transcription>```
+    """
+    txt_file = os.path.join(root_path, manifest_file)
+    items = []
+    speaker_name = "my_speaker"
+    with open(txt_file, "r", encoding="utf-8") as ttf:
+        for line in ttf:
+            cols = line.split("|")
+            wav_file = os.path.join(root_path, "wavs", cols[0])
+            text = cols[1]
+            items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_name})
+    return items
+
+
+# config pointing at the custom dataset (the path and meta file names here are placeholders)
+dataset_config = BaseDatasetConfig(meta_file_train="manifest.txt", path="dataset-path")
+
+# load training and evaluation samples with the custom formatter
+train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True, formatter=formatter)
+```
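+
+Since the formatter is a plain function, you can sanity-check it before training. Below is a minimal sketch, assuming a hypothetical `dataset-path/manifest.txt` whose lines look like ```audio1.wav|This is my sentence.```:
+
+```python
+# run the custom formatter directly and inspect its output
+items = formatter("dataset-path", "manifest.txt")
+print(len(items))  # number of parsed samples
+print(items[0])    # a dict with "text", "audio_file" and "speaker_name" keys
+```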
 
 See `TTS.tts.datasets.TTSDataset`, a generic `Dataset` implementation for the `tts` models.
 
 See `TTS.vocoder.datasets.*` for different `Dataset` implementations for the `vocoder` models.
diff --git a/docs/source/index.md b/docs/source/index.md
index 756cea8e..9dc5bfce 100644
--- a/docs/source/index.md
+++ b/docs/source/index.md
@@ -27,7 +27,6 @@
     formatting_your_dataset
     what_makes_a_good_dataset
     tts_datasets
-    converting_torch_to_tf
 
 .. toctree::
     :maxdepth: 2