From 2db67e3356a9514d8bb1f2372ac9121f2bf1c50c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eren=20G=C3=B6lge?=
Date: Mon, 14 Feb 2022 10:49:25 +0000
Subject: [PATCH] Update dataset formatting docs

---
 docs/source/formatting_your_dataset.md | 57 +++++++++++++++++++++++---
 docs/source/index.md                   |  1 -
 2 files changed, 51 insertions(+), 7 deletions(-)

diff --git a/docs/source/formatting_your_dataset.md b/docs/source/formatting_your_dataset.md
index 3db38af0..5b1d9801 100644
--- a/docs/source/formatting_your_dataset.md
+++ b/docs/source/formatting_your_dataset.md
@@ -58,23 +58,68 @@ If you use a different dataset format than the LJSpeech or the other public data
 
 If your dataset is in a new language or it needs special normalization steps, then you need a new `text_cleaner`.
 
-What you get out of a `formatter` is a `List[List[]]` in the following format.
+What you get out of a `formatter` is a `List[Dict]` in the following format.
 
 ```
 >>> formatter(metafile_path)
-[["audio1.wav", "This is my sentence.", "MyDataset"],
-["audio1.wav", "This is maybe a sentence.", "MyDataset"],
-...
+[
+    {"audio_file": "audio1.wav", "text": "This is my sentence.", "speaker_name": "MyDataset", "language": "lang_code"},
+    {"audio_file": "audio2.wav", "text": "This is maybe a sentence.", "speaker_name": "MyDataset", "language": "lang_code"},
+    ...
 ]
 ```
 
-Each sub-list is parsed as ```["<filename>", "<transcription>", "<speaker_name>"]```.
+Each dictionary is parsed as ```{"audio_file": "<filename>", "text": "<transcription>", "speaker_name": "<speaker_name>"}```.
 
 ```<speaker_name>``` is the dataset name for single-speaker datasets, and it is mainly used in multi-speaker models to map each sample to its speaker. For now, we focus only on single-speaker datasets.
 
-The purpose of a `formatter` is to parse your metafile and load the audio file paths and transcriptions. Then, its output passes to a `Dataset` object. It computes features from the audio signals, calls text normalization routines, and converts raw text to
+The purpose of a `formatter` is to parse your manifest file and load the audio file paths and transcriptions.
+Then, the output is passed to the `Dataset` object, which computes features from the audio signals, calls text normalization routines, and converts raw text to
 phonemes if needed.
+
+## Loading your dataset
+
+Load one of the datasets supported by 🐸TTS.
+
+```python
+from TTS.tts.configs.shared_configs import BaseDatasetConfig
+from TTS.tts.datasets import load_tts_samples
+
+
+# dataset config for one of the pre-defined datasets
+dataset_config = BaseDatasetConfig(
+    name="vctk", meta_file_train="", language="en-us", path="dataset-path"
+)
+
+# load training and evaluation samples
+train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True)
+```
+
+Load a custom dataset with a custom formatter.
+
+```python
+import os
+
+from TTS.tts.configs.shared_configs import BaseDatasetConfig
+from TTS.tts.datasets import load_tts_samples
+
+
+# custom formatter implementation
+def formatter(root_path, manifest_file, **kwargs):  # pylint: disable=unused-argument
+    """Assumes each line as ```<filename>|<transcription>```
+    """
+    txt_file = os.path.join(root_path, manifest_file)
+    items = []
+    speaker_name = "my_speaker"
+    with open(txt_file, "r", encoding="utf-8") as ttf:
+        for line in ttf:
+            cols = line.split("|")
+            wav_file = os.path.join(root_path, "wavs", cols[0])
+            text = cols[1]
+            items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_name})
+    return items
+
+
+# config pointing at the custom dataset (the path and meta file names here are placeholders)
+dataset_config = BaseDatasetConfig(meta_file_train="manifest.txt", path="dataset-path")
+
+# load training and evaluation samples with the custom formatter
+train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True, formatter=formatter)
+```
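+
+Since the formatter is a plain function, you can sanity-check it before training. Below is a minimal sketch, assuming a hypothetical `dataset-path/manifest.txt` whose lines look like ```audio1.wav|This is my sentence.```:
+
+```python
+# run the custom formatter directly and inspect its output
+items = formatter("dataset-path", "manifest.txt")
+print(len(items))  # number of parsed samples
+print(items[0])    # a dict with "text", "audio_file" and "speaker_name" keys
+```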
 
 See `TTS.tts.datasets.TTSDataset`, a generic `Dataset` implementation for the `tts` models.
 
 See `TTS.vocoder.datasets.*` for different `Dataset` implementations for the `vocoder` models.
diff --git a/docs/source/index.md b/docs/source/index.md
index 756cea8e..9dc5bfce 100644
--- a/docs/source/index.md
+++ b/docs/source/index.md
@@ -27,7 +27,6 @@
     formatting_your_dataset
     what_makes_a_good_dataset
     tts_datasets
-    converting_torch_to_tf
 
 .. toctree::
     :maxdepth: 2