mirror of https://github.com/coqui-ai/TTS.git
Update dataset formatting docs
This commit is contained in:
parent 58c38de58d
commit 2db67e3356
@ -58,23 +58,68 @@ If you use a different dataset format then the LJSpeech or the other public data

If your dataset is in a new language or it needs special normalization steps, then you need a new `text_cleaner`.
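
The snippet below is only a rough sketch, not 🐸TTS's own API: a cleaner is a plain function that takes the raw transcription and returns normalized text, in the spirit of the functions in `TTS.tts.utils.text.cleaners`. The abbreviation table and regexes here are made-up placeholders for whatever your language actually needs.

```python
import re

# Made-up abbreviation table for illustration; extend it for your language.
_ABBREVIATIONS = [
    (re.compile(r"\bdr\.", re.IGNORECASE), "doctor"),
    (re.compile(r"\bmrs\.", re.IGNORECASE), "misses"),
]


def my_language_cleaners(text):
    """Minimal sketch of a custom cleaner: lowercase, expand abbreviations, collapse whitespace."""
    text = text.lower()
    for pattern, replacement in _ABBREVIATIONS:
        text = pattern.sub(replacement, text)
    text = re.sub(r"\s+", " ", text).strip()
    return text
```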

What you get out of a `formatter` is a `List[Dict]` in the following format.
```
>>> formatter(metafile_path)
[
    {"audio_file":"audio1.wav", "text":"This is my sentence.", "speaker_name":"MyDataset", "language": "lang_code"},
    {"audio_file":"audio1.wav", "text":"This is maybe a sentence.", "speaker_name":"MyDataset", "language": "lang_code"},
    ...
]
```

Each item is parsed as ```{"audio_file": "<filename>", "text": "<transcription>", "speaker_name": "<speaker_name>", "language": "<lang_code>"}```.
```<speaker_name>``` is the dataset name for single-speaker datasets and it is mainly used
in multi-speaker models to map the speaker of each sample. But for now, we only focus on single-speaker datasets.

The purpose of a `formatter` is to parse your manifest file and load the audio file paths and transcriptions.
Then, the output is passed to the `Dataset`, which computes features from the audio signals, calls text normalization routines, and converts raw text to
phonemes if needed.

## Loading your dataset

Load one of the datasets supported by 🐸TTS.

```python
from TTS.tts.configs.shared_configs import BaseDatasetConfig
from TTS.tts.datasets import load_tts_samples

# dataset config for one of the pre-defined datasets
dataset_config = BaseDatasetConfig(
    name="vctk", meta_file_train="", language="en-us", path="dataset-path"
)

# load training samples
train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True)
```
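
As a quick sanity check (assuming the VCTK data actually lives at `dataset-path`), each returned sample should be a dictionary in the format shown earlier; a minimal inspection might look like this:

```python
# Hypothetical check: print the number of samples and peek at the first one.
# Keys follow the formatter output above ("audio_file", "text", "speaker_name").
print(f"{len(train_samples)} training / {len(eval_samples)} eval samples")
print(train_samples[0]["audio_file"], "->", train_samples[0]["text"])
```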

Load a custom dataset with a custom formatter.

```python
import os

from TTS.tts.datasets import load_tts_samples


# custom formatter implementation
def formatter(root_path, manifest_file, **kwargs):  # pylint: disable=unused-argument
    """Assumes each line as ```<filename>|<transcription>```
    """
    txt_file = os.path.join(root_path, manifest_file)
    items = []
    speaker_name = "my_speaker"
    with open(txt_file, "r", encoding="utf-8") as ttf:
        for line in ttf:
            cols = line.split("|")
            wav_file = os.path.join(root_path, "wavs", cols[0])
            text = cols[1].strip()
            items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_name})
    return items


# load training samples (dataset_config is the BaseDatasetConfig defined above)
train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True, formatter=formatter)
```
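
For reference, a manifest in the ```<filename>|<transcription>``` format this formatter assumes could look like the lines below; the file names and sentences are placeholders, not part of any real dataset.

```
audio1.wav|This is my sentence.
audio2.wav|This is maybe a sentence.
```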

See `TTS.tts.datasets.TTSDataset`, a generic `Dataset` implementation for the `tts` models.

See `TTS.vocoder.datasets.*` for different `Dataset` implementations for the `vocoder` models.

@ -27,7 +27,6 @@

formatting_your_dataset
what_makes_a_good_dataset
tts_datasets

.. toctree::
    :maxdepth: 2