mirror of https://github.com/coqui-ai/TTS.git
Add additional datasets
This commit is contained in:
parent bce143c738
commit 103c010eca
TTS/utils/download.py
@@ -7,6 +7,7 @@ import tarfile
 import urllib
 import urllib.request
 import zipfile
+from os.path import expanduser
 from typing import Any, Iterable, List, Optional
 
 from torch.utils.model_zoo import tqdm
@@ -183,3 +184,24 @@ def extract_archive(from_path: str, to_path: Optional[str] = None, overwrite: bo
         pass
 
     raise NotImplementedError(" > [!] only supports tar.gz, tgz, and zip achives.")
+
+
+def download_kaggle_dataset(dataset_path: str, dataset_name: str, output_path: str):
+    """Download dataset from kaggle.
+    Args:
+        dataset_path (str):
+            The kaggle link to the dataset. For example, vctk is 'mfekadu/english-multispeaker-corpus-for-voice-cloning'.
+        dataset_name (str): Name of the folder the dataset will be saved in.
+        output_path (str): Path of the location you want the dataset folder to be saved to.
+    """
+    data_path = os.path.join(output_path, dataset_name)
+    try:
+        import kaggle  # pylint: disable=import-outside-toplevel
+
+        kaggle.api.authenticate()
+        print(f"""\nDownloading {dataset_name}...""")
+        kaggle.api.dataset_download_files(dataset_path, path=data_path, unzip=True)
+    except OSError:
+        print(
+            f"""[!] in order to download kaggle datasets, you need to have a kaggle api token stored in your {os.path.join(expanduser('~'), '.kaggle/kaggle.json')}"""
+        )
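
As a usage sketch (not part of the commit), the new helper could be called like this once a Kaggle API token is in place; the output directory is purely illustrative:

# Download the VCTK corpus from Kaggle into ./datasets/VCTK,
# assuming ~/.kaggle/kaggle.json holds a valid API token.
from TTS.utils.download import download_kaggle_dataset

download_kaggle_dataset(
    dataset_path="mfekadu/english-multispeaker-corpus-for-voice-cloning",
    dataset_name="VCTK",
    output_path="./datasets",  # illustrative target directory
)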

TTS/utils/downloaders.py
@@ -1,6 +1,7 @@
 import os
+from typing import Optional
 
-from TTS.utils.download import download_url, extract_archive
+from TTS.utils.download import download_kaggle_dataset, download_url, extract_archive
 
 
 def download_ljspeech(path: str):
@@ -18,14 +19,106 @@ def download_ljspeech(path: str):
     extract_archive(archive)
 
 
-def download_vctk(path: str):
+def download_vctk(path: str, use_kaggle: Optional[bool] = False):
-    """Download and extract VCTK dataset
+    """Download and extract VCTK dataset.
 
     Args:
         path (str): path to the directory where the dataset will be stored.
+
+        use_kaggle (bool, optional): Downloads vctk dataset from kaggle. Is generally faster. Defaults to False.
+    """
+    if use_kaggle:
+        download_kaggle_dataset("mfekadu/english-multispeaker-corpus-for-voice-cloning", "VCTK", path)
+    else:
+        os.makedirs(path, exist_ok=True)
+        url = "https://datashare.ed.ac.uk/bitstream/handle/10283/3443/VCTK-Corpus-0.92.zip"
+        download_url(url, path)
+        basename = os.path.basename(url)
+        archive = os.path.join(path, basename)
+        print(" > Extracting archive file...")
+        extract_archive(archive)
+
+
+def download_tweb(path: str):
+    """Download and extract Tweb dataset.
+
+    Args:
+        path (str): Path to the directory where the dataset will be stored.
+    """
+    download_kaggle_dataset("bryanpark/the-world-english-bible-speech-dataset", "TWEB", path)
+
+
+def download_libri_tts(path: str, subset: Optional[str] = "all"):
+    """Download and extract libri tts dataset.
+
+    Args:
+        path (str): Path to the directory where the dataset will be stored.
+
+        subset (str, optional): Name of the subset to download. If you only want to download a certain
+            portion, specify it here. Defaults to 'all'.
+    """
+
+    subset_dict = {
+        "libri-tts-clean-100": "http://www.openslr.org/resources/60/train-clean-100.tar.gz",
+        "libri-tts-clean-360": "http://www.openslr.org/resources/60/train-clean-360.tar.gz",
+        "libri-tts-other-500": "http://www.openslr.org/resources/60/train-other-500.tar.gz",
+        "libri-tts-dev-clean": "http://www.openslr.org/resources/60/dev-clean.tar.gz",
+        "libri-tts-dev-other": "http://www.openslr.org/resources/60/dev-other.tar.gz",
+        "libri-tts-test-clean": "http://www.openslr.org/resources/60/test-clean.tar.gz",
+        "libri-tts-test-other": "http://www.openslr.org/resources/60/test-other.tar.gz",
+    }
+
+    os.makedirs(path, exist_ok=True)
+    if subset == "all":
+        for sub, val in subset_dict.items():
+            print(f" > Downloading {sub}...")
+            download_url(val, path)
+            basename = os.path.basename(val)
+            archive = os.path.join(path, basename)
+            print(" > Extracting archive file...")
+            extract_archive(archive)
+        print(" > All subsets downloaded")
+    else:
+        url = subset_dict[subset]
+        download_url(url, path)
+        basename = os.path.basename(url)
+        archive = os.path.join(path, basename)
+        print(" > Extracting archive file...")
+        extract_archive(archive)
+
+
+def download_thorsten_de(path: str):
+    """Download and extract Thorsten german male voice dataset.
+
+    Args:
+        path (str): Path to the directory where the dataset will be stored.
     """
     os.makedirs(path, exist_ok=True)
-    url = "https://datashare.ed.ac.uk/bitstream/handle/10283/3443/VCTK-Corpus-0.92.zip"
+    url = "https://www.openslr.org/resources/95/thorsten-de_v02.tgz"
+    download_url(url, path)
+    basename = os.path.basename(url)
+    archive = os.path.join(path, basename)
+    print(" > Extracting archive file...")
+    extract_archive(archive)
+
+
+def download_mailabs(path: str, language: str = "english"):
+    """Download and extract Mailabs dataset.
+
+    Args:
+        path (str): Path to the directory where the dataset will be stored.
+
+        language (str): Language subset to download. Defaults to english.
+    """
+    language_dict = {
+        "english": "https://data.solak.de/data/Training/stt_tts/en_US.tgz",
+        "german": "https://data.solak.de/data/Training/stt_tts/de_DE.tgz",
+        "french": "https://data.solak.de/data/Training/stt_tts/fr_FR.tgz",
+        "italian": "https://data.solak.de/data/Training/stt_tts/it_IT.tgz",
+        "spanish": "https://data.solak.de/data/Training/stt_tts/es_ES.tgz",
+    }
+    os.makedirs(path, exist_ok=True)
+    url = language_dict[language]
     download_url(url, path)
     basename = os.path.basename(url)
     archive = os.path.join(path, basename)
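
A closing usage sketch (not part of the commit), assuming the TTS/utils/downloaders.py module path shown above; the target directories are purely illustrative:

# Fetch three corpora with the new helpers; paths are illustrative.
from TTS.utils.downloaders import download_libri_tts, download_mailabs, download_vctk

download_vctk("./datasets/VCTK", use_kaggle=True)  # Kaggle route; needs an API token
download_libri_tts("./datasets/LibriTTS", subset="libri-tts-clean-100")
download_mailabs("./datasets/mailabs", language="german")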