Add additional datasets

This commit is contained in:
loganhart420 2021-12-16 07:21:27 -05:00
parent bce143c738
commit 103c010eca
2 changed files with 119 additions and 4 deletions

View File

@ -7,6 +7,7 @@ import tarfile
import urllib import urllib
import urllib.request import urllib.request
import zipfile import zipfile
from os.path import expanduser
from typing import Any, Iterable, List, Optional from typing import Any, Iterable, List, Optional
from torch.utils.model_zoo import tqdm from torch.utils.model_zoo import tqdm
@ -183,3 +184,24 @@ def extract_archive(from_path: str, to_path: Optional[str] = None, overwrite: bo
pass pass
raise NotImplementedError(" > [!] only supports tar.gz, tgz, and zip achives.") raise NotImplementedError(" > [!] only supports tar.gz, tgz, and zip achives.")
def download_kaggle_dataset(dataset_path: str, dataset_name: str, output_path: str):
    """Download a dataset hosted on Kaggle into ``output_path/dataset_name``.

    Args:
        dataset_path (str):
            This the kaggle link to the dataset. for example vctk is 'mfekadu/english-multispeaker-corpus-for-voice-cloning'
        dataset_name (str): Name of the folder the dataset will be saved in.
        output_path (str): Path of the location you want the dataset folder to be saved to.
    """
    data_path = os.path.join(output_path, dataset_name)
    try:
        # Imported lazily so the rest of the module keeps working when the
        # optional `kaggle` package is not installed.
        import kaggle  # pylint: disable=import-outside-toplevel

        kaggle.api.authenticate()
        print(f"""\nDownloading {dataset_name}...""")
        kaggle.api.dataset_download_files(dataset_path, path=data_path, unzip=True)
    except ImportError:
        # Fix: previously a missing `kaggle` package raised an uncaught
        # ImportError; now the user gets an actionable message instead.
        print("[!] the `kaggle` package is required to download kaggle datasets. Install it with `pip install kaggle`.")
    except OSError:
        # The kaggle client raises OSError when no API token is configured.
        print(
            f"""[!] in order to download kaggle datasets, you need to have a kaggle api token stored in your {os.path.join(expanduser('~'), '.kaggle/kaggle.json')}"""
        )

View File

@ -1,6 +1,7 @@
import os import os
from typing import Optional
from TTS.utils.download import download_url, extract_archive from TTS.utils.download import download_kaggle_dataset, download_url, extract_archive
def download_ljspeech(path: str): def download_ljspeech(path: str):
@ -18,12 +19,17 @@ def download_ljspeech(path: str):
extract_archive(archive) extract_archive(archive)
def download_vctk(path: str): def download_vctk(path: str, use_kaggle: Optional[bool] = False):
"""Download and extract VCTK dataset """Download and extract VCTK dataset.
Args: Args:
path (str): path to the directory where the dataset will be stored. path (str): path to the directory where the dataset will be stored.
use_kaggle (bool, optional): Downloads vctk dataset from kaggle. Is generally faster. Defaults to False.
""" """
if use_kaggle:
download_kaggle_dataset("mfekadu/english-multispeaker-corpus-for-voice-cloning", "VCTK", path)
else:
os.makedirs(path, exist_ok=True) os.makedirs(path, exist_ok=True)
url = "https://datashare.ed.ac.uk/bitstream/handle/10283/3443/VCTK-Corpus-0.92.zip" url = "https://datashare.ed.ac.uk/bitstream/handle/10283/3443/VCTK-Corpus-0.92.zip"
download_url(url, path) download_url(url, path)
@ -31,3 +37,90 @@ def download_vctk(path: str):
archive = os.path.join(path, basename) archive = os.path.join(path, basename)
print(" > Extracting archive file...") print(" > Extracting archive file...")
extract_archive(archive) extract_archive(archive)
def download_tweb(path: str):
    """Download and extract the World English Bible (TWEB) speech dataset.

    Args:
        path (str): Path to the directory where the dataset will be stored.
    """
    # TWEB is only distributed via Kaggle, so delegate to the kaggle helper.
    kaggle_slug = "bryanpark/the-world-english-bible-speech-dataset"
    download_kaggle_dataset(kaggle_slug, "TWEB", path)
def download_libri_tts(path: str, subset: Optional[str] = "all"):
    """Download and extract the LibriTTS dataset.

    Args:
        path (str): Path to the directory where the dataset will be stored.
        subset (str, optional): Name of the subset to download. If you only want to download a certain
            portion specify it here. Defaults to 'all'.

    Raises:
        KeyError: If `subset` is neither 'all' nor one of the known subset names.
    """
    subset_dict = {
        "libri-tts-clean-100": "http://www.openslr.org/resources/60/train-clean-100.tar.gz",
        "libri-tts-clean-360": "http://www.openslr.org/resources/60/train-clean-360.tar.gz",
        "libri-tts-other-500": "http://www.openslr.org/resources/60/train-other-500.tar.gz",
        "libri-tts-dev-clean": "http://www.openslr.org/resources/60/dev-clean.tar.gz",
        "libri-tts-dev-other": "http://www.openslr.org/resources/60/dev-other.tar.gz",
        "libri-tts-test-clean": "http://www.openslr.org/resources/60/test-clean.tar.gz",
        "libri-tts-test-other": "http://www.openslr.org/resources/60/test-other.tar.gz",
    }
    os.makedirs(path, exist_ok=True)

    def _fetch_and_extract(url: str):
        # Download one archive into `path` and unpack it — shared step that
        # was previously duplicated in both branches below.
        download_url(url, path)
        archive = os.path.join(path, os.path.basename(url))
        print(" > Extracting archive file...")
        extract_archive(archive)

    if subset == "all":
        for sub, url in subset_dict.items():
            print(f" > Downloading {sub}...")
            _fetch_and_extract(url)
        print(" > All subsets downloaded")
    else:
        # Unknown subset names raise KeyError here, as before.
        _fetch_and_extract(subset_dict[subset])
def download_thorsten_de(path: str):
    """Download and extract the Thorsten German male-voice dataset.

    Args:
        path (str): Path to the directory where the dataset will be stored.
    """
    archive_url = "https://www.openslr.org/resources/95/thorsten-de_v02.tgz"
    os.makedirs(path, exist_ok=True)
    download_url(archive_url, path)
    # The archive lands in `path` under its remote file name.
    archive_path = os.path.join(path, os.path.basename(archive_url))
    print(" > Extracting archive file...")
    extract_archive(archive_path)
def download_mailabs(path: str, language: str = "english"):
    """Download and extract the M-AILABS speech dataset for one language.

    Args:
        path (str): Path to the directory where the dataset will be stored.
        language (str): Language subset to download. Defaults to english.
    """
    urls_by_language = {
        "english": "https://data.solak.de/data/Training/stt_tts/en_US.tgz",
        "german": "https://data.solak.de/data/Training/stt_tts/de_DE.tgz",
        "french": "https://data.solak.de/data/Training/stt_tts/fr_FR.tgz",
        "italian": "https://data.solak.de/data/Training/stt_tts/it_IT.tgz",
        "spanish": "https://data.solak.de/data/Training/stt_tts/es_ES.tgz",
    }
    os.makedirs(path, exist_ok=True)
    # Unsupported languages raise KeyError here, same as the original lookup.
    archive_url = urls_by_language[language]
    download_url(archive_url, path)
    archive_file = os.path.join(path, os.path.basename(archive_url))
    print(" > Extracting archive file...")
    extract_archive(archive_file)