Implement tarring datasets

Eren Gölge 2021-10-01 16:29:45 +00:00
parent 42f77e7185
commit 2447f42ca1
4 changed files with 142 additions and 8 deletions

View File

@@ -0,0 +1,134 @@
import json
import os
import random
import tarfile
from dataclasses import dataclass, field
from datetime import datetime
from typing import List
from coqpit import Coqpit
from multiprocessing import Pool
from TTS.stt.datasets import load_stt_samples
from TTS.config import BaseDatasetConfig

@dataclass
class ConvertedArgs(Coqpit):
    dataset_name: List[str] = field(
        default_factory=list,
        metadata={
            "help": "Name of the dataset(s) or the dataset format(s). Provided name(s) must be implemented in `stt.datasets.formatters`."
        },
    )
    dataset_path: List[str] = field(default_factory=list, metadata={"help": "Path(s) to the dataset(s)."})
    output_path: str = field(default="", metadata={"help": "Path to the output directory to save the tar shards."})
    num_shards: int = field(default=-1, metadata={"help": "Number of tarballs to create."})
    shuffle: bool = field(default=False, metadata={"help": "Shuffle the samples before tarring."})
    num_workers: int = field(default=1, metadata={"help": "Number of workers to use for parallelization."})

@dataclass
class TarMetadata(Coqpit):
    args: ConvertedArgs = field(default_factory=ConvertedArgs)
    created_date: str = field(default="", metadata={"help": "Date of creation of the tarball dataset."})
    num_samples_per_shard: int = field(default=0, metadata={"help": "Number of samples per tarball."})

    def __post_init__(self):
        self.created_date = self.get_date()

    @staticmethod
    def get_date():
        return datetime.now().strftime("%m-%d-%Y %H-%M-%S")

def create_tar_shard(params):
    """Write one tar shard and return the manifest entries for its samples."""
    samples, output_path, shard_no = params
    sharded_samples = []
    with tarfile.open(os.path.join(output_path, f'audio_{shard_no}.tar'), mode='w') as tar:
        count = {}
        for sample in samples:
            # We squash the filename since we do not preserve the directory structure of audio files in the tarball.
            base, ext = os.path.splitext(sample['audio_file'])
            base = base.replace('/', '_')
            # Need the following replacement as long as WebDataset splits on the first period.
            base = base.replace('.', '_')
            squashed_filename = f'{base}{ext}'
            # Only add each audio file to the tarball once, even if it is referenced by several samples.
            if squashed_filename not in count:
                tar.add(sample['audio_file'], arcname=squashed_filename)
                count[squashed_filename] = 1
            if "duration" in sample:
                duration = sample['duration']
            else:
                # TODO: os.path.getsize returns the file size in bytes, not the duration in seconds.
                duration = os.path.getsize(sample["audio_file"])
            sharded_sample = {
                'audio_file': squashed_filename,
                'duration': duration,
                'text': sample['text'],
                'shard_no': shard_no,  # Keep shard ID for recordkeeping.
            }
            sharded_samples.append(sharded_sample)
    return sharded_samples

if __name__ == "__main__":
    # parse command line arguments
    args = ConvertedArgs()
    args.parse_args(arg_prefix="")
    os.makedirs(args.output_path, exist_ok=True)

    # create tarring metadata config
    metadata_config = TarMetadata(args=args)

    # create dataset configs
    dataset_configs = []
    for dataset_name, dataset_path in zip(args.dataset_name, args.dataset_path):
        dataset_config = BaseDatasetConfig(name=dataset_name, path=dataset_path)
        dataset_configs.append(dataset_config)

    # load dataset samples
    samples, _ = load_stt_samples(dataset_configs, eval_split=False)
    print(f" > Number of data samples: {len(samples)}")

    # shuffle samples
    if args.shuffle:
        print(" > Shuffling data samples...")
        random.shuffle(samples)

    # define shard sample indices
    start_indices = []
    end_indices = []
    shard_size = len(samples) // args.num_shards
    num_leftover = len(samples) - shard_size * args.num_shards
    if num_leftover > 0:
        # discard the trailing samples to keep every shard the same size
        print(f" > Have {num_leftover} entries left over that will be discarded.")
    for i in range(args.num_shards):
        start_idx = shard_size * i
        end_idx = start_idx + shard_size
        print(f" > Shard {i}: {start_idx} --> {end_idx}")
        start_indices.append(start_idx)
        end_indices.append(end_idx)

    # create shards
    with Pool(args.num_workers) as pool:
        process_samples = [samples[start_idx:end_idx] for start_idx, end_idx in zip(start_indices, end_indices)]
        process_args = zip(process_samples, [args.output_path] * args.num_shards, range(args.num_shards))
        sharded_samples = pool.map(create_tar_shard, process_args)
    sharded_samples = [sample for shard in sharded_samples for sample in shard]
    print(f" > Total number of files sharded: {len(sharded_samples)}")

    # write the manifest with one JSON entry per line
    manifest_path = os.path.join(args.output_path, 'coqui_tarred_dataset.json')
    with open(manifest_path, 'w', encoding="utf8") as m2:
        for entry in sharded_samples:
            json.dump(entry, m2)
            m2.write('\n')

    # write metadata (default metadata for new datasets)
    metadata_config.num_samples_per_shard = shard_size
    metadata_path = os.path.join(args.output_path, 'metadata.json')
    metadata_config.save_json(metadata_path)
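
As a usage reference (not part of this commit): the script writes the tar shards (audio_0.tar, audio_1.tar, ...), a line-delimited manifest (coqui_tarred_dataset.json) with one entry per sample, and a metadata.json describing the run into output_path. Below is a minimal, stdlib-only sketch of reading that output back; the output directory path is hypothetical.

import json
import os
import tarfile

output_path = "/data/librispeech_tarred"  # hypothetical output_path used when running the script above

# Read the line-delimited manifest written by the tarring script.
manifest_path = os.path.join(output_path, "coqui_tarred_dataset.json")
with open(manifest_path, encoding="utf8") as f:
    entries = [json.loads(line) for line in f if line.strip()]
print(f"{len(entries)} samples across {len(set(e['shard_no'] for e in entries))} shards")

# Pull one audio file out of its shard using the squashed archive name from the manifest.
entry = entries[0]
shard_path = os.path.join(output_path, f"audio_{entry['shard_no']}.tar")
with tarfile.open(shard_path, mode="r") as tar:
    audio_bytes = tar.extractfile(entry["audio_file"]).read()
print(entry["text"], len(audio_bytes))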

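Note that only the audio files go into the tarballs; each sample's text, duration, and shard number live in the sidecar manifest, keyed by the squashed archive name. The squashing of '/' and '.' in create_tar_shard follows the in-code comment: WebDataset-style readers split member names on the first period, so dots left in the original path would end up in the extension part of the key.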
View File

@@ -1,12 +1,12 @@
-from tqdm import tqdm
 import glob
 import os
 from multiprocessing import Pool
-from TTS.stt.utils.download import download_url, extract_archive
-from TTS.stt.datasets.formatters import *
 import librosa
 import soundfile as sf
+from tqdm import tqdm
+from TTS.stt.utils.download import download_url, extract_archive


 def _resample_file(func_args):
@@ -68,4 +68,4 @@ def download_librispeech(path: str, split_name: str):
 if __name__ == "__main__":
     # download_librispeech("/home/ubuntu/librispeech/", "train-clean-100")
-    download_ljspeech("/home/ubuntu/ljspeech/", n_jobs=8)
+    # download_ljspeech("/home/ubuntu/ljspeech/", n_jobs=8)

View File

@@ -32,7 +32,7 @@ def librispeech(root_path, meta_files=None):
     _delimiter = " "
     _audio_ext = ".flac"
     items = []
-    if meta_files is None:
+    if meta_files is None or meta_files == "":
         meta_files = glob(f"{root_path}/**/*trans.txt", recursive=True)
     else:
         if isinstance(meta_files, str):

View File

@@ -52,7 +52,7 @@ audio_config = BaseAudioConfig(
 config = DeepSpeechConfig(
     audio=audio_config,
     run_name="deepspeech_librispeech",
-    batch_size=128,
+    batch_size=64,
     eval_batch_size=16,
     batch_group_size=5,
     num_loader_workers=4,