From c909ca3855b28e0d4a6b90bf9071c746dfcec89b Mon Sep 17 00:00:00 2001 From: mueller Date: Wed, 16 Sep 2020 15:55:55 +0200 Subject: [PATCH] Improve runtime of __parse_items() from O(|speakers|*|items|) to O(|items|) --- TTS/speaker_encoder/config.json | 38 ++++++++++++++++++++--- TTS/speaker_encoder/dataset.py | 10 +++--- TTS/tts/datasets/preprocess.py | 54 ++++++++++++++++++++++++++++++--- 3 files changed, 90 insertions(+), 12 deletions(-) diff --git a/TTS/speaker_encoder/config.json b/TTS/speaker_encoder/config.json index 11da0cf6..2a063fbf 100644 --- a/TTS/speaker_encoder/config.json +++ b/TTS/speaker_encoder/config.json @@ -1,6 +1,6 @@ { - "run_name": "Model compatible to CorentinJ/Real-Time-Voice-Cloning", + "run_name": "mueller91", "run_description": "train speaker encoder with voxceleb1, voxceleb2 and libriSpeech ", "audio":{ // Audio processing parameters @@ -41,7 +41,7 @@ "checkpoint": true, // If true, it saves checkpoints per "save_step" "save_step": 1000, // Number of training steps expected to save traning stats and checkpoints. "print_step": 1, // Number of steps to log traning on console. - "output_path": "../../checkpoints/voxceleb_librispeech/speaker_encoder/", // DATASET-RELATED: output path for all training outputs. + "output_path": "../../MozillaTTSOutput/checkpoints/voxceleb_librispeech/speaker_encoder/", // DATASET-RELATED: output path for all training outputs. 
"model": { "input_dim": 40, "proj_dim": 256, @@ -52,8 +52,38 @@ "datasets": [ { - "name": "vctk", - "path": "../../../datasets/VCTK-Corpus-removed-silence/", + "name": "voxceleb1", + "path": "../../audio-datasets/en/voxceleb1/", + "meta_file_train": null, + "meta_file_val": null + }, +// { +// "name": "voxceleb2", +// "path": "../../audio-datasets/en/voxceleb2/", +// "meta_file_train": null, +// "meta_file_val": null +// }, +// { +// "name": "vctk", +// "path": "../../audio-datasets/en/VCTK-Corpus/", +// "meta_file_train": null, +// "meta_file_val": null +// }, +// { +// "name": "libri_tts", +// "path": "../../audio-datasets/en/LibriTTS/train-clean-100", +// "meta_file_train": null, +// "meta_file_val": null +// }, +// { +// "name": "libri_tts", +// "path": "../../audio-datasets/en/LibriTTS/train-clean-360", +// "meta_file_train": null, +// "meta_file_val": null +// }, + { + "name": "libri_tts", + "path": "../../audio-datasets/en/LibriTTS/train-other-500", "meta_file_train": null, "meta_file_val": null } diff --git a/TTS/speaker_encoder/dataset.py b/TTS/speaker_encoder/dataset.py index ad6b95e9..00e5eace 100644 --- a/TTS/speaker_encoder/dataset.py +++ b/TTS/speaker_encoder/dataset.py @@ -2,6 +2,7 @@ import numpy as np import torch import random from torch.utils.data import Dataset +from tqdm import tqdm class MyDataset(Dataset): @@ -53,6 +54,7 @@ class MyDataset(Dataset): def __parse_items(self): self.speaker_to_utters = {} for i in self.items: + text_ = i[0] path_ = i[1] speaker_ = i[2] if speaker_ in self.speaker_to_utters.keys(): @@ -60,11 +62,11 @@ class MyDataset(Dataset): else: self.speaker_to_utters[speaker_] = [path_, ] - if self.skip_speakers: - self.speaker_to_utters = {k: v for (k, v) in self.speaker_to_utters.items() if - len(v) >= self.num_utter_per_speaker} + if self.skip_speakers: + self.speaker_to_utters = {k: v for (k, v) in self.speaker_to_utters.items() if + len(v) >= self.num_utter_per_speaker} - self.speakers = [k for (k, v) in 
self.speaker_to_utters] + self.speakers = [k for (k, v) in self.speaker_to_utters.items()] # def __parse_items(self): # """ diff --git a/TTS/tts/datasets/preprocess.py b/TTS/tts/datasets/preprocess.py index 4bfad648..40fc66dd 100644 --- a/TTS/tts/datasets/preprocess.py +++ b/TTS/tts/datasets/preprocess.py @@ -2,6 +2,10 @@ import os from glob import glob import re import sys +from pathlib import Path + +from tqdm import tqdm + from TTS.tts.utils.generic_utils import split_dataset @@ -16,6 +20,7 @@ def load_meta_data(datasets): preprocessor = get_preprocessor_by_name(name) meta_data_train = preprocessor(root_path, meta_file_train) + print(f"Found {len(meta_data_train)} files in {Path(root_path).absolute()}") if meta_file_val is None: meta_data_eval, meta_data_train = split_dataset(meta_data_train) else: @@ -187,7 +192,7 @@ def libri_tts(root_path, meta_files=None): cols = line.split('\t') wav_file = os.path.join(_root_path, cols[0] + '.wav') text = cols[1] - items.append([text, wav_file, speaker_name]) + items.append([text, wav_file, 'LTTS_' + speaker_name]) for item in items: assert os.path.exists( item[1]), f" [!] 
wav files don't exist - {item[1]}" @@ -235,8 +240,7 @@ def vctk(root_path, meta_files=None, wavs_path='wav48'): """homepages.inf.ed.ac.uk/jyamagis/release/VCTK-Corpus.tar.gz""" test_speakers = meta_files items = [] - meta_files = glob(f"{os.path.join(root_path,'txt')}/**/*.txt", - recursive=True) + meta_files = glob(f"{os.path.join(root_path,'txt')}/**/*.txt", recursive=True) for meta_file in meta_files: _, speaker_id, txt_file = os.path.relpath(meta_file, root_path).split(os.sep) @@ -247,8 +251,50 @@ def vctk(root_path, meta_files=None, wavs_path='wav48'): continue with open(meta_file) as file_text: text = file_text.readlines()[0] - wav_file = os.path.join(root_path, wavs_path, speaker_id, + wav_file = os.path.join(root_path, wavs_path, 'VCTK_' + speaker_id, file_id + '.wav') items.append([text, wav_file, speaker_id]) return items + +# ======================================== VOX CELEB =========================================== +def voxceleb2(root_path, meta_file): + """ + :param meta_file Used only for consistency with load_meta_data api + """ + return _voxcel_x(root_path, voxcel_idx="2") + + +def voxceleb1(root_path, meta_file): + """ + :param meta_file Used only for consistency with load_meta_data api + """ + return _voxcel_x(root_path, voxcel_idx="1") + + +def _voxcel_x(root_path, voxcel_idx): + assert voxcel_idx in ["1", "2"] + expected_count = 148_000 if voxcel_idx == "1" else 1_000_000 + voxceleb_path = Path(root_path) + cache_to = voxceleb_path / f"metafile_voxceleb{voxcel_idx}.csv" + cache_to.parent.mkdir(exist_ok=True) + + # if not exists meta file, crawl recursively for 'wav' files + if not cache_to.exists(): + cnt = 0 + meta_data = "" + wav_files = voxceleb_path.rglob("**/*.wav") + for path in tqdm(wav_files, desc=f"Building VoxCeleb {voxcel_idx} Meta file ... 
this needs to be done only once.", total=expected_count): speaker_id = str(Path(path).parent.parent.stem) assert speaker_id.startswith('id') text = None # VoxCel does not provide transcriptions, and they are not needed for training the SE meta_data += f"{text}|{path}|voxcel{voxcel_idx}_{speaker_id}\n" cnt += 1 with open(str(cache_to), 'w') as f: f.write(meta_data) if cnt < expected_count: raise ValueError(f"Found too few instances for Voxceleb. Should be around {expected_count}, is: {cnt}") + with open(str(cache_to), 'r') as f: return [x.strip().split('|') for x in f.readlines()]