From 60ce86211392405f2a8e8a575ea5ad80cb7d0107 Mon Sep 17 00:00:00 2001 From: maxbachmann Date: Mon, 14 Sep 2020 23:55:34 +0200 Subject: [PATCH 01/10] use difflib for string matching --- TTS/bin/convert_melgan_torch_to_tf.py | 4 ++-- TTS/bin/convert_tacotron2_torch_to_tf.py | 4 ++-- requirements.txt | 1 - 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/TTS/bin/convert_melgan_torch_to_tf.py b/TTS/bin/convert_melgan_torch_to_tf.py index a006b7b2..2eec6157 100644 --- a/TTS/bin/convert_melgan_torch_to_tf.py +++ b/TTS/bin/convert_melgan_torch_to_tf.py @@ -1,10 +1,10 @@ import argparse +from difflib import SequenceMatcher import os import numpy as np import tensorflow as tf import torch -from fuzzywuzzy import fuzz from TTS.utils.io import load_config from TTS.vocoder.tf.utils.convert_torch_to_tf_utils import ( @@ -67,7 +67,7 @@ for tf_name in tf_var_names: continue tf_name_edited = convert_tf_name(tf_name) ratios = [ - fuzz.ratio(torch_name, tf_name_edited) + SequenceMatcher(None, torch_name, tf_name_edited).ratio() for torch_name in torch_var_names ] max_idx = np.argmax(ratios) diff --git a/TTS/bin/convert_tacotron2_torch_to_tf.py b/TTS/bin/convert_tacotron2_torch_to_tf.py index 2ba1aa16..485e56be 100644 --- a/TTS/bin/convert_tacotron2_torch_to_tf.py +++ b/TTS/bin/convert_tacotron2_torch_to_tf.py @@ -1,6 +1,7 @@ # %% # %% import argparse +from difflib import SequenceMatcher import os import sys # %% @@ -10,7 +11,6 @@ from pprint import pprint import numpy as np import tensorflow as tf import torch -from fuzzywuzzy import fuzz from TTS.tts.tf.models.tacotron2 import Tacotron2 from TTS.tts.tf.utils.convert_torch_to_tf_utils import ( compare_torch_tf, convert_tf_name, transfer_weights_torch_to_tf) @@ -106,7 +106,7 @@ for tf_name in tf_var_names: continue tf_name_edited = convert_tf_name(tf_name) ratios = [ - fuzz.ratio(torch_name, tf_name_edited) + SequenceMatcher(None, torch_name, tf_name_edited).ratio() for torch_name in torch_var_names ] max_idx = np.argmax(ratios) diff --git a/requirements.txt b/requirements.txt index fdec4c57..f0f2c057 100644 --- a/requirements.txt +++ b/requirements.txt @@ -20,5 +20,4 @@ soundfile nose==1.3.7 cardboardlint==1.3.0 pylint==2.5.3 -fuzzywuzzy gdown From d733b902552295be41b1f88c8343b5e7b4ff8a9b Mon Sep 17 00:00:00 2001 From: mueller Date: Wed, 16 Sep 2020 15:09:02 +0200 Subject: [PATCH 02/10] Improve runtime of __parse_items() from O(|speakers|*|items|) to O(|items|) --- TTS/speaker_encoder/dataset.py | 42 +++++++++++++++++++++++----------- 1 file changed, 29 insertions(+), 13 deletions(-) diff --git a/TTS/speaker_encoder/dataset.py b/TTS/speaker_encoder/dataset.py index e67dd716..ad6b95e9 100644 --- a/TTS/speaker_encoder/dataset.py +++ b/TTS/speaker_encoder/dataset.py @@ -51,21 +51,37 @@ class MyDataset(Dataset): return sample def __parse_items(self): - """ - Find unique speaker ids and create a dict mapping utterances from speaker id - """ - speakers = list({item[-1] for item in self.items}) self.speaker_to_utters = {} - self.speakers = [] - for speaker in speakers: - speaker_utters = [item[1] for item in self.items if item[2] == speaker] - if len(speaker_utters) < self.num_utter_per_speaker and self.skip_speakers: - print( - f" [!] Skipped speaker {speaker}. Not enough utterances {self.num_utter_per_speaker} vs {len(speaker_utters)}." 
- ) + for i in self.items: + path_ = i[1] + speaker_ = i[2] + if speaker_ in self.speaker_to_utters.keys(): + self.speaker_to_utters[speaker_].append(path_) else: - self.speakers.append(speaker) - self.speaker_to_utters[speaker] = speaker_utters + self.speaker_to_utters[speaker_] = [path_, ] + + if self.skip_speakers: + self.speaker_to_utters = {k: v for (k, v) in self.speaker_to_utters.items() if + len(v) >= self.num_utter_per_speaker} + + self.speakers = [k for (k, v) in self.speaker_to_utters] + + # def __parse_items(self): + # """ + # Find unique speaker ids and create a dict mapping utterances from speaker id + # """ + # speakers = list({item[-1] for item in self.items}) + # self.speaker_to_utters = {} + # self.speakers = [] + # for speaker in speakers: + # speaker_utters = [item[1] for item in self.items if item[2] == speaker] + # if len(speaker_utters) < self.num_utter_per_speaker and self.skip_speakers: + # print( + # f" [!] Skipped speaker {speaker}. Not enough utterances {self.num_utter_per_speaker} vs {len(speaker_utters)}." + # ) + # else: + # self.speakers.append(speaker) + # self.speaker_to_utters[speaker] = speaker_utters def __len__(self): return int(1e10) From c909ca3855b28e0d4a6b90bf9071c746dfcec89b Mon Sep 17 00:00:00 2001 From: mueller Date: Wed, 16 Sep 2020 15:55:55 +0200 Subject: [PATCH 03/10] Improve runtime of __parse_items() from O(|speakers|*|items|) to O(|items|) --- TTS/speaker_encoder/config.json | 38 ++++++++++++++++++++--- TTS/speaker_encoder/dataset.py | 10 +++--- TTS/tts/datasets/preprocess.py | 54 ++++++++++++++++++++++++++++++--- 3 files changed, 90 insertions(+), 12 deletions(-) diff --git a/TTS/speaker_encoder/config.json b/TTS/speaker_encoder/config.json index 11da0cf6..2a063fbf 100644 --- a/TTS/speaker_encoder/config.json +++ b/TTS/speaker_encoder/config.json @@ -1,6 +1,6 @@ { - "run_name": "Model compatible to CorentinJ/Real-Time-Voice-Cloning", + "run_name": "mueller91", "run_description": "train speaker encoder with voxceleb1, voxceleb2 and libriSpeech ", "audio":{ // Audio processing parameters @@ -41,7 +41,7 @@ "checkpoint": true, // If true, it saves checkpoints per "save_step" "save_step": 1000, // Number of training steps expected to save traning stats and checkpoints. "print_step": 1, // Number of steps to log traning on console. - "output_path": "../../checkpoints/voxceleb_librispeech/speaker_encoder/", // DATASET-RELATED: output path for all training outputs. + "output_path": "../../MozillaTTSOutput/checkpoints/voxceleb_librispeech/speaker_encoder/", // DATASET-RELATED: output path for all training outputs. 
"model": { "input_dim": 40, "proj_dim": 256, @@ -52,8 +52,38 @@ "datasets": [ { - "name": "vctk", - "path": "../../../datasets/VCTK-Corpus-removed-silence/", + "name": "voxceleb1", + "path": "../../audio-datasets/en/voxceleb1/", + "meta_file_train": null, + "meta_file_val": null + }, +// { +// "name": "voxceleb2", +// "path": "../../audio-datasets/en/voxceleb2/", +// "meta_file_train": null, +// "meta_file_val": null +// }, +// { +// "name": "vctk", +// "path": "../../audio-datasets/en/VCTK-Corpus/", +// "meta_file_train": null, +// "meta_file_val": null +// }, +// { +// "name": "libri_tts", +// "path": "../../audio-datasets/en/LibriTTS/train-clean-100", +// "meta_file_train": null, +// "meta_file_val": null +// }, +// { +// "name": "libri_tts", +// "path": "../../audio-datasets/en/LibriTTS/train-clean-360", +// "meta_file_train": null, +// "meta_file_val": null +// }, + { + "name": "libri_tts", + "path": "../../audio-datasets/en/LibriTTS/train-other-500", "meta_file_train": null, "meta_file_val": null } diff --git a/TTS/speaker_encoder/dataset.py b/TTS/speaker_encoder/dataset.py index ad6b95e9..00e5eace 100644 --- a/TTS/speaker_encoder/dataset.py +++ b/TTS/speaker_encoder/dataset.py @@ -2,6 +2,7 @@ import numpy as np import torch import random from torch.utils.data import Dataset +from tqdm import tqdm class MyDataset(Dataset): @@ -53,6 +54,7 @@ class MyDataset(Dataset): def __parse_items(self): self.speaker_to_utters = {} for i in self.items: + text_ = i[0] path_ = i[1] speaker_ = i[2] if speaker_ in self.speaker_to_utters.keys(): @@ -60,11 +62,11 @@ class MyDataset(Dataset): else: self.speaker_to_utters[speaker_] = [path_, ] - if self.skip_speakers: - self.speaker_to_utters = {k: v for (k, v) in self.speaker_to_utters.items() if - len(v) >= self.num_utter_per_speaker} + if self.skip_speakers: + self.speaker_to_utters = {k: v for (k, v) in self.speaker_to_utters.items() if + len(v) >= self.num_utter_per_speaker} - self.speakers = [k for (k, v) in self.speaker_to_utters] + self.speakers = [k for (k, v) in self.speaker_to_utters.items()] # def __parse_items(self): # """ diff --git a/TTS/tts/datasets/preprocess.py b/TTS/tts/datasets/preprocess.py index 4bfad648..40fc66dd 100644 --- a/TTS/tts/datasets/preprocess.py +++ b/TTS/tts/datasets/preprocess.py @@ -2,6 +2,10 @@ import os from glob import glob import re import sys +from pathlib import Path + +from tqdm import tqdm + from TTS.tts.utils.generic_utils import split_dataset @@ -16,6 +20,7 @@ def load_meta_data(datasets): preprocessor = get_preprocessor_by_name(name) meta_data_train = preprocessor(root_path, meta_file_train) + print(f"Found {len(meta_data_train)} files in {Path(root_path).absolute()}") if meta_file_val is None: meta_data_eval, meta_data_train = split_dataset(meta_data_train) else: @@ -187,7 +192,7 @@ def libri_tts(root_path, meta_files=None): cols = line.split('\t') wav_file = os.path.join(_root_path, cols[0] + '.wav') text = cols[1] - items.append([text, wav_file, speaker_name]) + items.append([text, wav_file, 'LTTS_' + speaker_name]) for item in items: assert os.path.exists( item[1]), f" [!] 
wav files don't exist - {item[1]}" @@ -235,8 +240,7 @@ def vctk(root_path, meta_files=None, wavs_path='wav48'): """homepages.inf.ed.ac.uk/jyamagis/release/VCTK-Corpus.tar.gz""" test_speakers = meta_files items = [] - meta_files = glob(f"{os.path.join(root_path,'txt')}/**/*.txt", - recursive=True) + meta_files = glob(f"{os.path.join(root_path,'txt')}/**/*.txt", recursive=True) for meta_file in meta_files: _, speaker_id, txt_file = os.path.relpath(meta_file, root_path).split(os.sep) @@ -247,8 +251,50 @@ def vctk(root_path, meta_files=None, wavs_path='wav48'): continue with open(meta_file) as file_text: text = file_text.readlines()[0] - wav_file = os.path.join(root_path, wavs_path, speaker_id, + wav_file = os.path.join(root_path, wavs_path, 'VCTK_' + speaker_id, file_id + '.wav') items.append([text, wav_file, speaker_id]) return items + +# ======================================== VOX CELEB =========================================== +def voxceleb2(root_path, meta_file): + """ + :param meta_file Used only for consistency with load_meta_data api + """ + return _voxcel_x(root_path, voxcel_idx="2") + + +def voxceleb1(root_path, meta_file): + """ + :param meta_file Used only for consistency with load_meta_data api + """ + return _voxcel_x(root_path, voxcel_idx="1") + + +def _voxcel_x(root_path, voxcel_idx): + assert voxcel_idx in ["1", "2"] + expected_count = 148_000 if voxcel_idx == "1" else 1_000_000 + voxceleb_path = Path(root_path) + cache_to = voxceleb_path / f"metafile_voxceleb{voxcel_idx}.csv" + cache_to.parent.mkdir(exist_ok=True) + + # if not exists meta file, crawl recursively for 'wav' files + if not cache_to.exists(): + cnt = 0 + meta_data = "" + wav_files = voxceleb_path.rglob("**/*.wav") + for path in tqdm(wav_files, desc=f"Building VoxCeleb {voxcel_idx} Meta file ... this needs to be done only once.", + total=expected_count): + speaker_id = str(Path(path).parent.parent.stem) + assert speaker_id.startswith('id') + text = None # VoxCel does not provide transciptions, and they are not needed for training the SE + meta_data += f"{text}|{path}|voxcel{voxcel_idx}_{speaker_id}\n" + cnt += 1 + with open(str(cache_to), 'w') as f: + f.write(meta_data) + if cnt < expected_count: + raise ValueError(f"Found too few instances for Voxceleb. 
Should be around {expected_count}, is: {cnt}") + + with open(str(cache_to), 'r') as f: + return [x.strip().split('|') for x in f.readlines()] From 95d2906307b2a3c9d8cce033b11f208da769a461 Mon Sep 17 00:00:00 2001 From: mueller Date: Wed, 16 Sep 2020 16:49:53 +0200 Subject: [PATCH 04/10] add: Mozilla Commonvoice, VoxCeleb1+2, LibriTTS to Speaker Encoder Training --- TTS/speaker_encoder/config.json | 54 ++++++++++++++++++--------------- TTS/tts/datasets/preprocess.py | 12 +++++--- TTS/tts/utils/io.py | 2 +- 3 files changed, 38 insertions(+), 30 deletions(-) diff --git a/TTS/speaker_encoder/config.json b/TTS/speaker_encoder/config.json index 2a063fbf..67a7c40c 100644 --- a/TTS/speaker_encoder/config.json +++ b/TTS/speaker_encoder/config.json @@ -51,36 +51,42 @@ }, "datasets": [ + { + "name": "common_voice_wav", + "path": "../../audio-datasets/en/MozillaCommonVoice", + "meta_file_train": "train.tsv", + "meta_file_val": "test.tsv" + }, { "name": "voxceleb1", "path": "../../audio-datasets/en/voxceleb1/", "meta_file_train": null, "meta_file_val": null }, -// { -// "name": "voxceleb2", -// "path": "../../audio-datasets/en/voxceleb2/", -// "meta_file_train": null, -// "meta_file_val": null -// }, -// { -// "name": "vctk", -// "path": "../../audio-datasets/en/VCTK-Corpus/", -// "meta_file_train": null, -// "meta_file_val": null -// }, -// { -// "name": "libri_tts", -// "path": "../../audio-datasets/en/LibriTTS/train-clean-100", -// "meta_file_train": null, -// "meta_file_val": null -// }, -// { -// "name": "libri_tts", -// "path": "../../audio-datasets/en/LibriTTS/train-clean-360", -// "meta_file_train": null, -// "meta_file_val": null -// }, + { + "name": "voxceleb2", + "path": "../../audio-datasets/en/voxceleb2/", + "meta_file_train": null, + "meta_file_val": null + }, + { + "name": "vctk", + "path": "../../audio-datasets/en/VCTK-Corpus/", + "meta_file_train": null, + "meta_file_val": null + }, + { + "name": "libri_tts", + "path": "../../audio-datasets/en/LibriTTS/train-clean-100", + "meta_file_train": null, + "meta_file_val": null + }, + { + "name": "libri_tts", + "path": "../../audio-datasets/en/LibriTTS/train-clean-360", + "meta_file_train": null, + "meta_file_val": null + }, { "name": "libri_tts", "path": "../../audio-datasets/en/LibriTTS/train-other-500", diff --git a/TTS/tts/datasets/preprocess.py b/TTS/tts/datasets/preprocess.py index 40fc66dd..3bcf416c 100644 --- a/TTS/tts/datasets/preprocess.py +++ b/TTS/tts/datasets/preprocess.py @@ -161,7 +161,7 @@ def nancy(root_path, meta_file): return items -def common_voice(root_path, meta_file): +def common_voice_wav(root_path, meta_file): """Normalize the common voice meta data file to TTS format.""" txt_file = os.path.join(root_path, meta_file) items = [] @@ -172,8 +172,8 @@ def common_voice(root_path, meta_file): cols = line.split("\t") text = cols[2] speaker_name = cols[0] - wav_file = os.path.join(root_path, "clips", cols[1] + ".wav") - items.append([text, wav_file, speaker_name]) + wav_file = os.path.join(root_path, "clips", cols[1].replace(".mp3", ".wav")) + items.append([text, wav_file, 'MCV_' + speaker_name]) return items @@ -251,9 +251,9 @@ def vctk(root_path, meta_files=None, wavs_path='wav48'): continue with open(meta_file) as file_text: text = file_text.readlines()[0] - wav_file = os.path.join(root_path, wavs_path, 'VCTK_' + speaker_id, + wav_file = os.path.join(root_path, wavs_path, speaker_id, file_id + '.wav') - items.append([text, wav_file, speaker_id]) + items.append([text, wav_file, 'VCTK_' + speaker_id]) return items @@ -298,3 
+298,5 @@ def _voxcel_x(root_path, voxcel_idx):
 
     with open(str(cache_to), 'r') as f:
         return [x.strip().split('|') for x in f.readlines()]
+
+
diff --git a/TTS/tts/utils/io.py b/TTS/tts/utils/io.py
index bf5e13d8..78e9b8b2 100644
--- a/TTS/tts/utils/io.py
+++ b/TTS/tts/utils/io.py
@@ -50,7 +50,7 @@ def save_best_model(target_loss, best_loss, model, optimizer, current_step, epoc
     if target_loss < best_loss:
         file_name = 'best_model.pth.tar'
         checkpoint_path = os.path.join(output_folder, file_name)
-        print(" > BEST MODEL : {}".format(checkpoint_path))
+        print(" >> BEST MODEL : {}".format(checkpoint_path))
         save_model(model, optimizer, current_step, epoch, r, checkpoint_path, model_loss=target_loss, **kwargs)
         best_loss = target_loss
     return best_loss

From 1511076fde3e27a493c0d505ec7c611af4f01b97 Mon Sep 17 00:00:00 2001
From: mueller
Date: Thu, 17 Sep 2020 12:29:38 +0200
Subject: [PATCH 05/10] add: Configurable encoder dataset storage to reduce
 disk I/O add: Averaged data loader time to console and Tensorboard output

---
 TTS/bin/train_encoder.py             | 18 ++++--
 TTS/speaker_encoder/config.json      | 84 +++++++++++++++-------------
 TTS/speaker_encoder/dataset.py       | 18 +++++-
 TTS/speaker_encoder/generic_utils.py |  6 +-
 TTS/tts/datasets/preprocess.py       | 23 +++++++-
 5 files changed, 98 insertions(+), 51 deletions(-)

diff --git a/TTS/bin/train_encoder.py b/TTS/bin/train_encoder.py
index 6acaeff1..e73e1614 100644
--- a/TTS/bin/train_encoder.py
+++ b/TTS/bin/train_encoder.py
@@ -44,6 +44,8 @@ def setup_loader(ap, is_val=False, verbose=False):
                             voice_len=1.6,
                             num_utter_per_speaker=10,
                             skip_speakers=False,
+                            storage_size=c.storage["storage_size"],
+                            sample_from_storage_p=c.storage["sample_from_storage_p"],
                             verbose=verbose)
         # sampler = DistributedSampler(dataset) if num_gpus > 1 else None
         loader = DataLoader(dataset,
@@ -60,6 +62,7 @@ def train(model, criterion, optimizer, scheduler, ap, global_step):
     epoch_time = 0
     best_loss = float('inf')
     avg_loss = 0
+    avg_loader_time = 0
     end_time = time.time()
     for _, data in enumerate(data_loader):
         start_time = time.time()
@@ -93,8 +96,12 @@ def train(model, criterion, optimizer, scheduler, ap, global_step):
         step_time = time.time() - start_time
         epoch_time += step_time
 
-        avg_loss = 0.01 * loss.item(
-        ) + 0.99 * avg_loss if avg_loss != 0 else loss.item()
+        # Averaged Loss and Averaged Loader Time
+        dataset_number_prefetched = 2 * c.num_loader_workers  # this is hardcoded in pytorch
+        avg_loss = 0.01 * loss.item() \
+                   + 0.99 * avg_loss if avg_loss != 0 else loss.item()
+        avg_loader_time = 1/dataset_number_prefetched * loader_time\
+                          + (dataset_number_prefetched-1) / dataset_number_prefetched * avg_loader_time if avg_loader_time != 0 else loader_time
         current_lr = optimizer.param_groups[0]['lr']
 
         if global_step % c.steps_plot_stats == 0:
@@ -103,7 +110,8 @@ def train(model, criterion, optimizer, scheduler, ap, global_step):
                 "loss": avg_loss,
                 "lr": current_lr,
                 "grad_norm": grad_norm,
-                "step_time": step_time
+                "step_time": step_time,
+                "loader_time": loader_time
             }
             tb_logger.tb_train_epoch_stats(global_step, train_stats)
             figures = {
@@ -116,9 +124,9 @@ def train(model, criterion, optimizer, scheduler, ap, global_step):
         if global_step % c.print_step == 0:
             print(
                 " | > Step:{} Loss:{:.5f} AvgLoss:{:.5f} GradNorm:{:.5f} "
-                "StepTime:{:.2f} LoaderTime:{:.2f} LR:{:.6f}".format(
+                "StepTime:{:.2f} LoaderTime:{:.2f} AvgLoaderTime:{:.2f} LR:{:.6f}".format(
                     global_step, loss.item(), avg_loss, grad_norm, step_time,
-                    loader_time, current_lr),
+                    loader_time, avg_loader_time, current_lr),
flush=True) # save best model diff --git a/TTS/speaker_encoder/config.json b/TTS/speaker_encoder/config.json index 67a7c40c..f350779d 100644 --- a/TTS/speaker_encoder/config.json +++ b/TTS/speaker_encoder/config.json @@ -23,7 +23,7 @@ "clip_norm": true, // clip normalized values into the range. "mel_fmin": 0.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!! "mel_fmax": 8000.0, // maximum freq level for mel-spec. Tune for dataset!! - "do_trim_silence": false, // enable trimming of slience of audio as you load it. LJspeech (false), TWEB (false), Nancy (true) + "do_trim_silence": true, // enable trimming of slience of audio as you load it. LJspeech (false), TWEB (false), Nancy (true) "trim_db": 60 // threshold for timming silence. Set this according to your dataset. }, "reinit_layers": [], @@ -45,53 +45,57 @@ "model": { "input_dim": 40, "proj_dim": 256, - "lstm_dim": 256, + "lstm_dim": 768, "num_lstm_layers": 3, - "use_lstm_with_projection": false + "use_lstm_with_projection": true + }, + "storage": { + "sample_from_storage_p": 0.42, // the probability with which we'll sample from the DataSet in-memory storage + "storage_size": 5 // the size of the in-memory storage with respect to a single batch }, "datasets": [ { - "name": "common_voice_wav", - "path": "../../audio-datasets/en/MozillaCommonVoice", - "meta_file_train": "train.tsv", - "meta_file_val": "test.tsv" - }, - { - "name": "voxceleb1", - "path": "../../audio-datasets/en/voxceleb1/", - "meta_file_train": null, - "meta_file_val": null - }, - { - "name": "voxceleb2", - "path": "../../audio-datasets/en/voxceleb2/", - "meta_file_train": null, - "meta_file_val": null - }, - { - "name": "vctk", + "name": "vctk_slim", "path": "../../audio-datasets/en/VCTK-Corpus/", "meta_file_train": null, "meta_file_val": null - }, - { - "name": "libri_tts", - "path": "../../audio-datasets/en/LibriTTS/train-clean-100", - "meta_file_train": null, - "meta_file_val": null - }, - { - "name": "libri_tts", - "path": "../../audio-datasets/en/LibriTTS/train-clean-360", - "meta_file_train": null, - "meta_file_val": null - }, - { - "name": "libri_tts", - "path": "../../audio-datasets/en/LibriTTS/train-other-500", - "meta_file_train": null, - "meta_file_val": null } +// { +// "name": "libri_tts", +// "path": "../../audio-datasets/en/LibriTTS/train-clean-100", +// "meta_file_train": null, +// "meta_file_val": null +// }, +// { +// "name": "libri_tts", +// "path": "../../audio-datasets/en/LibriTTS/train-clean-360", +// "meta_file_train": null, +// "meta_file_val": null +// }, +// { +// "name": "libri_tts", +// "path": "../../audio-datasets/en/LibriTTS/train-other-500", +// "meta_file_train": null, +// "meta_file_val": null +// }, +// { +// "name": "voxceleb1", +// "path": "../../audio-datasets/en/voxceleb1/", +// "meta_file_train": null, +// "meta_file_val": null +// }, +// { +// "name": "voxceleb2", +// "path": "../../audio-datasets/en/voxceleb2/", +// "meta_file_train": null, +// "meta_file_val": null +// }, +// { +// "name": "common_voice_wav", +// "path": "../../audio-datasets/en/MozillaCommonVoice", +// "meta_file_train": "train.tsv", +// "meta_file_val": "test.tsv" +// } ] } \ No newline at end of file diff --git a/TTS/speaker_encoder/dataset.py b/TTS/speaker_encoder/dataset.py index 00e5eace..31413e7e 100644 --- a/TTS/speaker_encoder/dataset.py +++ b/TTS/speaker_encoder/dataset.py @@ -1,4 +1,5 @@ import numpy as np +import queue import torch import random from torch.utils.data import Dataset @@ -7,6 +8,7 @@ from tqdm 
import tqdm class MyDataset(Dataset): def __init__(self, ap, meta_data, voice_len=1.6, num_speakers_in_batch=64, + storage_size=1, sample_from_storage_p=0.5, num_utter_per_speaker=10, skip_speakers=False, verbose=False): """ Args: @@ -25,8 +27,12 @@ class MyDataset(Dataset): self.ap = ap self.verbose = verbose self.__parse_items() + self.storage = queue.Queue(maxsize=storage_size*num_speakers_in_batch) + self.sample_from_storage_p = float(sample_from_storage_p) if self.verbose: print("\n > DataLoader initialization") + print(f" | > Storage Size: {self.storage.maxsize} speakers, each with {num_utter_per_speaker} utters") + print(f" | > Sample_from_storage_p : {self.sample_from_storage_p}") print(f" | > Number of instances : {len(self.items)}") print(f" | > Sequence length: {self.seq_len}") print(f" | > Num speakers: {len(self.speakers)}") @@ -134,7 +140,17 @@ class MyDataset(Dataset): labels = [] feats = [] for speaker in batch: - feats_, labels_ = self.__sample_speaker_utterances(speaker) + if random.random() < self.sample_from_storage_p and self.storage.full(): + # sample from storage (if full), ignoring the speaker + feats_, labels_ = random.choice(self.storage.queue) + else: + # don't sample from storage, but from HDD + feats_, labels_ = self.__sample_speaker_utterances(speaker) + # if storage is full, remove an item + if self.storage.full(): + _ = self.storage.get_nowait() + # put the newly loaded item into storage + self.storage.put_nowait((feats_, labels_)) labels.append(labels_) feats.extend(feats_) feats = torch.stack(feats) diff --git a/TTS/speaker_encoder/generic_utils.py b/TTS/speaker_encoder/generic_utils.py index bc72c91c..b3ac00dc 100644 --- a/TTS/speaker_encoder/generic_utils.py +++ b/TTS/speaker_encoder/generic_utils.py @@ -23,7 +23,7 @@ def save_checkpoint(model, optimizer, model_loss, out_path, def save_best_model(model, optimizer, model_loss, best_loss, out_path, current_step): - if model_loss < best_loss: + if model_loss < best_loss and current_step > 1000: new_state_dict = model.state_dict() state = { 'model': new_state_dict, @@ -35,7 +35,7 @@ def save_best_model(model, optimizer, model_loss, best_loss, out_path, best_loss = model_loss bestmodel_path = 'best_model.pth.tar' bestmodel_path = os.path.join(out_path, bestmodel_path) - print("\n > BEST MODEL ({0:.5f}) : {1:}".format( - model_loss, bestmodel_path)) + print("\n > NEW BEST MODEL ({0:.5f}) : {1:}".format( + model_loss, os.path.abspath(bestmodel_path))) torch.save(state, bestmodel_path) return best_loss diff --git a/TTS/tts/datasets/preprocess.py b/TTS/tts/datasets/preprocess.py index 3bcf416c..4b2903a0 100644 --- a/TTS/tts/datasets/preprocess.py +++ b/TTS/tts/datasets/preprocess.py @@ -17,10 +17,10 @@ def load_meta_data(datasets): root_path = dataset['path'] meta_file_train = dataset['meta_file_train'] meta_file_val = dataset['meta_file_val'] + print(f" | > Preprocessing {name}") preprocessor = get_preprocessor_by_name(name) - meta_data_train = preprocessor(root_path, meta_file_train) - print(f"Found {len(meta_data_train)} files in {Path(root_path).absolute()}") + print(f" | > Found {len(meta_data_train)} files in {Path(root_path).resolve()}") if meta_file_val is None: meta_data_eval, meta_data_train = split_dataset(meta_data_train) else: @@ -257,6 +257,25 @@ def vctk(root_path, meta_files=None, wavs_path='wav48'): return items + +def vctk_slim(root_path, meta_files=None, wavs_path='wav48'): + test_speakers = meta_files + """homepages.inf.ed.ac.uk/jyamagis/release/VCTK-Corpus.tar.gz""" + items = [] + meta_files 
= glob(f"{os.path.join(root_path,'txt')}/**/*.txt", recursive=True) + for meta_file in meta_files: + _, speaker_id, txt_file = os.path.relpath(meta_file, + root_path).split(os.sep) + file_id = txt_file.split('.')[0] + if isinstance(test_speakers, list): # if is list ignore this speakers ids + if speaker_id in test_speakers: + continue + wav_file = os.path.join(root_path, wavs_path, speaker_id, + file_id + '.wav') + items.append([None, wav_file, 'VCTK_' + speaker_id]) + + return items + # ======================================== VOX CELEB =========================================== def voxceleb2(root_path, meta_file): """ From e36a3067e4af08f6990532140e910df911638a88 Mon Sep 17 00:00:00 2001 From: mueller Date: Thu, 17 Sep 2020 14:14:30 +0200 Subject: [PATCH 06/10] add: save wavs instead feats to storage. This is done in order to mitigate staleness when caching and loading from data storage --- TTS/bin/train_encoder.py | 2 +- TTS/speaker_encoder/dataset.py | 20 ++++++++++++-------- TTS/tts/datasets/preprocess.py | 3 +-- 3 files changed, 14 insertions(+), 11 deletions(-) diff --git a/TTS/bin/train_encoder.py b/TTS/bin/train_encoder.py index e73e1614..56a2b954 100644 --- a/TTS/bin/train_encoder.py +++ b/TTS/bin/train_encoder.py @@ -111,7 +111,7 @@ def train(model, criterion, optimizer, scheduler, ap, global_step): "lr": current_lr, "grad_norm": grad_norm, "step_time": step_time, - "loader_time": loader_time + "avg_loader_time": avg_loader_time } tb_logger.tb_train_epoch_stats(global_step, train_stats) figures = { diff --git a/TTS/speaker_encoder/dataset.py b/TTS/speaker_encoder/dataset.py index 31413e7e..3f3db88d 100644 --- a/TTS/speaker_encoder/dataset.py +++ b/TTS/speaker_encoder/dataset.py @@ -110,7 +110,7 @@ class MyDataset(Dataset): """ Sample all M utterances for the given speaker. """ - feats = [] + wavs = [] labels = [] for _ in range(self.num_utter_per_speaker): # TODO:dummy but works @@ -126,11 +126,9 @@ class MyDataset(Dataset): break self.speaker_to_utters[speaker].remove(utter) - offset = random.randint(0, wav.shape[0] - self.seq_len) - mel = self.ap.melspectrogram(wav[offset : offset + self.seq_len]) - feats.append(torch.FloatTensor(mel)) + wavs.append(wav) labels.append(speaker) - return feats, labels + return wavs, labels def __getitem__(self, idx): speaker, _ = self.__sample_speaker() @@ -142,15 +140,21 @@ class MyDataset(Dataset): for speaker in batch: if random.random() < self.sample_from_storage_p and self.storage.full(): # sample from storage (if full), ignoring the speaker - feats_, labels_ = random.choice(self.storage.queue) + wavs_, labels_ = random.choice(self.storage.queue) else: # don't sample from storage, but from HDD - feats_, labels_ = self.__sample_speaker_utterances(speaker) + wavs_, labels_ = self.__sample_speaker_utterances(speaker) # if storage is full, remove an item if self.storage.full(): _ = self.storage.get_nowait() # put the newly loaded item into storage - self.storage.put_nowait((feats_, labels_)) + self.storage.put_nowait((wavs_, labels_)) + + # get a random subset of each of the wavs and convert to MFCC. 
+            offsets_ = [random.randint(0, wav.shape[0] - self.seq_len) for wav in wavs_]
+            mels_ = [self.ap.melspectrogram(wavs_[i][offsets_[i]: offsets_[i] + self.seq_len]) for i in range(len(wavs_))]
+            feats_ = [torch.FloatTensor(mel) for mel in mels_]
+
             labels.append(labels_)
             feats.extend(feats_)
         feats = torch.stack(feats)
diff --git a/TTS/tts/datasets/preprocess.py b/TTS/tts/datasets/preprocess.py
index 4b2903a0..73a56774 100644
--- a/TTS/tts/datasets/preprocess.py
+++ b/TTS/tts/datasets/preprocess.py
@@ -17,10 +17,9 @@ def load_meta_data(datasets):
         root_path = dataset['path']
         meta_file_train = dataset['meta_file_train']
         meta_file_val = dataset['meta_file_val']
-        print(f" | > Preprocessing {name}")
         preprocessor = get_preprocessor_by_name(name)
         meta_data_train = preprocessor(root_path, meta_file_train)
-        print(f" | > Found {len(meta_data_train)} files in {Path(root_path).resolve()}")
+        print(f" | > Found {len(meta_data_train)} files in {Path(root_path).resolve()}")
         if meta_file_val is None:
             meta_data_eval, meta_data_train = split_dataset(meta_data_train)
         else:

From a273b1a2102c5928c7e12da233f2ecd86dc8719a Mon Sep 17 00:00:00 2001
From: mueller
Date: Thu, 17 Sep 2020 14:23:40 +0200
Subject: [PATCH 07/10] add: random noise to dataset

---
 TTS/bin/train_encoder.py        |  1 +
 TTS/speaker_encoder/config.json | 81 +++++++++++++++----------------
 TTS/speaker_encoder/dataset.py  |  9 +++-
 3 files changed, 50 insertions(+), 41 deletions(-)

diff --git a/TTS/bin/train_encoder.py b/TTS/bin/train_encoder.py
index 56a2b954..dae3ebac 100644
--- a/TTS/bin/train_encoder.py
+++ b/TTS/bin/train_encoder.py
@@ -46,6 +46,7 @@ def setup_loader(ap, is_val=False, verbose=False):
                             skip_speakers=False,
                             storage_size=c.storage["storage_size"],
                             sample_from_storage_p=c.storage["sample_from_storage_p"],
+                            additive_noise=c.storage["additive_noise"],
                             verbose=verbose)
         # sampler = DistributedSampler(dataset) if num_gpus > 1 else None
         loader = DataLoader(dataset,
diff --git a/TTS/speaker_encoder/config.json b/TTS/speaker_encoder/config.json
index f350779d..d7c959cf 100644
--- a/TTS/speaker_encoder/config.json
+++ b/TTS/speaker_encoder/config.json
@@ -27,7 +27,7 @@
         "trim_db": 60 // threshold for timming silence. Set this according to your dataset.
     },
     "reinit_layers": [],
-    "loss": "ge2e", // "ge2e" to use Generalized End-to-End loss and "angleproto" to use Angular Prototypical loss (new SOTA)
+    "loss": "angleproto", // "ge2e" to use Generalized End-to-End loss and "angleproto" to use Angular Prototypical loss (new SOTA)
     "grad_clip": 3.0, // upper limit for gradients for clipping.
     "epochs": 1000, // total number of epochs to train.
     "lr": 0.0001, // Initial learning rate. If Noam decay is active, maximum learning rate.
@@ -35,12 +35,12 @@
     "warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr"
     "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging.
     "steps_plot_stats": 10, // number of steps to plot embeddings.
-    "num_speakers_in_batch": 32, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'.
+    "num_speakers_in_batch": 64, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'.
     "num_loader_workers": 4, // number of training data loader processes. Don't set it too big. 4-8 are good values.
     "wd": 0.000001, // Weight decay weight.
"checkpoint": true, // If true, it saves checkpoints per "save_step" "save_step": 1000, // Number of training steps expected to save traning stats and checkpoints. - "print_step": 1, // Number of steps to log traning on console. + "print_step": 20, // Number of steps to log traning on console. "output_path": "../../MozillaTTSOutput/checkpoints/voxceleb_librispeech/speaker_encoder/", // DATASET-RELATED: output path for all training outputs. "model": { "input_dim": 40, @@ -51,7 +51,8 @@ }, "storage": { "sample_from_storage_p": 0.42, // the probability with which we'll sample from the DataSet in-memory storage - "storage_size": 5 // the size of the in-memory storage with respect to a single batch + "storage_size": 5, // the size of the in-memory storage with respect to a single batch + "additive_noise": 1e-5 // add very small gaussian noise to the data in order to increase robustness }, "datasets": [ @@ -60,42 +61,42 @@ "path": "../../audio-datasets/en/VCTK-Corpus/", "meta_file_train": null, "meta_file_val": null + }, + { + "name": "libri_tts", + "path": "../../audio-datasets/en/LibriTTS/train-clean-100", + "meta_file_train": null, + "meta_file_val": null + }, + { + "name": "libri_tts", + "path": "../../audio-datasets/en/LibriTTS/train-clean-360", + "meta_file_train": null, + "meta_file_val": null + }, + { + "name": "libri_tts", + "path": "../../audio-datasets/en/LibriTTS/train-other-500", + "meta_file_train": null, + "meta_file_val": null + }, + { + "name": "voxceleb1", + "path": "../../audio-datasets/en/voxceleb1/", + "meta_file_train": null, + "meta_file_val": null + }, + { + "name": "voxceleb2", + "path": "../../audio-datasets/en/voxceleb2/", + "meta_file_train": null, + "meta_file_val": null + }, + { + "name": "common_voice_wav", + "path": "../../audio-datasets/en/MozillaCommonVoice", + "meta_file_train": "train.tsv", + "meta_file_val": "test.tsv" } -// { -// "name": "libri_tts", -// "path": "../../audio-datasets/en/LibriTTS/train-clean-100", -// "meta_file_train": null, -// "meta_file_val": null -// }, -// { -// "name": "libri_tts", -// "path": "../../audio-datasets/en/LibriTTS/train-clean-360", -// "meta_file_train": null, -// "meta_file_val": null -// }, -// { -// "name": "libri_tts", -// "path": "../../audio-datasets/en/LibriTTS/train-other-500", -// "meta_file_train": null, -// "meta_file_val": null -// }, -// { -// "name": "voxceleb1", -// "path": "../../audio-datasets/en/voxceleb1/", -// "meta_file_train": null, -// "meta_file_val": null -// }, -// { -// "name": "voxceleb2", -// "path": "../../audio-datasets/en/voxceleb2/", -// "meta_file_train": null, -// "meta_file_val": null -// }, -// { -// "name": "common_voice_wav", -// "path": "../../audio-datasets/en/MozillaCommonVoice", -// "meta_file_train": "train.tsv", -// "meta_file_val": "test.tsv" -// } ] } \ No newline at end of file diff --git a/TTS/speaker_encoder/dataset.py b/TTS/speaker_encoder/dataset.py index 3f3db88d..05709080 100644 --- a/TTS/speaker_encoder/dataset.py +++ b/TTS/speaker_encoder/dataset.py @@ -1,3 +1,4 @@ +import numpy import numpy as np import queue import torch @@ -8,7 +9,7 @@ from tqdm import tqdm class MyDataset(Dataset): def __init__(self, ap, meta_data, voice_len=1.6, num_speakers_in_batch=64, - storage_size=1, sample_from_storage_p=0.5, + storage_size=1, sample_from_storage_p=0.5, additive_noise=0, num_utter_per_speaker=10, skip_speakers=False, verbose=False): """ Args: @@ -29,6 +30,7 @@ class MyDataset(Dataset): self.__parse_items() self.storage = 
queue.Queue(maxsize=storage_size*num_speakers_in_batch)
         self.sample_from_storage_p = float(sample_from_storage_p)
+        self.additive_noise = float(additive_noise)
         if self.verbose:
             print("\n > DataLoader initialization")
             print(f" | > Storage Size: {self.storage.maxsize} speakers, each with {num_utter_per_speaker} utters")
@@ -150,6 +152,11 @@ class MyDataset(Dataset):
                 # put the newly loaded item into storage
                 self.storage.put_nowait((wavs_, labels_))
 
+            # add random gaussian noise
+            if self.additive_noise > 0:
+                noises_ = [numpy.random.normal(0, self.additive_noise, size=len(w)) for w in wavs_]
+                wavs_ = [wavs_[i] + noises_[i] for i in range(len(wavs_))]
+
             # get a random subset of each of the wavs and convert to a mel spectrogram.

From 6b0621c794fb686c87b7268e3141147072c3058f Mon Sep 17 00:00:00 2001
From: mueller
Date: Thu, 17 Sep 2020 16:46:43 +0200
Subject: [PATCH 08/10] cleanup

---
 TTS/bin/train_encoder.py        | 8 ++++----
 TTS/speaker_encoder/config.json | 7 ++++---
 TTS/speaker_encoder/dataset.py  | 2 ++
 TTS/tts/utils/generic_utils.py  | 9 ++++++---
 4 files changed, 16 insertions(+), 10 deletions(-)

diff --git a/TTS/bin/train_encoder.py b/TTS/bin/train_encoder.py
index dae3ebac..3222c278 100644
--- a/TTS/bin/train_encoder.py
+++ b/TTS/bin/train_encoder.py
@@ -42,7 +42,8 @@ def setup_loader(ap, is_val=False, verbose=False):
         dataset = MyDataset(ap,
                             meta_data_eval if is_val else meta_data_train,
                             voice_len=1.6,
-                            num_utter_per_speaker=10,
+                            num_utter_per_speaker=c.num_utters_per_speaker,
+                            num_speakers_in_batch=c.num_speakers_in_batch,
                             skip_speakers=False,
                             storage_size=c.storage["storage_size"],
                             sample_from_storage_p=c.storage["sample_from_storage_p"],
@@ -98,11 +99,10 @@ def train(model, criterion, optimizer, scheduler, ap, global_step):
         epoch_time += step_time
 
         # Averaged Loss and Averaged Loader Time
-        dataset_number_prefetched = 2 * c.num_loader_workers  # this is hardcoded in pytorch
         avg_loss = 0.01 * loss.item() \
                    + 0.99 * avg_loss if avg_loss != 0 else loss.item()
-        avg_loader_time = 1/dataset_number_prefetched * loader_time\
-                          + (dataset_number_prefetched-1) / dataset_number_prefetched * avg_loader_time if avg_loader_time != 0 else loader_time
+        avg_loader_time = 1/c.num_loader_workers * loader_time + \
+                          (c.num_loader_workers-1) / c.num_loader_workers * avg_loader_time if avg_loader_time != 0 else loader_time
         current_lr = optimizer.param_groups[0]['lr']
 
diff --git a/TTS/speaker_encoder/config.json b/TTS/speaker_encoder/config.json
index d7c959cf..332f58bb 100644
--- a/TTS/speaker_encoder/config.json
+++ b/TTS/speaker_encoder/config.json
@@ -36,7 +36,8 @@
     "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging.
     "steps_plot_stats": 10, // number of steps to plot embeddings.
     "num_speakers_in_batch": 64, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'.
-    "num_loader_workers": 4, // number of training data loader processes. Don't set it too big. 4-8 are good values.
+    "num_utters_per_speaker": 10, // number of utterances sampled per speaker in each batch.
+    "num_loader_workers": 8, // number of training data loader processes. Don't set it too big. 4-8 are good values.
     "wd": 0.000001, // Weight decay weight.
"checkpoint": true, // If true, it saves checkpoints per "save_step" "save_step": 1000, // Number of training steps expected to save traning stats and checkpoints. @@ -50,8 +51,8 @@ "use_lstm_with_projection": true }, "storage": { - "sample_from_storage_p": 0.42, // the probability with which we'll sample from the DataSet in-memory storage - "storage_size": 5, // the size of the in-memory storage with respect to a single batch + "sample_from_storage_p": 0.66, // the probability with which we'll sample from the DataSet in-memory storage + "storage_size": 15, // the size of the in-memory storage with respect to a single batch "additive_noise": 1e-5 // add very small gaussian noise to the data in order to increase robustness }, "datasets": diff --git a/TTS/speaker_encoder/dataset.py b/TTS/speaker_encoder/dataset.py index 05709080..38757ce9 100644 --- a/TTS/speaker_encoder/dataset.py +++ b/TTS/speaker_encoder/dataset.py @@ -33,8 +33,10 @@ class MyDataset(Dataset): self.additive_noise = float(additive_noise) if self.verbose: print("\n > DataLoader initialization") + print(f" | > Speakers per Batch: {num_speakers_in_batch}") print(f" | > Storage Size: {self.storage.maxsize} speakers, each with {num_utter_per_speaker} utters") print(f" | > Sample_from_storage_p : {self.sample_from_storage_p}") + print(f" | > Noise added : {self.additive_noise}") print(f" | > Number of instances : {len(self.items)}") print(f" | > Sequence length: {self.seq_len}") print(f" | > Num speakers: {len(self.speakers)}") diff --git a/TTS/tts/utils/generic_utils.py b/TTS/tts/utils/generic_utils.py index 6eaa2358..6358e5a9 100644 --- a/TTS/tts/utils/generic_utils.py +++ b/TTS/tts/utils/generic_utils.py @@ -7,11 +7,9 @@ from TTS.utils.generic_utils import check_argument def split_dataset(items): - is_multi_speaker = False speakers = [item[-1] for item in items] is_multi_speaker = len(set(speakers)) > 1 - eval_split_size = 500 if len(items) * 0.01 > 500 else int( - len(items) * 0.01) + eval_split_size = min(500, int(len(items) * 0.01)) assert eval_split_size > 0, " [!] You do not have enough samples to train. You need at least 100 samples." 
np.random.seed(0) np.random.shuffle(items) @@ -142,6 +140,11 @@ def check_config(c): check_argument('do_trim_silence', c['audio'], restricted=True, val_type=bool) check_argument('trim_db', c['audio'], restricted=True, val_type=int) + # storage parameters + check_argument('sample_from_storage_p', c['storage'], restricted=True, val_type=float, min_val=0.0, max_val=1.0) + check_argument('storage_size', c['storage'], restricted=True, val_type=int, min_val=1, max_val=100) + check_argument('additive_noise', c['storage'], restricted=True, val_type=float, min_val=0.0, max_val=1.0) + # training parameters check_argument('batch_size', c, restricted=True, val_type=int, min_val=1) check_argument('eval_batch_size', c, restricted=True, val_type=int, min_val=1) From 45b3c3d1b039d0e3a34ade3df7b4e1c7fc145cf5 Mon Sep 17 00:00:00 2001 From: mueller91 Date: Mon, 21 Sep 2020 11:57:38 +0200 Subject: [PATCH 09/10] fix: Update common_voice.tsv and test_preprocessors.py to current .tsv format (common_voice.tsv is the first 6 lines of the dev.tsv) --- tests/inputs/common_voice.tsv | 16 ++++++---------- tests/test_preprocessors.py | 18 ++++-------------- 2 files changed, 10 insertions(+), 24 deletions(-) diff --git a/tests/inputs/common_voice.tsv b/tests/inputs/common_voice.tsv index a6ea30dd..39fc4190 100644 --- a/tests/inputs/common_voice.tsv +++ b/tests/inputs/common_voice.tsv @@ -1,10 +1,6 @@ -client_id path sentence up_votes down_votes age gender accent -aa7af576605fee2c78c26b85497c64cb9c9fd97228071f8666d9f49f15bce01899bbb930fa60b76d212091d779d83b92e0b54c73cbb21d2c7e1eedc817e41cb3 21fce545b24d9a5af0403b949e95e8dd3c10c4ff3e371f14e4d5b4ebf588670b7c9e618285fc872d94a89ed7f0217d9019fe5de33f1577b49dcd518eacf63c4b Man sollte den Länderfinanzausgleich durch einen Bundesliga-Soli ersetzen. 2 0 fourties male germany -aa7af576605fee2c78c26b85497c64cb9c9fd97228071f8666d9f49f15bce01899bbb930fa60b76d212091d779d83b92e0b54c73cbb21d2c7e1eedc817e41cb3 42758baa4e91ef6b82b78b11a04bc5117a035a8d3bc42c33c0bb3084909af17043a194cfd8cd9839f0d6ef1ea5413acda5de5d1936abcc8ca073e2da7f9488ea Folgende Lektüre kann ich Ihnen zum Thema Kognitionspsychologie empfehlen. 2 0 fourties male germany -aa7af576605fee2c78c26b85497c64cb9c9fd97228071f8666d9f49f15bce01899bbb930fa60b76d212091d779d83b92e0b54c73cbb21d2c7e1eedc817e41cb3 478f172c2dbda6675247e9674ade79a5b49efeefb7c9e99040dcc69a847a01d69398cf180570859b0cdb6fc887717e04cd8b149c723d48d00b5d18f41314667c Touristen winkten den Leuten am Ufer zu. 2 0 fourties male germany -aa7af576605fee2c78c26b85497c64cb9c9fd97228071f8666d9f49f15bce01899bbb930fa60b76d212091d779d83b92e0b54c73cbb21d2c7e1eedc817e41cb3 4854368d6d21cb44103e432b5332f31e8d14030582a40850501bcf9377d699314a5ff27a8206fa89254ddde7f3f1c65d33836f3dfcfa16bcabec08537f2b5f08 Valentin hat das Handtuch geworfen. 2 0 fourties male germany -aa7af576605fee2c78c26b85497c64cb9c9fd97228071f8666d9f49f15bce01899bbb930fa60b76d212091d779d83b92e0b54c73cbb21d2c7e1eedc817e41cb3 a841a9f3e032495dd47560e65fba99eeacb3618c07de8b1351c20188e5b71e33cc52f73315f721a3a24b65763c65bb52fbf3ae052eb5774e834dcb57f296db5c Ohne Gehörschutz bei der Arbeit wäre Klaus wohl nach zwei Wochen taub. 2 0 fourties male germany -aa7af576605fee2c78c26b85497c64cb9c9fd97228071f8666d9f49f15bce01899bbb930fa60b76d212091d779d83b92e0b54c73cbb21d2c7e1eedc817e41cb3 03ab970a5bf5410bc3260b073cce1c7f49c688ace83dc8836b1c0f79a09fea45a27725c769f4a9d2e6181defd016d22642789d7ac51da252b42958a9192bd4c7 Gerrit erinnerte sich daran, dass er einst einen Eid geschworen hatte. 
2 0 fourties male germany -aa7af576605fee2c78c26b85497c64cb9c9fd97228071f8666d9f49f15bce01899bbb930fa60b76d212091d779d83b92e0b54c73cbb21d2c7e1eedc817e41cb3 c4a94df443ad5f2c7241413ef7145d5f0de41ae929759073917fe96166da3c7d3a612c920ed7b0f3d5950a38d6205e9dba24af5bfb27e390a220d004e6e26744 Auf das, was jetzt kommt, habe ich nämlich absolut keinen Bock. 2 0 fourties male germany -aa7af576605fee2c78c26b85497c64cb9c9fd97228071f8666d9f49f15bce01899bbb930fa60b76d212091d779d83b92e0b54c73cbb21d2c7e1eedc817e41cb3 104695983b1112229b4a48696405d044dad9ddef713aa6eb1a6240cc16b7b7a2a96354ae9da99783850dde08a982091e48d3037288a3a58269cac9fe70a6bd7a Von Salzburg ist es doch nicht weit bis zum Chiemsee. 2 0 fourties male germany -d5b5da343bb0f65e3580bc2e1902a4f5d004241488d751503f2020bc1c93f89715e355e35f6e25def2b90cb3eea99fda403eb92ae3afbb84d039a54a4ed2d875 ad2f69e053b0e20e01c82b9821fe5787f1cc8e4b0b97f0e4cab1e9a652c577169c8244fb222281a60ee3081854014113e04c4ca43643100b7c01dab0fac11974 Warum werden da keine strafrechtlichen Konsequenzen gezogen? 2 0 thirties male germany +client_id path sentence up_votes down_votes age gender accent locale segment +95324d489b122a800b840e0b0d068f7363a1a6c2cd2e7365672cc7033e38deaa794bd59edcf8196aa35c9791652b9085ac3839a98bb50ebab4a1e8538a94846b common_voice_en_20005954.mp3 The applicants are invited for coffee and visa is given immediately. 3 0 en +95324d489b122a800b840e0b0d068f7363a1a6c2cd2e7365672cc7033e38deaa794bd59edcf8196aa35c9791652b9085ac3839a98bb50ebab4a1e8538a94846b common_voice_en_20005955.mp3 Developmental robotics is related to, but differs from, evolutionary robotics. 2 0 en +95324d489b122a800b840e0b0d068f7363a1a6c2cd2e7365672cc7033e38deaa794bd59edcf8196aa35c9791652b9085ac3839a98bb50ebab4a1e8538a94846b common_voice_en_20005956.mp3 The musical was originally directed and choreographed by Alan Lund. 2 0 en +954a4181ae9fba89d1b1570f2ae148b3ee18ee2311de978e698f598db859f830d93d35574596d713518e8c96cdae01fce7a08c60c2e0a22bcf01e020924440a6 common_voice_en_19737073.mp3 He graduated from Columbia High School, in Brown County, South Dakota. 2 0 en +954a4181ae9fba89d1b1570f2ae148b3ee18ee2311de978e698f598db859f830d93d35574596d713518e8c96cdae01fce7a08c60c2e0a22bcf01e020924440a6 common_voice_en_19737074.mp3 Competition for limited resources has also resulted in some local conflicts. 2 0 en diff --git a/tests/test_preprocessors.py b/tests/test_preprocessors.py index 1c7ad46f..8c7b16b0 100644 --- a/tests/test_preprocessors.py +++ b/tests/test_preprocessors.py @@ -11,18 +11,8 @@ class TestPreprocessors(unittest.TestCase): root_path = get_tests_input_path() meta_file = "common_voice.tsv" items = common_voice(root_path, meta_file) - assert items[0][0] == "Man sollte den Länderfinanzausgleich durch " \ - "einen Bundesliga-Soli ersetzen." - assert items[0][1] == os.path.join(get_tests_input_path(), "clips", - "21fce545b24d9a5af0403b949e95e8dd3" - "c10c4ff3e371f14e4d5b4ebf588670b7c" - "9e618285fc872d94a89ed7f0217d9019f" - "e5de33f1577b49dcd518eacf63c4b.wav") + assert items[0][0] == 'The applicants are invited for coffee and visa is given immediately.' + assert items[0][1] == os.path.join(get_tests_input_path(), "clips", "common_voice_en_20005954.wav") - assert items[-1][0] == "Warum werden da keine strafrechtlichen " \ - "Konsequenzen gezogen?" 
- assert items[-1][1] == os.path.join(get_tests_input_path(), "clips", - "ad2f69e053b0e20e01c82b9821fe5787f1" - "cc8e4b0b97f0e4cab1e9a652c577169c82" - "44fb222281a60ee3081854014113e04c4c" - "a43643100b7c01dab0fac11974.wav") + assert items[-1][0] == "Competition for limited resources has also resulted in some local conflicts." + assert items[-1][1] == os.path.join(get_tests_input_path(), "clips", "common_voice_en_19737074.wav") From 9b4aac94a824898bbec135b930f954914f3a19c1 Mon Sep 17 00:00:00 2001 From: mueller91 Date: Mon, 21 Sep 2020 12:13:02 +0200 Subject: [PATCH 10/10] fix: linter issues --- TTS/speaker_encoder/config.json | 16 ++++++++-------- TTS/speaker_encoder/dataset.py | 1 - TTS/tts/datasets/preprocess.py | 31 ++++++++++++++++--------------- 3 files changed, 24 insertions(+), 24 deletions(-) diff --git a/TTS/speaker_encoder/config.json b/TTS/speaker_encoder/config.json index 332f58bb..4fbd84cc 100644 --- a/TTS/speaker_encoder/config.json +++ b/TTS/speaker_encoder/config.json @@ -59,43 +59,43 @@ [ { "name": "vctk_slim", - "path": "../../audio-datasets/en/VCTK-Corpus/", + "path": "../../../audio-datasets/en/VCTK-Corpus/", "meta_file_train": null, "meta_file_val": null }, { "name": "libri_tts", - "path": "../../audio-datasets/en/LibriTTS/train-clean-100", + "path": "../../../audio-datasets/en/LibriTTS/train-clean-100", "meta_file_train": null, "meta_file_val": null }, { "name": "libri_tts", - "path": "../../audio-datasets/en/LibriTTS/train-clean-360", + "path": "../../../audio-datasets/en/LibriTTS/train-clean-360", "meta_file_train": null, "meta_file_val": null }, { "name": "libri_tts", - "path": "../../audio-datasets/en/LibriTTS/train-other-500", + "path": "../../../audio-datasets/en/LibriTTS/train-other-500", "meta_file_train": null, "meta_file_val": null }, { "name": "voxceleb1", - "path": "../../audio-datasets/en/voxceleb1/", + "path": "../../../audio-datasets/en/voxceleb1/", "meta_file_train": null, "meta_file_val": null }, { "name": "voxceleb2", - "path": "../../audio-datasets/en/voxceleb2/", + "path": "../../../audio-datasets/en/voxceleb2/", "meta_file_train": null, "meta_file_val": null }, { - "name": "common_voice_wav", - "path": "../../audio-datasets/en/MozillaCommonVoice", + "name": "common_voice", + "path": "../../../audio-datasets/en/MozillaCommonVoice", "meta_file_train": "train.tsv", "meta_file_val": "test.tsv" } diff --git a/TTS/speaker_encoder/dataset.py b/TTS/speaker_encoder/dataset.py index 38757ce9..33cc4f36 100644 --- a/TTS/speaker_encoder/dataset.py +++ b/TTS/speaker_encoder/dataset.py @@ -64,7 +64,6 @@ class MyDataset(Dataset): def __parse_items(self): self.speaker_to_utters = {} for i in self.items: - text_ = i[0] path_ = i[1] speaker_ = i[2] if speaker_ in self.speaker_to_utters.keys(): diff --git a/TTS/tts/datasets/preprocess.py b/TTS/tts/datasets/preprocess.py index 73a56774..02954c04 100644 --- a/TTS/tts/datasets/preprocess.py +++ b/TTS/tts/datasets/preprocess.py @@ -160,7 +160,7 @@ def nancy(root_path, meta_file): return items -def common_voice_wav(root_path, meta_file): +def common_voice(root_path, meta_file): """Normalize the common voice meta data file to TTS format.""" txt_file = os.path.join(root_path, meta_file) items = [] @@ -258,16 +258,15 @@ def vctk(root_path, meta_files=None, wavs_path='wav48'): def vctk_slim(root_path, meta_files=None, wavs_path='wav48'): - test_speakers = meta_files """homepages.inf.ed.ac.uk/jyamagis/release/VCTK-Corpus.tar.gz""" items = [] - meta_files = glob(f"{os.path.join(root_path,'txt')}/**/*.txt", recursive=True) - for 
meta_file in meta_files: - _, speaker_id, txt_file = os.path.relpath(meta_file, + txt_files = glob(f"{os.path.join(root_path,'txt')}/**/*.txt", recursive=True) + for text_file in txt_files: + _, speaker_id, txt_file = os.path.relpath(text_file, root_path).split(os.sep) file_id = txt_file.split('.')[0] - if isinstance(test_speakers, list): # if is list ignore this speakers ids - if speaker_id in test_speakers: + if isinstance(meta_files, list): # if is list ignore this speakers ids + if speaker_id in meta_files: continue wav_file = os.path.join(root_path, wavs_path, speaker_id, file_id + '.wav') @@ -276,21 +275,21 @@ def vctk_slim(root_path, meta_files=None, wavs_path='wav48'): return items # ======================================== VOX CELEB =========================================== -def voxceleb2(root_path, meta_file): +def voxceleb2(root_path, meta_file=None): """ :param meta_file Used only for consistency with load_meta_data api """ - return _voxcel_x(root_path, voxcel_idx="2") + return _voxcel_x(root_path, meta_file, voxcel_idx="2") -def voxceleb1(root_path, meta_file): +def voxceleb1(root_path, meta_file=None): """ :param meta_file Used only for consistency with load_meta_data api """ - return _voxcel_x(root_path, voxcel_idx="1") + return _voxcel_x(root_path, meta_file, voxcel_idx="1") -def _voxcel_x(root_path, voxcel_idx): +def _voxcel_x(root_path, meta_file, voxcel_idx): assert voxcel_idx in ["1", "2"] expected_count = 148_000 if voxcel_idx == "1" else 1_000_000 voxceleb_path = Path(root_path) @@ -298,7 +297,11 @@ def _voxcel_x(root_path, voxcel_idx): cache_to.parent.mkdir(exist_ok=True) # if not exists meta file, crawl recursively for 'wav' files - if not cache_to.exists(): + if meta_file is not None: + with open(str(meta_file), 'r') as f: + return [x.strip().split('|') for x in f.readlines()] + + elif not cache_to.exists(): cnt = 0 meta_data = "" wav_files = voxceleb_path.rglob("**/*.wav") @@ -316,5 +319,3 @@ def _voxcel_x(root_path, voxcel_idx): with open(str(cache_to), 'r') as f: return [x.strip().split('|') for x in f.readlines()] - -
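
---

Taken together, patches 05-07 amount to the storage-backed sampling sketched
below. This is a condensed illustration rather than the final dataset.py: the
load_wav and melspectrogram callables stand in for the AudioProcessor methods
(they are assumptions of this sketch), and the per-speaker label bookkeeping of
the real collate_fn is omitted.

import queue
import random

import numpy as np


class StorageSampler:
    """Minimal sketch of the in-memory storage with probabilistic re-sampling."""

    def __init__(self, load_wav, melspectrogram, seq_len,
                 storage_size=15, sample_from_storage_p=0.66, additive_noise=1e-5):
        self.load_wav = load_wav              # assumed callable: path -> 1-D float array
        self.melspectrogram = melspectrogram  # assumed callable: wav crop -> mel features
        self.seq_len = seq_len                # crop length in samples
        self.storage = queue.Queue(maxsize=storage_size)
        self.sample_from_storage_p = sample_from_storage_p
        self.additive_noise = additive_noise

    def sample(self, utter_paths):
        # with probability p, reuse a cached group of raw wavs instead of hitting disk
        if random.random() < self.sample_from_storage_p and self.storage.full():
            wavs = random.choice(self.storage.queue)
        else:
            wavs = [self.load_wav(p) for p in utter_paths]
            if self.storage.full():
                self.storage.get_nowait()     # FIFO eviction of the oldest group
            self.storage.put_nowait(wavs)
        # tiny gaussian noise so cached copies never repeat bit-for-bit
        if self.additive_noise > 0:
            wavs = [w + np.random.normal(0, self.additive_noise, size=len(w)) for w in wavs]
        # fresh random crop on every draw; wavs are assumed longer than seq_len,
        # which the real loader enforces while sampling utterances
        offsets = [random.randint(0, len(w) - self.seq_len) for w in wavs]
        return [self.melspectrogram(w[o:o + self.seq_len]) for w, o in zip(wavs, offsets)]

Storing raw wavs rather than computed features (patch 06) is what keeps cached
items from going stale: the noise and the crop offset are re-randomized each time
a cached group is reused, so the encoder never sees the exact same spectrogram twice.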