From 14b209c7e9b9ba336d65152ddf45482ea47b2408 Mon Sep 17 00:00:00 2001
From: Edresson
Date: Sat, 5 Jun 2021 03:12:17 -0300
Subject: [PATCH 01/10] Create a batch for faster inference on LSTM Speaker Encoder

---
 TTS/speaker_encoder/models/lstm.py | 33 +++++++++++++++++++-----------
 1 file changed, 21 insertions(+), 12 deletions(-)

diff --git a/TTS/speaker_encoder/models/lstm.py b/TTS/speaker_encoder/models/lstm.py
index 05a56675..fadada70 100644
--- a/TTS/speaker_encoder/models/lstm.py
+++ b/TTS/speaker_encoder/models/lstm.py
@@ -1,4 +1,5 @@
 import torch
+import numpy as np
 from torch import nn
 
 
@@ -70,24 +71,32 @@ class LSTMSpeakerEncoder(nn.Module):
         d = torch.nn.functional.normalize(d, p=2, dim=1)
         return d
 
-    def compute_embedding(self, x, num_frames=160, overlap=0.5):
+    def compute_embedding(self, x, num_frames=250, num_eval=10, return_mean=True):
         """
         Generate embeddings for a batch of utterances
         x: 1xTxD
         """
-        num_overlap = int(num_frames * overlap)
         max_len = x.shape[1]
-        embed = None
-        cur_iter = 0
-        for offset in range(0, max_len, num_frames - num_overlap):
-            cur_iter += 1
-            end_offset = min(x.shape[1], offset + num_frames)
+
+        if max_len < num_frames:
+            num_frames = max_len
+
+        offsets = np.linspace(0, max_len-num_frames, num=num_eval)
+
+        frames_batch = []
+        for offset in offsets:
+            offset = int(offset)
+            end_offset = int(offset+num_frames)
             frames = x[:, offset:end_offset]
-            if embed is None:
-                embed = self.inference(frames)
-            else:
-                embed += self.inference(frames)
-        return embed / cur_iter
+            frames_batch.append(frames)
+
+        frames_batch = torch.cat(frames_batch, dim=0)
+        embeddings = self.inference(frames_batch)
+
+        if return_mean:
+            embeddings = torch.mean(embeddings, dim=0, keepdim=True)
+
+        return embeddings
 
     def batch_compute_embedding(self, x, seq_lens, num_frames=160, overlap=0.5):
         """

From b0aa18934870cb0120703346c766325af81135bc Mon Sep 17 00:00:00 2001
From: Adam Froghyar
Date: Mon, 14 Jun 2021 10:44:00 +0200
Subject: [PATCH 02/10] Forcing do_trim_silence to False in the extract TTS script

---
 TTS/bin/extract_tts_spectrograms.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/TTS/bin/extract_tts_spectrograms.py b/TTS/bin/extract_tts_spectrograms.py
index ace7464a..4eb79d76 100755
--- a/TTS/bin/extract_tts_spectrograms.py
+++ b/TTS/bin/extract_tts_spectrograms.py
@@ -299,4 +299,5 @@ if __name__ == "__main__":
     args = parser.parse_args()
 
     c = load_config(args.config_path)
+    c.audio['do_trim_silence'] = False  # IMPORTANT!!!!!!!!!!!!!!! disable to align mel
     main(args)

From d85ee901d57b4a08301ef569d3c48dd032508ff7 Mon Sep 17 00:00:00 2001
From: Eren Gölge
Date: Tue, 15 Jun 2021 10:53:53 +0200
Subject: [PATCH 03/10] Fix #571

---
 TTS/bin/extract_tts_spectrograms.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/TTS/bin/extract_tts_spectrograms.py b/TTS/bin/extract_tts_spectrograms.py
index ace7464a..2be9d760 100755
--- a/TTS/bin/extract_tts_spectrograms.py
+++ b/TTS/bin/extract_tts_spectrograms.py
@@ -299,4 +299,5 @@ if __name__ == "__main__":
     args = parser.parse_args()
 
     c = load_config(args.config_path)
+    c.audio['do_trim_silence'] = False
     main(args)

From b74b510d3c9d7dd31e16c1dc5379ea85948e771f Mon Sep 17 00:00:00 2001
From: Edresson
Date: Fri, 18 Jun 2021 14:04:49 -0300
Subject: [PATCH 04/10] Compute embeddings and find characters using config file

---
 TTS/bin/compute_embeddings.py  | 67 ++++++++++++----------------------
 TTS/bin/find_unique_chars.py   | 25 +++++++------
 TTS/tts/datasets/preprocess.py | 34 +++++++++++++----
 3 files changed, 63 insertions(+), 63 deletions(-)

diff --git a/TTS/bin/compute_embeddings.py b/TTS/bin/compute_embeddings.py
index 003da1e5..9ed459a2 100644
--- a/TTS/bin/compute_embeddings.py
+++ b/TTS/bin/compute_embeddings.py
@@ -3,71 +3,44 @@ import glob
 import os
 
 import torch
+import numpy as np
 from tqdm import tqdm
 
 from TTS.speaker_encoder.utils.generic_utils import setup_model
 from TTS.tts.datasets.preprocess import load_meta_data
 from TTS.tts.utils.speakers import SpeakerManager
 from TTS.utils.audio import AudioProcessor
-from TTS.config import load_config, BaseDatasetConfig
+from TTS.config import load_config
 
 parser = argparse.ArgumentParser(
-    description='Compute embedding vectors for each wav file in a dataset. If "target_dataset" is defined, it generates "speakers.json" necessary for training a multi-speaker model.'
+    description='Compute embedding vectors for each wav file in a dataset.'
 )
-parser.add_argument("model_path", type=str, help="Path to model outputs (checkpoint, tensorboard etc.).")
+parser.add_argument("model_path", type=str, help="Path to model checkpoint file.")
 parser.add_argument(
     "config_path",
     type=str,
-    help="Path to config file for training.",
+    help="Path to model config file.",
 )
-parser.add_argument("data_path", type=str, help="Data path for wav files - directory or CSV file")
-parser.add_argument("output_path", type=str, help="path for output speakers.json.")
+
 parser.add_argument(
-    "--target_dataset",
+    "config_dataset_path",
     type=str,
-    default="",
-    help="Target dataset to pick a processor from TTS.tts.dataset.preprocess. Necessary to create a speakers.json file.",
+    help="Path to dataset config file.",
 )
+parser.add_argument("output_path", type=str, help="path for output speakers.json and/or speakers.npy.")
 parser.add_argument("--use_cuda", type=bool, help="flag to set cuda.", default=True)
-parser.add_argument("--separator", type=str, help="Separator used in file if CSV is passed for data_path", default="|")
+parser.add_argument("--save_npy", type=bool, help="flag to save speaker embeddings in .npy format.", default=False)
 args = parser.parse_args()
 
 c = load_config(args.config_path)
+c_dataset = load_config(args.config_dataset_path)
+
 ap = AudioProcessor(**c["audio"])
 
-data_path = args.data_path
-split_ext = os.path.splitext(data_path)
-sep = args.separator
+train_files, dev_files = load_meta_data(c_dataset.datasets, eval_split=True, ignore_generated_eval=True)
 
-if args.target_dataset != "":
-    # if target dataset is defined
-    dataset_config = [
-        BaseDatasetConfig(name=args.target_dataset, path=args.data_path, meta_file_train=None, meta_file_val=None),
-    ]
-    wav_files, _ = load_meta_data(dataset_config, eval_split=False)
-else:
-    # if target dataset is not defined
-    if len(split_ext) > 0 and split_ext[1].lower() == ".csv":
-        # Parse CSV
-        print(f"CSV file: {data_path}")
-        with open(data_path) as f:
-            wav_path = os.path.join(os.path.dirname(data_path), "wavs")
-            wav_files = []
-            print(f"Separator is: {sep}")
-            for line in f:
-                components = line.split(sep)
-                if len(components) != 2:
-                    print("Invalid line")
-                    continue
-                wav_file = os.path.join(wav_path, components[0] + ".wav")
-                # print(f'wav_file: {wav_file}')
-                if os.path.exists(wav_file):
-                    wav_files.append(wav_file)
-        print(f"Count of wavs imported: {len(wav_files)}")
-    else:
-        # Parse all wav files in data_path
-        wav_files = glob.glob(data_path + "/**/*.wav", recursive=True)
+wav_files = train_files + dev_files
 
 # define Encoder model
 model = setup_model(c)
@@ -100,11 +73,19 @@ for idx, wav_file in enumerate(tqdm(wav_files)):
 
 if speaker_mapping:
     # save speaker_mapping if target dataset is defined
-    if '.json' not in args.output_path:
+    if '.json' not in args.output_path and '.npy' not in args.output_path:
         mapping_file_path = os.path.join(args.output_path, "speakers.json")
+        mapping_npy_file_path = os.path.join(args.output_path, "speakers.npy")
     else:
-        mapping_file_path = args.output_path
+        mapping_file_path = args.output_path.replace(".npy", ".json")
+        mapping_npy_file_path = mapping_file_path.replace(".json", ".npy")
+
     os.makedirs(os.path.dirname(mapping_file_path), exist_ok=True)
+
+    if args.save_npy:
+        np.save(mapping_npy_file_path, speaker_mapping)
+        print("Speaker embeddings saved at:", mapping_npy_file_path)
+
     speaker_manager = SpeakerManager()
     # pylint: disable=W0212
     speaker_manager._save_json(mapping_file_path, speaker_mapping)
diff --git a/TTS/bin/find_unique_chars.py b/TTS/bin/find_unique_chars.py
index 7891d65a..8fbc8f8e 100644
--- a/TTS/bin/find_unique_chars.py
+++ b/TTS/bin/find_unique_chars.py
@@ -2,40 +2,41 @@ import argparse
 import os
 from argparse import RawTextHelpFormatter
-
-from TTS.tts.datasets.preprocess import get_preprocessor_by_name
+from TTS.tts.datasets.preprocess import load_meta_data
+from TTS.config import load_config
 
 
 def main():
     # pylint: disable=bad-option-value
     parser = argparse.ArgumentParser(
         description="""Find all the unique characters or phonemes in a dataset.\n\n"""
-        """Target dataset must be defined in TTS.tts.datasets.preprocess\n\n"""
+        """\n\n"""
         """
     Example runs:
-    python TTS/bin/find_unique_chars.py --dataset ljspeech --meta_file /path/to/LJSpeech/metadata.csv
+    python TTS/bin/find_unique_chars.py --config_path config.json
     """,
         formatter_class=RawTextHelpFormatter,
     )
     parser.add_argument(
-        "--dataset", type=str, default="", help="One of the target dataset names in TTS.tts.datasets.preprocess."
+        "--config_path", type=str, help="Path to dataset config file.", required=True
     )
-
-    parser.add_argument("--meta_file", type=str, default=None, help="Path to the transcriptions file of the dataset.")
-
     args = parser.parse_args()
 
-    preprocessor = get_preprocessor_by_name(args.dataset)
-    items = preprocessor(os.path.dirname(args.meta_file), os.path.basename(args.meta_file))
+    c = load_config(args.config_path)
+
+    # load all datasets
+    train_items, dev_items = load_meta_data(c.datasets, eval_split=True, ignore_generated_eval=True)
+    items = train_items + dev_items
+
     texts = "".join(item[0] for item in items)
     chars = set(texts)
     lower_chars = filter(lambda c: c.islower(), chars)
+    chars_force_lower = set([c.lower() for c in chars])
+
     print(f" > Number of unique characters: {len(chars)}")
     print(f" > Unique characters: {''.join(sorted(chars))}")
     print(f" > Unique lower characters: {''.join(sorted(lower_chars))}")
-
+    print(f" > Unique all forced to lower characters: {''.join(sorted(chars_force_lower))}")
 
 if __name__ == "__main__":
     main()
diff --git a/TTS/tts/datasets/preprocess.py b/TTS/tts/datasets/preprocess.py
index 72ab160e..7fbc01b8 100644
--- a/TTS/tts/datasets/preprocess.py
+++ b/TTS/tts/datasets/preprocess.py
@@ -37,7 +37,7 @@ def split_dataset(items):
     return items[:eval_split_size], items[eval_split_size:]
 
 
-def load_meta_data(datasets, eval_split=True):
+def load_meta_data(datasets, eval_split=True, ignore_generated_eval=False):
     meta_data_train_all = []
     meta_data_eval_all = [] if eval_split else None
     for dataset in datasets:
@@ -54,9 +54,11 @@ def load_meta_data(datasets, eval_split=True, ignore_generated_eval=False):
         if eval_split:
             if meta_file_val:
                 meta_data_eval = preprocessor(root_path, meta_file_val)
-            else:
+                meta_data_eval_all += meta_data_eval
+            elif not ignore_generated_eval:
                 meta_data_eval, meta_data_train = split_dataset(meta_data_train)
-            meta_data_eval_all += meta_data_eval
+                meta_data_eval_all += meta_data_eval
+
         meta_data_train_all += meta_data_train
         # load attention masks for duration predictor training
         if dataset.meta_file_attn_mask:
@@ -270,16 +272,20 @@ def libri_tts(root_path, meta_files=None):
     items = []
     if meta_files is None:
         meta_files = glob(f"{root_path}/**/*trans.tsv", recursive=True)
+    else:
+        if isinstance(meta_files, str):
+            meta_files = [os.path.join(root_path, meta_files)]
+
     for meta_file in meta_files:
         _meta_file = os.path.basename(meta_file).split(".")[0]
-        speaker_name = _meta_file.split("_")[0]
-        chapter_id = _meta_file.split("_")[1]
-        _root_path = os.path.join(root_path, f"{speaker_name}/{chapter_id}")
         with open(meta_file, "r") as ttf:
             for line in ttf:
                 cols = line.split("\t")
-                wav_file = os.path.join(_root_path, cols[0] + ".wav")
-                text = cols[1]
+                file_name = cols[0]
+                speaker_name, chapter_id, *_ = cols[0].split("_")
+                _root_path = os.path.join(root_path, f"{speaker_name}/{chapter_id}")
+                wav_file = os.path.join(_root_path, file_name + ".wav")
+                text = cols[2]
                 items.append([text, wav_file, "LTTS_" + speaker_name])
     for item in items:
         assert os.path.exists(item[1]), f" [!] wav files don't exist - {item[1]}"
@@ -355,6 +361,18 @@ def vctk_slim(root_path, meta_files=None, wavs_path="wav48"):
     return items
 
 
+def mls(root_path, meta_files=None):
+    """http://www.openslr.org/94/"""
+    items = []
+    with open(os.path.join(root_path, meta_files), "r") as meta:
+        isTrain = "train" in meta_files
+        for line in meta:
+            file, text = line.split('\t')
+            text = text[:-1]
+            speaker, book, no = file.split('_')
+            wav_file = os.path.join(root_path, "train" if isTrain else "dev", 'audio', speaker, book, file + ".wav")
+            items.append([text, wav_file, "MLS_" + speaker])
+    return items
 
 # ======================================== VOX CELEB ===========================================
 def voxceleb2(root_path, meta_file=None):

From 83644056e368d17e69eaf00b9ef7999bd6d3cfd5 Mon Sep 17 00:00:00 2001
From: Edresson
Date: Fri, 18 Jun 2021 14:32:28 -0300
Subject: [PATCH 05/10] fix Lint checks

---
 TTS/bin/compute_embeddings.py  | 4 ++--
 TTS/bin/find_unique_chars.py   | 4 ++--
 TTS/tts/datasets/preprocess.py | 5 ++---
 tests/test_speaker_encoder.py  | 2 +-
 4 files changed, 7 insertions(+), 8 deletions(-)

diff --git a/TTS/bin/compute_embeddings.py b/TTS/bin/compute_embeddings.py
index 9ed459a2..ab5754f7 100644
--- a/TTS/bin/compute_embeddings.py
+++ b/TTS/bin/compute_embeddings.py
@@ -1,16 +1,16 @@
 import argparse
-import glob
 import os
 
 import torch
 import numpy as np
 from tqdm import tqdm
 
+from TTS.config import load_config
 from TTS.speaker_encoder.utils.generic_utils import setup_model
 from TTS.tts.datasets.preprocess import load_meta_data
 from TTS.tts.utils.speakers import SpeakerManager
 from TTS.utils.audio import AudioProcessor
-from TTS.config import load_config
+
 
 parser = argparse.ArgumentParser(
     description='Compute embedding vectors for each wav file in a dataset.'
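The windowed batching that PATCH 01 introduced into compute_embedding() can be illustrated in isolation. Below is a minimal sketch of the offset computation only; the lengths are hypothetical examples and no encoder model is involved:

    import numpy as np

    # Window placement as in the new compute_embedding() (PATCH 01):
    # num_eval evenly spaced, overlapping windows instead of a fixed-stride loop.
    max_len = 1000    # frames in the utterance (hypothetical)
    num_frames = 250  # window length (PATCH 01 default)
    num_eval = 10     # number of windows (PATCH 01 default)

    offsets = np.linspace(0, max_len - num_frames, num=num_eval)
    windows = [(int(offset), int(offset) + num_frames) for offset in offsets]
    print(windows)
    # ten (start, end) slices covering the utterance; the encoder runs them
    # as a single batch and, with return_mean=True, averages the embeddings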
diff --git a/TTS/bin/find_unique_chars.py b/TTS/bin/find_unique_chars.py
index 8fbc8f8e..fccbc311 100644
--- a/TTS/bin/find_unique_chars.py
+++ b/TTS/bin/find_unique_chars.py
@@ -1,6 +1,5 @@
 """Find all the unique characters in a dataset"""
 import argparse
-import os
 from argparse import RawTextHelpFormatter
 from TTS.tts.datasets.preprocess import load_meta_data
 from TTS.config import load_config
@@ -31,7 +30,8 @@ def main():
     texts = "".join(item[0] for item in items)
     chars = set(texts)
     lower_chars = filter(lambda c: c.islower(), chars)
-    chars_force_lower = set([c.lower() for c in chars])
+    chars_force_lower = [c.lower() for c in chars])
+    chars_force_lower = set(chars_force_lower)
 
     print(f" > Number of unique characters: {len(chars)}")
     print(f" > Unique characters: {''.join(sorted(chars))}")
diff --git a/TTS/tts/datasets/preprocess.py b/TTS/tts/datasets/preprocess.py
index 7fbc01b8..23d3f3c1 100644
--- a/TTS/tts/datasets/preprocess.py
+++ b/TTS/tts/datasets/preprocess.py
@@ -365,12 +365,11 @@ def mls(root_path, meta_files=None):
     """http://www.openslr.org/94/"""
     items = []
     with open(os.path.join(root_path, meta_files), "r") as meta:
-        isTrain = "train" in meta_files
         for line in meta:
             file, text = line.split('\t')
             text = text[:-1]
-            speaker, book, no = file.split('_')
-            wav_file = os.path.join(root_path, "train" if isTrain else "dev", 'audio', speaker, book, file + ".wav")
+            speaker, book, *_ = file.split('_')
+            wav_file = os.path.join(root_path, os.path.dirname(meta_files), 'audio', speaker, book, file + ".wav")
             items.append([text, wav_file, "MLS_" + speaker])
     return items
diff --git a/tests/test_speaker_encoder.py b/tests/test_speaker_encoder.py
index f56a9577..cecbd493 100644
--- a/tests/test_speaker_encoder.py
+++ b/tests/test_speaker_encoder.py
@@ -34,7 +34,7 @@ class LSTMSpeakerEncoderTests(unittest.TestCase):
         assert abs(assert_diff) < 1e-4, f" [!] output_norm has wrong values - {assert_diff}"
         # compute d for a given batch
         dummy_input = T.rand(1, 240, 80)  # B x T x D
-        output = model.compute_embedding(dummy_input, num_frames=160, overlap=0.5)
+        output = model.compute_embedding(dummy_input, num_frames=160, num_eval=5)
         assert output.shape[0] == 1
         assert output.shape[1] == 256
         assert len(output.shape) == 2

From 99d40e98d99ca9c7e848c3b5b19fea1d067c8788 Mon Sep 17 00:00:00 2001
From: Edresson
Date: Fri, 18 Jun 2021 14:59:01 -0300
Subject: [PATCH 06/10] fix Lint checks

---
 TTS/bin/compute_embeddings.py | 1 -
 TTS/bin/find_unique_chars.py  | 2 +-
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/TTS/bin/compute_embeddings.py b/TTS/bin/compute_embeddings.py
index 00a20bdf..5332123d 100644
--- a/TTS/bin/compute_embeddings.py
+++ b/TTS/bin/compute_embeddings.py
@@ -6,7 +6,6 @@ import numpy as np
 from tqdm import tqdm
 
 from TTS.config import load_config
-from TTS.config import BaseDatasetConfig, load_config
 from TTS.speaker_encoder.utils.generic_utils import setup_model
 from TTS.tts.datasets.preprocess import load_meta_data
 from TTS.tts.utils.speakers import SpeakerManager
diff --git a/TTS/bin/find_unique_chars.py b/TTS/bin/find_unique_chars.py
index fccbc311..8ac73235 100644
--- a/TTS/bin/find_unique_chars.py
+++ b/TTS/bin/find_unique_chars.py
@@ -30,7 +30,7 @@ def main():
     texts = "".join(item[0] for item in items)
     chars = set(texts)
     lower_chars = filter(lambda c: c.islower(), chars)
-    chars_force_lower = [c.lower() for c in chars])
+    chars_force_lower = [c.lower() for c in chars]
     chars_force_lower = set(chars_force_lower)
 
     print(f" > Number of unique characters: {len(chars)}")

From 1c4e806f549923169056ae90c51795ffae772f65 Mon Sep 17 00:00:00 2001
From: Edresson
Date: Sun, 27 Jun 2021 03:35:34 -0300
Subject: [PATCH 07/10] use speaker manager on compute embeddings script

---
 TTS/bin/compute_embeddings.py        | 39 +++++-----------------------
 TTS/speaker_encoder/models/lstm.py   |  4 ++-
 TTS/speaker_encoder/models/resnet.py |  9 +++++++
 TTS/tts/utils/speakers.py            | 12 ++++++---
 4 files changed, 27 insertions(+), 37 deletions(-)

diff --git a/TTS/bin/compute_embeddings.py b/TTS/bin/compute_embeddings.py
index 5332123d..e843150b 100644
--- a/TTS/bin/compute_embeddings.py
+++ b/TTS/bin/compute_embeddings.py
@@ -1,15 +1,11 @@
 import argparse
 import os
 
-import torch
-import numpy as np
 from tqdm import tqdm
 
 from TTS.config import load_config
-from TTS.speaker_encoder.utils.generic_utils import setup_model
 from TTS.tts.datasets.preprocess import load_meta_data
 from TTS.tts.utils.speakers import SpeakerManager
-from TTS.utils.audio import AudioProcessor
 
 parser = argparse.ArgumentParser(
     description='Compute embedding vectors for each wav file in a dataset.'
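The hunk that follows replaces the hand-rolled encoder loading above with SpeakerManager. As a minimal sketch of the resulting call pattern (the checkpoint and config paths are hypothetical placeholders):

    from TTS.tts.utils.speakers import SpeakerManager

    # SpeakerManager loads the encoder checkpoint and hides the
    # mel-spectrogram plumbing (see the TTS/tts/utils/speakers.py diff below).
    manager = SpeakerManager(
        encoder_model_path="speaker_encoder_model.pth.tar",
        encoder_config_path="speaker_encoder_config.json",
        use_cuda=False,
    )
    # one embedding (x-vector) per wav clip
    embedding = manager.compute_x_vector_from_clip("sample.wav")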
@@ -28,25 +24,14 @@ parser.add_argument(
 )
 parser.add_argument("output_path", type=str, help="path for output speakers.json and/or speakers.npy.")
 parser.add_argument("--use_cuda", type=bool, help="flag to set cuda.", default=True)
-parser.add_argument("--save_npy", type=bool, help="flag to save speaker embeddings in .npy format.", default=False)
 args = parser.parse_args()
 
-
-c = load_config(args.config_path)
 c_dataset = load_config(args.config_dataset_path)
 
-ap = AudioProcessor(**c["audio"])
-
 train_files, dev_files = load_meta_data(c_dataset.datasets, eval_split=True, ignore_generated_eval=True)
-
 wav_files = train_files + dev_files
 
-# define Encoder model
-model = setup_model(c)
-model.load_state_dict(torch.load(args.model_path)["model"])
-model.eval()
-if args.use_cuda:
-    model.cuda()
+speaker_manager = SpeakerManager(encoder_model_path=args.model_path, encoder_config_path=args.config_path, use_cuda=args.use_cuda)
 
 # compute speaker embeddings
 speaker_mapping = {}
@@ -57,36 +42,24 @@ for idx, wav_file in enumerate(tqdm(wav_files)):
     else:
         speaker_name = None
 
-    mel_spec = ap.melspectrogram(ap.load_wav(wav_file, sr=ap.sample_rate)).T
-    mel_spec = torch.FloatTensor(mel_spec[None, :, :])
-    if args.use_cuda:
-        mel_spec = mel_spec.cuda()
-    embedd = model.compute_embedding(mel_spec)
-    embedd = embedd.detach().cpu().numpy()
+    # extract the embedding
+    embedd = speaker_manager.compute_x_vector_from_clip(wav_file)
 
     # create speaker_mapping if target dataset is defined
     wav_file_name = os.path.basename(wav_file)
     speaker_mapping[wav_file_name] = {}
     speaker_mapping[wav_file_name]["name"] = speaker_name
-    speaker_mapping[wav_file_name]["embedding"] = embedd.flatten().tolist()
+    speaker_mapping[wav_file_name]["embedding"] = embedd
 
 if speaker_mapping:
     # save speaker_mapping if target dataset is defined
-    if '.json' not in args.output_path and '.npy' not in args.output_path:
+    if '.json' not in args.output_path:
         mapping_file_path = os.path.join(args.output_path, "speakers.json")
-        mapping_npy_file_path = os.path.join(args.output_path, "speakers.npy")
     else:
-        mapping_file_path = args.output_path.replace(".npy", ".json")
-        mapping_npy_file_path = mapping_file_path.replace(".json", ".npy")
+        mapping_file_path = args.output_path
 
     os.makedirs(os.path.dirname(mapping_file_path), exist_ok=True)
 
-    if args.save_npy:
-        np.save(mapping_npy_file_path, speaker_mapping)
-        print("Speaker embeddings saved at:", mapping_npy_file_path)
-
     speaker_manager = SpeakerManager()
     # pylint: disable=W0212
     speaker_manager._save_json(mapping_file_path, speaker_mapping)
     print("Speaker embeddings saved at:", mapping_file_path)
diff --git a/TTS/speaker_encoder/models/lstm.py b/TTS/speaker_encoder/models/lstm.py
index fadada70..21439d6b 100644
--- a/TTS/speaker_encoder/models/lstm.py
+++ b/TTS/speaker_encoder/models/lstm.py
@@ -119,9 +119,11 @@ class LSTMSpeakerEncoder(nn.Module):
         return embed / num_iters
 
     # pylint: disable=unused-argument, redefined-builtin
-    def load_checkpoint(self, config: dict, checkpoint_path: str, eval: bool = False):
+    def load_checkpoint(self, config: dict, checkpoint_path: str, eval: bool = False, use_cuda: bool = False):
         state = torch.load(checkpoint_path, map_location=torch.device("cpu"))
         self.load_state_dict(state["model"])
+        if use_cuda:
+            self.cuda()
         if eval:
             self.eval()
             assert not self.training
diff --git a/TTS/speaker_encoder/models/resnet.py b/TTS/speaker_encoder/models/resnet.py
index ce86b01f..29f3ae61 100644
--- a/TTS/speaker_encoder/models/resnet.py
+++ b/TTS/speaker_encoder/models/resnet.py
@@ -199,3 +199,12 @@ class ResNetSpeakerEncoder(nn.Module):
             embeddings = torch.mean(embeddings, dim=0, keepdim=True)
 
         return embeddings
+
+    def load_checkpoint(self, config: dict, checkpoint_path: str, eval: bool = False, use_cuda: bool = False):
+        state = torch.load(checkpoint_path, map_location=torch.device("cpu"))
+        self.load_state_dict(state["model"])
+        if use_cuda:
+            self.cuda()
+        if eval:
+            self.eval()
+            assert not self.training
\ No newline at end of file
diff --git a/TTS/tts/utils/speakers.py b/TTS/tts/utils/speakers.py
index 84da1f72..1b8c054d 100755
--- a/TTS/tts/utils/speakers.py
+++ b/TTS/tts/utils/speakers.py
@@ -133,6 +133,7 @@ class SpeakerManager:
         speaker_id_file_path: str = "",
         encoder_model_path: str = "",
         encoder_config_path: str = "",
+        use_cuda: bool = False,
     ):
 
         self.x_vectors = None
@@ -140,6 +141,7 @@ class SpeakerManager:
         self.clip_ids = None
         self.speaker_encoder = None
         self.speaker_encoder_ap = None
+        self.use_cuda = use_cuda
 
         if x_vectors_file_path:
             self.load_x_vectors_file(x_vectors_file_path)
@@ -215,17 +217,19 @@ class SpeakerManager:
     def init_speaker_encoder(self, model_path: str, config_path: str) -> None:
         self.speaker_encoder_config = load_config(config_path)
         self.speaker_encoder = setup_model(self.speaker_encoder_config)
-        self.speaker_encoder.load_checkpoint(config_path, model_path, True)
+        self.speaker_encoder.load_checkpoint(config_path, model_path, eval=True, use_cuda=self.use_cuda)
         self.speaker_encoder_ap = AudioProcessor(**self.speaker_encoder_config.audio)
         # normalize the input audio level and trim silences
-        self.speaker_encoder_ap.do_sound_norm = True
-        self.speaker_encoder_ap.do_trim_silence = True
+        # self.speaker_encoder_ap.do_sound_norm = True
+        # self.speaker_encoder_ap.do_trim_silence = True
 
     def compute_x_vector_from_clip(self, wav_file: Union[str, list]) -> list:
         def _compute(wav_file: str):
             waveform = self.speaker_encoder_ap.load_wav(wav_file, sr=self.speaker_encoder_ap.sample_rate)
             spec = self.speaker_encoder_ap.melspectrogram(waveform)
             spec = torch.from_numpy(spec.T)
+            if self.use_cuda:
+                spec = spec.cuda()
             spec = spec.unsqueeze(0)
             x_vector = self.speaker_encoder.compute_embedding(spec)
             return x_vector
@@ -248,6 +252,8 @@ class SpeakerManager:
             feats = torch.from_numpy(feats)
         if feats.ndim == 2:
             feats = feats.unsqueeze(0)
+        if self.use_cuda:
+            feats = feats.cuda()
         return self.speaker_encoder.compute_embedding(feats)
 
     def run_umap(self):

From d906fea08cf96c95d78fa01cdc935fc6a87c6685 Mon Sep 17 00:00:00 2001
From: Edresson
Date: Tue, 13 Jul 2021 02:15:31 -0300
Subject: [PATCH 08/10] lint fix and eval as argparse in extract tts spectrograms

---
 TTS/bin/compute_embeddings.py       | 5 +----
 TTS/bin/extract_tts_spectrograms.py | 3 ++-
 2 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/TTS/bin/compute_embeddings.py b/TTS/bin/compute_embeddings.py
index 19bfbe3a..7719318a 100644
--- a/TTS/bin/compute_embeddings.py
+++ b/TTS/bin/compute_embeddings.py
@@ -1,13 +1,10 @@
 import argparse
-import glob
 import os
 
-import torch
 from tqdm import tqdm
 
-from TTS.config import BaseDatasetConfig, load_config
-from TTS.speaker_encoder.utils.generic_utils import setup_model
+from TTS.config import load_config
 from TTS.tts.datasets import load_meta_data
 from TTS.tts.utils.speakers import SpeakerManager
 
diff --git a/TTS/bin/extract_tts_spectrograms.py b/TTS/bin/extract_tts_spectrograms.py
index 0bd84db1..0e783c2f 100755
--- a/TTS/bin/extract_tts_spectrograms.py
+++ b/TTS/bin/extract_tts_spectrograms.py
@@ -227,7 +227,7 @@ def main(args):  # pylint: disable=redefined-outer-name
     ap = AudioProcessor(**c.audio)
 
     # load data instances
-    meta_data_train, meta_data_eval = load_meta_data(c.datasets, eval_split=True, ignore_generated_eval=True)
+    meta_data_train, meta_data_eval = load_meta_data(c.datasets, eval_split=args.eval, ignore_generated_eval=True)
 
     # use eval and training partitions
     meta_data = meta_data_train + meta_data_eval
@@ -271,6 +271,7 @@ if __name__ == "__main__":
     parser.add_argument("--debug", default=False, action="store_true", help="Save audio files for debug")
     parser.add_argument("--save_audio", default=False, action="store_true", help="Save audio files")
     parser.add_argument("--quantized", action="store_true", help="Save quantized audio files")
+    parser.add_argument("--eval", type=bool, help="compute eval.", default=True)
     args = parser.parse_args()
     c = load_config(args.config_path)

From b1620d1f3f517821507991e1a85a412ab636c4ce Mon Sep 17 00:00:00 2001
From: Edresson
Date: Thu, 15 Jul 2021 03:34:28 -0300
Subject: [PATCH 09/10] remove ignore_generated_eval flag

---
 TTS/bin/compute_embeddings.py       | 4 ++--
 TTS/bin/extract_tts_spectrograms.py | 2 +-
 TTS/bin/find_unique_chars.py        | 2 +-
 TTS/tts/datasets/__init__.py        | 8 +++-----
 4 files changed, 7 insertions(+), 9 deletions(-)

diff --git a/TTS/bin/compute_embeddings.py b/TTS/bin/compute_embeddings.py
index 7719318a..7ea1e4f9 100644
--- a/TTS/bin/compute_embeddings.py
+++ b/TTS/bin/compute_embeddings.py
@@ -32,8 +32,8 @@ args = parser.parse_args()
 
 c_dataset = load_config(args.config_dataset_path)
 
-train_files, dev_files = load_meta_data(c_dataset.datasets, eval_split=args.eval, ignore_generated_eval=True)
-wav_files = train_files + dev_files
+meta_data_train, meta_data_eval = load_meta_data(c_dataset.datasets, eval_split=args.eval)
+wav_files = meta_data_train + meta_data_eval
 
 speaker_manager = SpeakerManager(encoder_model_path=args.model_path, encoder_config_path=args.config_path, use_cuda=args.use_cuda)
 
diff --git a/TTS/bin/extract_tts_spectrograms.py b/TTS/bin/extract_tts_spectrograms.py
index 0e783c2f..1cbc5516 100755
--- a/TTS/bin/extract_tts_spectrograms.py
+++ b/TTS/bin/extract_tts_spectrograms.py
@@ -227,7 +227,7 @@ def main(args):  # pylint: disable=redefined-outer-name
     ap = AudioProcessor(**c.audio)
 
     # load data instances
-    meta_data_train, meta_data_eval = load_meta_data(c.datasets, eval_split=args.eval, ignore_generated_eval=True)
+    meta_data_train, meta_data_eval = load_meta_data(c.datasets, eval_split=args.eval)
 
     # use eval and training partitions
     meta_data = meta_data_train + meta_data_eval
diff --git a/TTS/bin/find_unique_chars.py b/TTS/bin/find_unique_chars.py
index 6273b752..c7c25d80 100644
--- a/TTS/bin/find_unique_chars.py
+++ b/TTS/bin/find_unique_chars.py
@@ -24,7 +24,7 @@ def main():
     c = load_config(args.config_path)
 
     # load all datasets
-    train_items, eval_items = load_meta_data(c.datasets, eval_split=True, ignore_generated_eval=True)
+    train_items, eval_items = load_meta_data(c.datasets, eval_split=True)
     items = train_items + eval_items
 
     texts = "".join(item[0] for item in items)
diff --git a/TTS/tts/datasets/__init__.py b/TTS/tts/datasets/__init__.py
index 736d6ed4..cbae78a7 100644
--- a/TTS/tts/datasets/__init__.py
+++ b/TTS/tts/datasets/__init__.py
@@ -30,7 +30,7 @@ def split_dataset(items):
     return items[:eval_split_size], items[eval_split_size:]
 
 
-def load_meta_data(datasets, eval_split=True, ignore_generated_eval=False):
+def load_meta_data(datasets, eval_split=True):
     meta_data_train_all = []
     meta_data_eval_all = [] if eval_split else None
     for dataset in datasets:
@@ -47,11 +47,9 @@ def load_meta_data(datasets, eval_split=True, ignore_generated_eval=False):
         if eval_split:
             if meta_file_val:
                 meta_data_eval = preprocessor(root_path, meta_file_val)
-                meta_data_eval_all += meta_data_eval
-            elif not ignore_generated_eval:
+            else:
                 meta_data_eval, meta_data_train = split_dataset(meta_data_train)
-                meta_data_eval_all += meta_data_eval
-
+            meta_data_eval_all += meta_data_eval
         meta_data_train_all += meta_data_train
         # load attention masks for duration predictor training
         if dataset.meta_file_attn_mask:

From d5adc35fdfd87a417431a549358711220ad971e1 Mon Sep 17 00:00:00 2001
From: Edresson Casanova
Date: Wed, 21 Jul 2021 07:16:10 -0300
Subject: [PATCH 10/10] Add docstring to compute_embeddings script

---
 TTS/bin/compute_embeddings.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/TTS/bin/compute_embeddings.py b/TTS/bin/compute_embeddings.py
index 7ea1e4f9..f485514a 100644
--- a/TTS/bin/compute_embeddings.py
+++ b/TTS/bin/compute_embeddings.py
@@ -4,13 +4,18 @@ import os
 
 from tqdm import tqdm
 
+from argparse import RawTextHelpFormatter
 from TTS.config import load_config
 from TTS.tts.datasets import load_meta_data
 from TTS.tts.utils.speakers import SpeakerManager
 
 parser = argparse.ArgumentParser(
-    description='Compute embedding vectors for each wav file in a dataset.'
+    description="""Compute embedding vectors for each wav file in a dataset.\n\n"""
+    """
+    Example runs:
+    python TTS/bin/compute_embeddings.py speaker_encoder_model.pth.tar speaker_encoder_config.json dataset_config.json embeddings_output_path/
+    """,
+    formatter_class=RawTextHelpFormatter,
 )
 parser.add_argument("model_path", type=str, help="Path to model checkpoint file.")
 parser.add_argument(
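For reference, a minimal sketch of consuming the speakers.json that the finished script writes, based on the speaker_mapping structure built in PATCH 07; the output path is a hypothetical placeholder:

    import json

    with open("embeddings_output_path/speakers.json") as f:
        speaker_mapping = json.load(f)

    # each key is a wav file name; each value holds the speaker name and
    # the embedding returned by compute_x_vector_from_clip()
    for wav_file_name, entry in speaker_mapping.items():
        print(wav_file_name, entry["name"], len(entry["embedding"]))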