From c670365507e55ec857505792358d4d43b063fe95 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Tue, 8 Mar 2022 14:20:14 +0100 Subject: [PATCH 01/38] Fix VCTK recipe and formatter --- TTS/tts/datasets/formatters.py | 4 ++-- recipes/vctk/vits/train_vits.py | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/TTS/tts/datasets/formatters.py b/TTS/tts/datasets/formatters.py index aacfc647..fa8d79bc 100644 --- a/TTS/tts/datasets/formatters.py +++ b/TTS/tts/datasets/formatters.py @@ -328,7 +328,7 @@ def vctk(root_path, meta_files=None, wavs_path="wav48_silence_trimmed", mic="mic else: wav_file = os.path.join(root_path, wavs_path, speaker_id, file_id + f"_{mic}.{file_ext}") if os.path.exists(wav_file): - items.append([text, wav_file, "VCTK_" + speaker_id]) + items.append({"text": text, "audio_file": wav_file, "speaker_name": "VCTK_" + speaker_id}) else: print(f" [!] wav files don't exist - {wav_file}") return items @@ -348,7 +348,7 @@ def vctk_old(root_path, meta_files=None, wavs_path="wav48"): with open(meta_file, "r", encoding="utf-8") as file_text: text = file_text.readlines()[0] wav_file = os.path.join(root_path, wavs_path, speaker_id, file_id + ".wav") - items.append([text, wav_file, "VCTK_old_" + speaker_id]) + items.append({"text": text, "audio_file": wav_file, "speaker_name": "VCTK_old_" + speaker_id}) return items diff --git a/recipes/vctk/vits/train_vits.py b/recipes/vctk/vits/train_vits.py index dff4eefc..84e8a058 100644 --- a/recipes/vctk/vits/train_vits.py +++ b/recipes/vctk/vits/train_vits.py @@ -53,6 +53,7 @@ config = VitsConfig( epochs=1000, text_cleaner="english_cleaners", use_phonemes=True, + phoneme_language="en", phoneme_cache_path=os.path.join(output_path, "phoneme_cache"), compute_input_seq_cache=True, print_step=25, From d792b78703b644bdf1f1a06686df73d3da7b90a8 Mon Sep 17 00:00:00 2001 From: Edresson Casanova Date: Wed, 9 Mar 2022 12:18:17 -0300 Subject: [PATCH 02/38] Fix multilingual recipe (#1354) --- .../multilingual/vits_tts/train_vits_tts.py | 31 +++++++++++++------ 1 file changed, 21 insertions(+), 10 deletions(-) diff --git a/recipes/multilingual/vits_tts/train_vits_tts.py b/recipes/multilingual/vits_tts/train_vits_tts.py index ac2c21a2..3f35275a 100644 --- a/recipes/multilingual/vits_tts/train_vits_tts.py +++ b/recipes/multilingual/vits_tts/train_vits_tts.py @@ -6,9 +6,11 @@ from trainer import Trainer, TrainerArgs from TTS.config.shared_configs import BaseAudioConfig from TTS.tts.configs.shared_configs import BaseDatasetConfig from TTS.tts.configs.vits_config import VitsConfig +from TTS.tts.models.vits import CharactersConfig from TTS.tts.datasets import load_tts_samples from TTS.tts.models.vits import Vits, VitsArgs from TTS.tts.utils.languages import LanguageManager +from TTS.tts.utils.text.tokenizer import TTSTokenizer from TTS.tts.utils.speakers import SpeakerManager from TTS.utils.audio import AudioProcessor @@ -73,15 +75,16 @@ config = VitsConfig( max_audio_len=160000, output_path=output_path, datasets=dataset_config, - characters={ - "pad": "_", - "eos": "&", - "bos": "*", - "characters": "!¡'(),-.:;¿?abcdefghijklmnopqrstuvwxyzµßàáâäåæçèéêëìíîïñòóôöùúûüąćęłńœśşźżƒабвгдежзийклмнопрстуфхцчшщъыьэюяёєіїґӧ «°±µ»$%&‘’‚“`”„", - "punctuations": "!¡'(),-.:;¿? 
", - "phonemes": None, - "unique": True, - }, + characters=CharactersConfig( + characters_class="TTS.tts.models.vits.VitsCharacters", + pad="", + eos="", + bos="", + blank="", + characters="!¡'(),-.:;¿?abcdefghijklmnopqrstuvwxyzµßàáâäåæçèéêëìíîïñòóôöùúûüąćęłńœśşźżƒабвгдежзийклмнопрстуфхцчшщъыьэюяёєіїґӧ «°±µ»$%&‘’‚“`”„", + punctuations="!¡'(),-.:;¿? ", + phonemes=None, + ), test_sentences=[ [ "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.", @@ -100,6 +103,9 @@ config = VitsConfig( ], ) +# force the convertion of the custom characters to a config attribute +config.from_dict(config.to_dict()) + # init audio processor ap = AudioProcessor(**config.audio.to_dict()) @@ -115,8 +121,13 @@ config.model_args.num_speakers = speaker_manager.num_speakers language_manager = LanguageManager(config=config) config.model_args.num_languages = language_manager.num_languages +# INITIALIZE THE TOKENIZER +# Tokenizer is used to convert text to sequences of token IDs. +# config is updated with the default characters if not defined in the config. +tokenizer, config = TTSTokenizer.init_from_config(config) + # init model -model = Vits(config, speaker_manager, language_manager) +model = Vits(config, ap, tokenizer, speaker_manager, language_manager) # init the trainer and 🚀 trainer = Trainer( From 48f6bb405ac90295368ec53329e87055fbf809bc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Thu, 10 Mar 2022 11:36:38 +0100 Subject: [PATCH 03/38] Fix recipes as to the recent API changes. (#1367) * Fix recipes -> #1366 * Fix trainer docs --- docs/source/main_classes/trainer_api.md | 16 +--------------- recipes/ljspeech/hifigan/train_hifigan.py | 5 ++--- .../multiband_melgan/train_multiband_melgan.py | 5 ++--- .../ljspeech/tacotron2-DDC/train_tacotron_ddc.py | 3 +-- recipes/ljspeech/univnet/train.py | 5 ++--- recipes/multilingual/vits_tts/train_vits_tts.py | 3 +-- 6 files changed, 9 insertions(+), 28 deletions(-) diff --git a/docs/source/main_classes/trainer_api.md b/docs/source/main_classes/trainer_api.md index a5c3cfb7..f765fff7 100644 --- a/docs/source/main_classes/trainer_api.md +++ b/docs/source/main_classes/trainer_api.md @@ -1,17 +1,3 @@ # Trainer API -The {class}`TTS.trainer.Trainer` provides a lightweight, extensible, and feature-complete training run-time. We optimized it for 🐸 but -can also be used for any DL training in different domains. It supports distributed multi-gpu, mixed-precision (apex or torch.amp) training. - - -## Trainer -```{eval-rst} -.. autoclass:: TTS.trainer.Trainer - :members: -``` - -## TrainingArgs -```{eval-rst} -.. 
autoclass:: TTS.trainer.TrainingArgs - :members: -``` \ No newline at end of file +We made the trainer a seprate project on https://github.com/coqui-ai/Trainer diff --git a/recipes/ljspeech/hifigan/train_hifigan.py b/recipes/ljspeech/hifigan/train_hifigan.py index 1e5bbf30..6a739009 100644 --- a/recipes/ljspeech/hifigan/train_hifigan.py +++ b/recipes/ljspeech/hifigan/train_hifigan.py @@ -37,7 +37,7 @@ ap = AudioProcessor(**config.audio.to_dict()) eval_samples, train_samples = load_wav_data(config.data_path, config.eval_split_size) # init model -model = GAN(config) +model = GAN(config, ap) # init the trainer and 🚀 trainer = Trainer( @@ -46,7 +46,6 @@ trainer = Trainer( output_path, model=model, train_samples=train_samples, - eval_samples=eval_samples, - training_assets={"audio_processor": ap}, + eval_samples=eval_samples ) trainer.fit() diff --git a/recipes/ljspeech/multiband_melgan/train_multiband_melgan.py b/recipes/ljspeech/multiband_melgan/train_multiband_melgan.py index 40ff5a00..d5ca9a76 100644 --- a/recipes/ljspeech/multiband_melgan/train_multiband_melgan.py +++ b/recipes/ljspeech/multiband_melgan/train_multiband_melgan.py @@ -37,7 +37,7 @@ ap = AudioProcessor(**config.audio.to_dict()) eval_samples, train_samples = load_wav_data(config.data_path, config.eval_split_size) # init model -model = GAN(config) +model = GAN(config, ap) # init the trainer and 🚀 trainer = Trainer( @@ -46,7 +46,6 @@ trainer = Trainer( output_path, model=model, train_samples=train_samples, - eval_samples=eval_samples, - training_assets={"audio_processor": ap}, + eval_samples=eval_samples ) trainer.fit() diff --git a/recipes/ljspeech/tacotron2-DDC/train_tacotron_ddc.py b/recipes/ljspeech/tacotron2-DDC/train_tacotron_ddc.py index d00f8ed7..a0ff8b02 100644 --- a/recipes/ljspeech/tacotron2-DDC/train_tacotron_ddc.py +++ b/recipes/ljspeech/tacotron2-DDC/train_tacotron_ddc.py @@ -89,7 +89,6 @@ trainer = Trainer( output_path, model=model, train_samples=train_samples, - eval_samples=eval_samples, - training_assets={"audio_processor": ap}, + eval_samples=eval_samples ) trainer.fit() diff --git a/recipes/ljspeech/univnet/train.py b/recipes/ljspeech/univnet/train.py index 19c91925..592b9a76 100644 --- a/recipes/ljspeech/univnet/train.py +++ b/recipes/ljspeech/univnet/train.py @@ -36,7 +36,7 @@ ap = AudioProcessor(**config.audio.to_dict()) eval_samples, train_samples = load_wav_data(config.data_path, config.eval_split_size) # init model -model = GAN(config) +model = GAN(config, ap) # init the trainer and 🚀 trainer = Trainer( @@ -45,7 +45,6 @@ trainer = Trainer( output_path, model=model, train_samples=train_samples, - eval_samples=eval_samples, - training_assets={"audio_processor": ap}, + eval_samples=eval_samples ) trainer.fit() diff --git a/recipes/multilingual/vits_tts/train_vits_tts.py b/recipes/multilingual/vits_tts/train_vits_tts.py index 3f35275a..c4ed0dda 100644 --- a/recipes/multilingual/vits_tts/train_vits_tts.py +++ b/recipes/multilingual/vits_tts/train_vits_tts.py @@ -136,7 +136,6 @@ trainer = Trainer( output_path, model=model, train_samples=train_samples, - eval_samples=eval_samples, - training_assets={"audio_processor": ap}, + eval_samples=eval_samples ) trainer.fit() From 8a007c8834a6212513e6757a04b29558c534ddf5 Mon Sep 17 00:00:00 2001 From: Yanlong Wang Date: Thu, 10 Mar 2022 18:40:06 +0800 Subject: [PATCH 04/38] feat: add docsqa to docs website (#1363) --- docs/source/_templates/page.html | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100644 docs/source/_templates/page.html diff 
--git a/docs/source/_templates/page.html b/docs/source/_templates/page.html new file mode 100644 index 00000000..b86c33e7 --- /dev/null +++ b/docs/source/_templates/page.html @@ -0,0 +1,23 @@ +{% extends "!page.html" %} +{% block scripts %} + {{ super() }} + + + + + + + +{% endblock %} From 07d96f7991b99ae74e6b2bfe167d0bfa1753bf64 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Thu, 10 Mar 2022 12:17:06 +0100 Subject: [PATCH 05/38] Fix DocQA title --- docs/source/_templates/page.html | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/source/_templates/page.html b/docs/source/_templates/page.html index b86c33e7..aab3d977 100644 --- a/docs/source/_templates/page.html +++ b/docs/source/_templates/page.html @@ -3,10 +3,10 @@ {{ super() }} - - From f381e29b912ba85732bde026a011b74b7731aa0f Mon Sep 17 00:00:00 2001 From: Edresson Casanova Date: Thu, 10 Mar 2022 10:54:51 -0300 Subject: [PATCH 06/38] REBASED: Add support for the speaker encoder training using torch spectrograms (#1348) * Add support for the speaker encoder training using torch spectrograms * Remove useless function in speaker encoder dataset class --- TTS/bin/train_encoder.py | 1 + TTS/speaker_encoder/dataset.py | 26 ++++++++------------------ TTS/tts/datasets/formatters.py | 10 +++++----- 3 files changed, 14 insertions(+), 23 deletions(-) diff --git a/TTS/bin/train_encoder.py b/TTS/bin/train_encoder.py index 5828411c..b7424698 100644 --- a/TTS/bin/train_encoder.py +++ b/TTS/bin/train_encoder.py @@ -46,6 +46,7 @@ def setup_loader(ap: AudioProcessor, is_val: bool = False, verbose: bool = False sample_from_storage_p=c.storage["sample_from_storage_p"], verbose=verbose, augmentation_config=c.audio_augmentation, + use_torch_spec=c.model_params.get("use_torch_spec", False), ) # sampler = DistributedSampler(dataset) if num_gpus > 1 else None diff --git a/TTS/speaker_encoder/dataset.py b/TTS/speaker_encoder/dataset.py index 28a23e2f..07fa9246 100644 --- a/TTS/speaker_encoder/dataset.py +++ b/TTS/speaker_encoder/dataset.py @@ -20,6 +20,7 @@ class SpeakerEncoderDataset(Dataset): skip_speakers=False, verbose=False, augmentation_config=None, + use_torch_spec=None, ): """ Args: @@ -37,6 +38,7 @@ class SpeakerEncoderDataset(Dataset): self.skip_speakers = skip_speakers self.ap = ap self.verbose = verbose + self.use_torch_spec = use_torch_spec self.__parse_items() storage_max_size = storage_size * num_speakers_in_batch self.storage = Storage( @@ -72,22 +74,6 @@ class SpeakerEncoderDataset(Dataset): audio = self.ap.load_wav(filename, sr=self.ap.sample_rate) return audio - def load_data(self, idx): - text, wav_file, speaker_name = self.items[idx] - wav = np.asarray(self.load_wav(wav_file), dtype=np.float32) - mel = self.ap.melspectrogram(wav).astype("float32") - # sample seq_len - - assert text.size > 0, self.items[idx]["audio_file"] - assert wav.size > 0, self.items[idx]["audio_file"] - - sample = { - "mel": mel, - "item_idx": self.items[idx]["audio_file"], - "speaker_name": speaker_name, - } - return sample - def __parse_items(self): self.speaker_to_utters = {} for i in self.items: @@ -241,8 +227,12 @@ class SpeakerEncoderDataset(Dataset): self.gaussian_augmentation_config["max_amplitude"], size=len(wav), ) - mel = self.ap.melspectrogram(wav) - feats_.append(torch.FloatTensor(mel)) + + if not self.use_torch_spec: + mel = self.ap.melspectrogram(wav) + feats_.append(torch.FloatTensor(mel)) + else: + feats_.append(torch.FloatTensor(wav)) labels.append(torch.LongTensor(labels_)) feats.extend(feats_) diff 
--git a/TTS/tts/datasets/formatters.py b/TTS/tts/datasets/formatters.py index fa8d79bc..ac3080c3 100644 --- a/TTS/tts/datasets/formatters.py +++ b/TTS/tts/datasets/formatters.py @@ -334,21 +334,21 @@ def vctk(root_path, meta_files=None, wavs_path="wav48_silence_trimmed", mic="mic return items -def vctk_old(root_path, meta_files=None, wavs_path="wav48"): +def vctk_old(root_path, meta_files=None, wavs_path="wav48", ignored_speakers=None): """homepages.inf.ed.ac.uk/jyamagis/release/VCTK-Corpus.tar.gz""" - test_speakers = meta_files items = [] meta_files = glob(f"{os.path.join(root_path,'txt')}/**/*.txt", recursive=True) for meta_file in meta_files: _, speaker_id, txt_file = os.path.relpath(meta_file, root_path).split(os.sep) file_id = txt_file.split(".")[0] - if isinstance(test_speakers, list): # if is list ignore this speakers ids - if speaker_id in test_speakers: + # ignore speakers + if isinstance(ignored_speakers, list): + if speaker_id in ignored_speakers: continue with open(meta_file, "r", encoding="utf-8") as file_text: text = file_text.readlines()[0] wav_file = os.path.join(root_path, wavs_path, speaker_id, file_id + ".wav") - items.append({"text": text, "audio_file": wav_file, "speaker_name": "VCTK_old_" + speaker_id}) + items.append({"text": text, "audio_file": wav_file, "speaker_name": "VCTK_" + speaker_id}) return items From 917f417ac46e8e5da2e37f4a4b3c555fb97e3b16 Mon Sep 17 00:00:00 2001 From: Edresson Casanova Date: Thu, 10 Mar 2022 10:56:09 -0300 Subject: [PATCH 07/38] Add alphas to control language and speaker balancer (#1216) * Add alphas to control language and speaker balancer * Add docs for speaker and language samplers * Change the Samplers weights to float for save memory * Change the test_samplers to unittest format * Add get_sampler method in BaseTTS * Fix rebase issues * Add language and speaker samplers support for DDP training * Rename distributed sampler wrapper * Remove the DistributedSamplerWrapper and use the one from Trainer * Bugfix after rebase * Move the samplers config to tts config --- TTS/config/shared_configs.py | 1 - TTS/tts/configs/shared_configs.py | 17 +++++ TTS/tts/models/base_tts.py | 58 +++++++++------- TTS/tts/models/vits.py | 27 ++------ TTS/tts/utils/languages.py | 10 +-- TTS/tts/utils/speakers.py | 9 +-- tests/data_tests/test_samplers.py | 66 +++++++++++++------ .../test_vits_multilingual_train-d_vectors.py | 9 ++- 8 files changed, 121 insertions(+), 76 deletions(-) diff --git a/TTS/config/shared_configs.py b/TTS/config/shared_configs.py index 6394b264..3ea49796 100644 --- a/TTS/config/shared_configs.py +++ b/TTS/config/shared_configs.py @@ -258,4 +258,3 @@ class BaseTrainingConfig(TrainerConfig): num_loader_workers: int = 0 num_eval_loader_workers: int = 0 use_noise_augment: bool = False - use_language_weighted_sampler: bool = False diff --git a/TTS/tts/configs/shared_configs.py b/TTS/tts/configs/shared_configs.py index f43c6464..a9b56ed4 100644 --- a/TTS/tts/configs/shared_configs.py +++ b/TTS/tts/configs/shared_configs.py @@ -220,6 +220,18 @@ class BaseTTSConfig(BaseTrainingConfig): eval_split_size (float): If between 0.0 and 1.0 represents the proportion of the dataset to include in the evaluation set. If > 1, represents the absolute number of evaluation samples. Defaults to 0.01 (1%). + + use_speaker_weighted_sampler (bool): + Enable / Disable the batch balancer by speaker. Defaults to ```False```. + + speaker_weighted_sampler_alpha (float): + Number that control the influence of the speaker sampler weights. Defaults to ```1.0```. 
+ + use_language_weighted_sampler (bool): + Enable / Disable the batch balancer by language. Defaults to ```False```. + + language_weighted_sampler_alpha (float): + Number that control the influence of the language sampler weights. Defaults to ```1.0```. """ audio: BaseAudioConfig = field(default_factory=BaseAudioConfig) @@ -262,3 +274,8 @@ class BaseTTSConfig(BaseTrainingConfig): # evaluation eval_split_max_size: int = None eval_split_size: float = 0.01 + # weighted samplers + use_speaker_weighted_sampler: bool = False + speaker_weighted_sampler_alpha: float = 1.0 + use_language_weighted_sampler: bool = False + language_weighted_sampler_alpha: float = 1.0 diff --git a/TTS/tts/models/base_tts.py b/TTS/tts/models/base_tts.py index 4e54b947..222f8519 100644 --- a/TTS/tts/models/base_tts.py +++ b/TTS/tts/models/base_tts.py @@ -7,14 +7,15 @@ import torch.distributed as dist from coqpit import Coqpit from torch import nn from torch.utils.data import DataLoader -from torch.utils.data.distributed import DistributedSampler +from trainer.torch import DistributedSampler, DistributedSamplerWrapper from TTS.model import BaseTrainerModel from TTS.tts.datasets.dataset import TTSDataset -from TTS.tts.utils.languages import LanguageManager, get_language_weighted_sampler -from TTS.tts.utils.speakers import SpeakerManager, get_speaker_weighted_sampler +from TTS.tts.utils.languages import LanguageManager, get_language_balancer_weights +from TTS.tts.utils.speakers import SpeakerManager, get_speaker_manager, get_speaker_balancer_weights from TTS.tts.utils.synthesis import synthesis from TTS.tts.utils.visual import plot_alignment, plot_spectrogram +from torch.utils.data.sampler import WeightedRandomSampler # pylint: skip-file @@ -232,6 +233,36 @@ class BaseTTS(BaseTrainerModel): "language_ids": language_ids, } + def get_sampler(self, config: Coqpit, dataset: TTSDataset, num_gpus=1): + weights = None + data_items = dataset.samples + + if getattr(config, "use_language_weighted_sampler", False): + alpha = getattr(config, "language_weighted_sampler_alpha", 1.0) + print(" > Using Language weighted sampler with alpha:", alpha) + weights = get_language_balancer_weights(data_items) * alpha + + if getattr(config, "use_speaker_weighted_sampler", False): + alpha = getattr(config, "speaker_weighted_sampler_alpha", 1.0) + print(" > Using Speaker weighted sampler with alpha:", alpha) + if weights is not None: + weights += get_speaker_balancer_weights(data_items) * alpha + else: + weights = get_speaker_balancer_weights(data_items) * alpha + + if weights is not None: + sampler = WeightedRandomSampler(weights, len(weights)) + else: + sampler = None + + # sampler for DDP + if sampler is None: + sampler = DistributedSampler(dataset) if num_gpus > 1 else None + else: # If a sampler is already defined use this sampler and DDP sampler together + sampler = DistributedSamplerWrapper(sampler) if num_gpus > 1 else sampler + + return sampler + def get_data_loader( self, config: Coqpit, @@ -300,25 +331,8 @@ class BaseTTS(BaseTrainerModel): # sort input sequences from short to long dataset.preprocess_samples() - # sampler for DDP - sampler = DistributedSampler(dataset) if num_gpus > 1 else None - - # Weighted samplers - # TODO: make this DDP amenable - assert not ( - num_gpus > 1 and getattr(config, "use_language_weighted_sampler", False) - ), "language_weighted_sampler is not supported with DistributedSampler" - assert not ( - num_gpus > 1 and getattr(config, "use_speaker_weighted_sampler", False) - ), "speaker_weighted_sampler is not 
supported with DistributedSampler" - - if sampler is None: - if getattr(config, "use_language_weighted_sampler", False): - print(" > Using Language weighted sampler") - sampler = get_language_weighted_sampler(dataset.samples) - elif getattr(config, "use_speaker_weighted_sampler", False): - print(" > Using Language weighted sampler") - sampler = get_speaker_weighted_sampler(dataset.samples) + # get samplers + sampler = self.get_sampler(config, dataset, num_gpus) loader = DataLoader( dataset, diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py index a43e081c..6aa30dfe 100644 --- a/TTS/tts/models/vits.py +++ b/TTS/tts/models/vits.py @@ -13,7 +13,6 @@ from torch import nn from torch.cuda.amp.autocast_mode import autocast from torch.nn import functional as F from torch.utils.data import DataLoader -from torch.utils.data.distributed import DistributedSampler from trainer.trainer_utils import get_optimizer, get_scheduler from TTS.tts.configs.shared_configs import CharactersConfig @@ -24,8 +23,8 @@ from TTS.tts.layers.vits.networks import PosteriorEncoder, ResidualCouplingBlock from TTS.tts.layers.vits.stochastic_duration_predictor import StochasticDurationPredictor from TTS.tts.models.base_tts import BaseTTS from TTS.tts.utils.helpers import generate_path, maximum_path, rand_segments, segment, sequence_mask -from TTS.tts.utils.languages import LanguageManager, get_language_weighted_sampler -from TTS.tts.utils.speakers import SpeakerManager, get_speaker_weighted_sampler +from TTS.tts.utils.languages import LanguageManager +from TTS.tts.utils.speakers import SpeakerManager from TTS.tts.utils.synthesis import synthesis from TTS.tts.utils.text.characters import BaseCharacters, _characters, _pad, _phonemes, _punctuations from TTS.tts.utils.text.tokenizer import TTSTokenizer @@ -1354,31 +1353,15 @@ class Vits(BaseTTS): # sort input sequences from short to long dataset.preprocess_samples() - # sampler for DDP - sampler = DistributedSampler(dataset) if num_gpus > 1 else None - - # Weighted samplers - # TODO: make this DDP amenable - assert not ( - num_gpus > 1 and getattr(config, "use_language_weighted_sampler", False) - ), "language_weighted_sampler is not supported with DistributedSampler" - assert not ( - num_gpus > 1 and getattr(config, "use_speaker_weighted_sampler", False) - ), "speaker_weighted_sampler is not supported with DistributedSampler" - - if sampler is None: - if getattr(config, "use_language_weighted_sampler", False): - print(" > Using Language weighted sampler") - sampler = get_language_weighted_sampler(dataset.samples) - elif getattr(config, "use_speaker_weighted_sampler", False): - print(" > Using Language weighted sampler") - sampler = get_speaker_weighted_sampler(dataset.samples) + # get samplers + sampler = self.get_sampler(config, dataset, num_gpus) loader = DataLoader( dataset, batch_size=config.eval_batch_size if is_eval else config.batch_size, shuffle=False, # shuffle is done in the dataset. drop_last=False, # setting this False might cause issues in AMP training. 
+ sampler=sampler, collate_fn=dataset.collate_fn, num_workers=config.num_eval_loader_workers if is_eval else config.num_loader_workers, pin_memory=False, diff --git a/TTS/tts/utils/languages.py b/TTS/tts/utils/languages.py index 19708c13..7decabb0 100644 --- a/TTS/tts/utils/languages.py +++ b/TTS/tts/utils/languages.py @@ -6,7 +6,6 @@ import fsspec import numpy as np import torch from coqpit import Coqpit -from torch.utils.data.sampler import WeightedRandomSampler from TTS.config import check_config_and_model_args @@ -128,11 +127,14 @@ def _set_file_path(path): return None -def get_language_weighted_sampler(items: list): +def get_language_balancer_weights(items: list): language_names = np.array([item["language"] for item in items]) unique_language_names = np.unique(language_names).tolist() language_ids = [unique_language_names.index(l) for l in language_names] language_count = np.array([len(np.where(language_names == l)[0]) for l in unique_language_names]) weight_language = 1.0 / language_count - dataset_samples_weight = torch.from_numpy(np.array([weight_language[l] for l in language_ids])).double() - return WeightedRandomSampler(dataset_samples_weight, len(dataset_samples_weight)) + # get weight for each sample + dataset_samples_weight = np.array([weight_language[l] for l in language_ids]) + # normalize + dataset_samples_weight = dataset_samples_weight / np.linalg.norm(dataset_samples_weight) + return torch.from_numpy(dataset_samples_weight).float() diff --git a/TTS/tts/utils/speakers.py b/TTS/tts/utils/speakers.py index 99d653e6..078ce3f1 100644 --- a/TTS/tts/utils/speakers.py +++ b/TTS/tts/utils/speakers.py @@ -7,7 +7,6 @@ import fsspec import numpy as np import torch from coqpit import Coqpit -from torch.utils.data.sampler import WeightedRandomSampler from TTS.config import get_from_config_or_model_args_with_default, load_config from TTS.speaker_encoder.utils.generic_utils import setup_speaker_encoder_model @@ -449,11 +448,13 @@ def get_speaker_manager(c: Coqpit, data: List = None, restore_path: str = None, return speaker_manager -def get_speaker_weighted_sampler(items: list): +def get_speaker_balancer_weights(items: list): speaker_names = np.array([item["speaker_name"] for item in items]) unique_speaker_names = np.unique(speaker_names).tolist() speaker_ids = [unique_speaker_names.index(l) for l in speaker_names] speaker_count = np.array([len(np.where(speaker_names == l)[0]) for l in unique_speaker_names]) weight_speaker = 1.0 / speaker_count - dataset_samples_weight = torch.from_numpy(np.array([weight_speaker[l] for l in speaker_ids])).double() - return WeightedRandomSampler(dataset_samples_weight, len(dataset_samples_weight)) + dataset_samples_weight = np.array([weight_speaker[l] for l in speaker_ids]) + # normalize + dataset_samples_weight = dataset_samples_weight / np.linalg.norm(dataset_samples_weight) + return torch.from_numpy(dataset_samples_weight).float() diff --git a/tests/data_tests/test_samplers.py b/tests/data_tests/test_samplers.py index 497a3fb5..12152fb8 100644 --- a/tests/data_tests/test_samplers.py +++ b/tests/data_tests/test_samplers.py @@ -1,10 +1,13 @@ import functools +import unittest + import torch from TTS.config.shared_configs import BaseDatasetConfig from TTS.tts.datasets import load_tts_samples -from TTS.tts.utils.languages import get_language_weighted_sampler +from TTS.tts.utils.languages import get_language_balancer_weights +from TTS.tts.utils.speakers import get_speaker_balancer_weights # Fixing random state to avoid random fails torch.manual_seed(0) @@ 
-25,34 +28,57 @@ dataset_config_pt = BaseDatasetConfig( language="pt-br", ) -# Adding the EN samples twice to create an unbalanced dataset +# Adding the EN samples twice to create a language unbalanced dataset train_samples, eval_samples = load_tts_samples( [dataset_config_en, dataset_config_en, dataset_config_pt], eval_split=True ) +# gerenate a speaker unbalanced dataset +for i, sample in enumerate(train_samples): + if i < 5: + sample["speaker_name"] = "ljspeech-0" + else: + sample["speaker_name"] = "ljspeech-1" + def is_balanced(lang_1, lang_2): return 0.85 < lang_1 / lang_2 < 1.2 -random_sampler = torch.utils.data.RandomSampler(train_samples) -ids = functools.reduce(lambda a, b: a + b, [list(random_sampler) for i in range(100)]) -en, pt = 0, 0 -for index in ids: - if train_samples[index]["language"] == "en": - en += 1 - else: - pt += 1 +class TestSamplers(unittest.TestCase): + def test_language_random_sampler(self): # pylint: disable=no-self-use + random_sampler = torch.utils.data.RandomSampler(train_samples) + ids = functools.reduce(lambda a, b: a + b, [list(random_sampler) for i in range(100)]) + en, pt = 0, 0 + for index in ids: + if train_samples[index]["language"] == "en": + en += 1 + else: + pt += 1 -assert not is_balanced(en, pt), "Random sampler is supposed to be unbalanced" + assert not is_balanced(en, pt), "Random sampler is supposed to be unbalanced" -weighted_sampler = get_language_weighted_sampler(train_samples) -ids = functools.reduce(lambda a, b: a + b, [list(weighted_sampler) for i in range(100)]) -en, pt = 0, 0 -for index in ids: - if train_samples[index]["language"] == "en": - en += 1 - else: - pt += 1 + def test_language_weighted_random_sampler(self): # pylint: disable=no-self-use + weighted_sampler = torch.utils.data.sampler.WeightedRandomSampler(get_language_balancer_weights(train_samples), len(train_samples)) + ids = functools.reduce(lambda a, b: a + b, [list(weighted_sampler) for i in range(100)]) + en, pt = 0, 0 + for index in ids: + if train_samples[index]["language"] == "en": + en += 1 + else: + pt += 1 -assert is_balanced(en, pt), "Weighted sampler is supposed to be balanced" + assert is_balanced(en, pt), "Language Weighted sampler is supposed to be balanced" + + def test_speaker_weighted_random_sampler(self): # pylint: disable=no-self-use + + weighted_sampler = torch.utils.data.sampler.WeightedRandomSampler(get_speaker_balancer_weights(train_samples), len(train_samples)) + ids = functools.reduce(lambda a, b: a + b, [list(weighted_sampler) for i in range(100)]) + spk1, spk2 = 0, 0 + for index in ids: + if train_samples[index]["speaker_name"] == "ljspeech-0": + spk1 += 1 + else: + spk2 += 1 + + assert is_balanced(spk1, spk2), "Speaker Weighted sampler is supposed to be balanced" diff --git a/tests/tts_tests/test_vits_multilingual_train-d_vectors.py b/tests/tts_tests/test_vits_multilingual_train-d_vectors.py index a8e2020e..e12661a5 100644 --- a/tests/tts_tests/test_vits_multilingual_train-d_vectors.py +++ b/tests/tts_tests/test_vits_multilingual_train-d_vectors.py @@ -45,7 +45,7 @@ config = VitsConfig( ["Be a voice, not an echo.", "ljspeech-0", None, "en"], ["Be a voice, not an echo.", "ljspeech-1", None, "pt-br"], ], - datasets=[dataset_config_en, dataset_config_pt], + datasets=[dataset_config_en, dataset_config_en, dataset_config_en, dataset_config_pt], ) # set audio config config.audio.do_trim_silence = True @@ -71,8 +71,11 @@ config.d_vector_dim = 256 config.model_args.use_sdp = True config.use_sdp = True -# deactivate language sampler 
-config.use_language_weighted_sampler = False +# activate language and speaker samplers +config.use_language_weighted_sampler = True +config.language_weighted_sampler_alpha = 10 +config.use_speaker_weighted_sampler = True +config.speaker_weighted_sampler_alpha = 5 config.save_json(config_path) From dbe9da7f15544b83043f481a99e5bcb23e002dc9 Mon Sep 17 00:00:00 2001 From: Edresson Casanova Date: Thu, 10 Mar 2022 10:57:12 -0300 Subject: [PATCH 08/38] Add Voice conversion inference support (#1337) * Add support for voice conversion inference * Cache d_vectors_by_speaker for fast inference using a bigger speakers.json * Rebase bug fix * Use the average d-vector for inference --- TTS/bin/synthesize.py | 20 ++++-- TTS/tts/models/vits.py | 24 ++++++- TTS/tts/utils/speakers.py | 19 +++++- TTS/tts/utils/synthesis.py | 85 +++++++++++++++++++++++ TTS/utils/synthesizer.py | 134 ++++++++++++++++++++++++++++--------- 5 files changed, 241 insertions(+), 41 deletions(-) diff --git a/TTS/bin/synthesize.py b/TTS/bin/synthesize.py index 509b3da6..fe31c510 100755 --- a/TTS/bin/synthesize.py +++ b/TTS/bin/synthesize.py @@ -195,11 +195,22 @@ If you don't specify any models, then it uses LJSpeech based English model. help="If true save raw spectogram for further (vocoder) processing in out_path.", default=False, ) - + parser.add_argument( + "--reference_wav", + type=str, + help="Reference wav file to convert in the voice of the speaker_idx or speaker_wav", + default=None, + ) + parser.add_argument( + "--reference_speaker_idx", + type=str, + help="speaker ID of the reference_wav speaker (If not provided the embedding will be computed using the Speaker Encoder).", + default=None, + ) args = parser.parse_args() # print the description if either text or list_models is not set - if args.text is None and not args.list_models and not args.list_speaker_idxs and not args.list_language_idxs: + if not args.text and not args.list_models and not args.list_speaker_idxs and not args.list_language_idxs and not args.reference_wav: parser.parse_args(["-h"]) # load model manager @@ -281,10 +292,11 @@ If you don't specify any models, then it uses LJSpeech based English model. return # RUN THE SYNTHESIS - print(" > Text: {}".format(args.text)) + if args.text: + print(" > Text: {}".format(args.text)) # kick it - wav = synthesizer.tts(args.text, args.speaker_idx, args.language_idx, args.speaker_wav) + wav = synthesizer.tts(args.text, args.speaker_idx, args.language_idx, args.speaker_wav, reference_wav=args.reference_wav, reference_speaker_name=args.reference_speaker_idx) # save the results print(" > Saving output to {}".format(args.out_path)) diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py index 6aa30dfe..818b9a54 100644 --- a/TTS/tts/models/vits.py +++ b/TTS/tts/models/vits.py @@ -994,6 +994,25 @@ class Vits(BaseTTS): outputs = {"model_outputs": o, "alignments": attn.squeeze(1), "z": z, "z_p": z_p, "m_p": m_p, "logs_p": logs_p} return outputs + @torch.no_grad() + def inference_voice_conversion(self, reference_wav, speaker_id=None, d_vector=None, reference_speaker_id=None, reference_d_vector=None): + """Inference for voice conversion + + Args: + reference_wav (Tensor): Reference wavform. Tensor of shape [B, T] + speaker_id (Tensor): speaker_id of the target speaker. Tensor of shape [B] + d_vector (Tensor): d_vector embedding of target speaker. Tensor of shape `[B, C]` + reference_speaker_id (Tensor): speaker_id of the reference_wav speaker. 
Tensor of shape [B] + reference_d_vector (Tensor): d_vector embedding of the reference_wav speaker. Tensor of shape `[B, C]` + """ + # compute spectrograms + y = wav_to_spec(reference_wav, self.config.audio.fft_size, self.config.audio.hop_length, self.config.audio.win_length, center=False).transpose(1, 2) + y_lengths = torch.tensor([y.size(-1)]).to(y.device) + speaker_cond_src = reference_speaker_id if reference_speaker_id is not None else reference_d_vector + speaker_cond_tgt = speaker_id if speaker_id is not None else d_vector + # print(y.shape, y_lengths.shape) + wav, _, _ = self.voice_conversion(y, y_lengths, speaker_cond_src, speaker_cond_tgt) + return wav def voice_conversion(self, y, y_lengths, speaker_cond_src, speaker_cond_tgt): """Forward pass for voice conversion @@ -1007,12 +1026,11 @@ class Vits(BaseTTS): speaker_cond_tgt (Tensor): Target speaker ID. Tensor of shape [B,] """ assert self.num_speakers > 0, "num_speakers have to be larger than 0." - # speaker embedding if self.args.use_speaker_embedding and not self.args.use_d_vector_file: g_src = self.emb_g(speaker_cond_src).unsqueeze(-1) g_tgt = self.emb_g(speaker_cond_tgt).unsqueeze(-1) - elif self.args.use_speaker_embedding and self.args.use_d_vector_file: + elif not self.args.use_speaker_embedding and self.args.use_d_vector_file: g_src = F.normalize(speaker_cond_src).unsqueeze(-1) g_tgt = F.normalize(speaker_cond_tgt).unsqueeze(-1) else: @@ -1199,7 +1217,7 @@ class Vits(BaseTTS): if speaker_name is None: d_vector = self.speaker_manager.get_random_d_vector() else: - d_vector = self.speaker_manager.get_mean_d_vector(speaker_name, num_samples=1, randomize=False) + d_vector = self.speaker_manager.get_mean_d_vector(speaker_name, num_samples=None, randomize=False) elif config.use_speaker_embedding: if speaker_name is None: speaker_id = self.speaker_manager.get_random_speaker_id() diff --git a/TTS/tts/utils/speakers.py b/TTS/tts/utils/speakers.py index 078ce3f1..c15a3abf 100644 --- a/TTS/tts/utils/speakers.py +++ b/TTS/tts/utils/speakers.py @@ -65,6 +65,7 @@ class SpeakerManager: self.d_vectors = {} self.speaker_ids = {} + self.d_vectors_by_speakers = {} self.clip_ids = [] self.speaker_encoder = None self.speaker_encoder_ap = None @@ -166,6 +167,8 @@ class SpeakerManager: self.speaker_ids = {name: i for i, name in enumerate(speakers)} self.clip_ids = list(set(sorted(clip_name for clip_name in self.d_vectors.keys()))) + # cache d_vectors_by_speakers for fast inference using a bigger speakers.json + self.d_vectors_by_speakers = self.get_d_vectors_by_speakers() def get_d_vector_by_clip(self, clip_idx: str) -> List: """Get d_vector by clip ID. @@ -187,7 +190,21 @@ class SpeakerManager: Returns: List[List]: all the d_vectors of the given speaker. """ - return [x["embedding"] for x in self.d_vectors.values() if x["name"] == speaker_idx] + return self.d_vectors_by_speakers[speaker_idx] + + def get_d_vectors_by_speakers(self) -> Dict: + """Get all d_vectors by speaker. + + Returns: + Dict: all the d_vectors of each speaker. + """ + d_vectors_by_speakers = {} + for x in self.d_vectors.values(): + if x["name"] not in d_vectors_by_speakers.keys(): + d_vectors_by_speakers[x["name"]] = [x["embedding"]] + else: + d_vectors_by_speakers[x["name"]].append(x["embedding"]) + return d_vectors_by_speakers def get_mean_d_vector(self, speaker_idx: str, num_samples: int = None, randomize: bool = False) -> np.ndarray: """Get mean d_vector of a speaker ID. 
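
A minimal usage sketch of the voice-conversion path this patch introduces
(Vits.inference_voice_conversion, transfer_voice below, and the new
reference_wav / reference_speaker_name arguments on Synthesizer.tts). The
checkpoint paths and speaker names are placeholders, and a multi-speaker
VITS model trained with a d-vector file is assumed:

    from TTS.utils.synthesizer import Synthesizer

    synthesizer = Synthesizer(
        tts_checkpoint="checkpoint.pth.tar",  # placeholder multi-speaker VITS checkpoint
        tts_config_path="config.json",        # placeholder config path
        use_cuda=False,
    )
    # Convert the voice in source.wav into the target speaker's voice.
    wav = synthesizer.tts(
        reference_wav="source.wav",
        speaker_name="VCTK_p225",             # target speaker (placeholder name)
        reference_speaker_name="VCTK_p226",   # speaker of source.wav (optional placeholder)
    )
    synthesizer.save_wav(wav, "converted.wav")

The same path is exposed on the command line through the --reference_wav and
--reference_speaker_idx arguments added to TTS/bin/synthesize.py above.
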
diff --git a/TTS/tts/utils/synthesis.py b/TTS/tts/utils/synthesis.py index b6e19ab4..582fb4f1 100644 --- a/TTS/tts/utils/synthesis.py +++ b/TTS/tts/utils/synthesis.py @@ -205,3 +205,88 @@ def synthesis( "outputs": outputs, } return return_dict + +def transfer_voice( + model, + CONFIG, + use_cuda, + reference_wav, + speaker_id=None, + d_vector=None, + reference_speaker_id=None, + reference_d_vector=None, + do_trim_silence=False, + use_griffin_lim=False, +): + """Synthesize voice for the given text using Griffin-Lim vocoder or just compute output features to be passed to + the vocoder model. + + Args: + model (TTS.tts.models): + The TTS model to synthesize audio with. + + CONFIG (Coqpit): + Model configuration. + + use_cuda (bool): + Enable/disable CUDA. + + reference_wav (str): + Path of reference_wav to be used to voice conversion. + + speaker_id (int): + Speaker ID passed to the speaker embedding layer in multi-speaker model. Defaults to None. + + d_vector (torch.Tensor): + d-vector for multi-speaker models in share :math:`[1, D]`. Defaults to None. + + reference_speaker_id (int): + Reference Speaker ID passed to the speaker embedding layer in multi-speaker model. Defaults to None. + + reference_d_vector (torch.Tensor): + Reference d-vector for multi-speaker models in share :math:`[1, D]`. Defaults to None. + + enable_eos_bos_chars (bool): + enable special chars for end of sentence and start of sentence. Defaults to False. + + do_trim_silence (bool): + trim silence after synthesis. Defaults to False. + """ + # pass tensors to backend + if speaker_id is not None: + speaker_id = id_to_torch(speaker_id, cuda=use_cuda) + + if d_vector is not None: + d_vector = embedding_to_torch(d_vector, cuda=use_cuda) + + if reference_d_vector is not None: + reference_d_vector = embedding_to_torch(reference_d_vector, cuda=use_cuda) + + # load reference_wav audio + reference_wav = embedding_to_torch(model.ap.load_wav(reference_wav, sr=model.ap.sample_rate), cuda=use_cuda) + + if hasattr(model, "module"): + _func = model.module.inference_voice_conversion + else: + _func = model.inference_voice_conversion + model_outputs = _func( + reference_wav, + speaker_id, + d_vector, + reference_speaker_id, + reference_d_vector) + + # convert outputs to numpy + # plot results + wav = None + model_outputs = model_outputs.squeeze() + if model_outputs.ndim == 2: # [T, C_spec] + if use_griffin_lim: + wav = inv_spectrogram(model_outputs, model.ap, CONFIG) + # trim silence + if do_trim_silence: + wav = trim_silence(wav, model.ap) + else: # [T,] + wav = model_outputs + + return wav diff --git a/TTS/utils/synthesizer.py b/TTS/utils/synthesizer.py index d1abc907..687794b4 100644 --- a/TTS/utils/synthesizer.py +++ b/TTS/utils/synthesizer.py @@ -10,7 +10,7 @@ from TTS.tts.models import setup_model as setup_tts_model # pylint: disable=unused-wildcard-import # pylint: disable=wildcard-import -from TTS.tts.utils.synthesis import synthesis, trim_silence +from TTS.tts.utils.synthesis import synthesis, transfer_voice, trim_silence from TTS.utils.audio import AudioProcessor from TTS.vocoder.models import setup_model as setup_vocoder_model from TTS.vocoder.utils.generic_utils import interpolate_vocoder_input @@ -114,10 +114,14 @@ class Synthesizer(object): if not self.encoder_checkpoint: self._set_speaker_encoder_paths_from_tts_config() + self.tts_model.load_checkpoint(self.tts_config, tts_checkpoint, eval=True) if use_cuda: self.tts_model.cuda() + if self.encoder_checkpoint and hasattr(self.tts_model, "speaker_manager"): + 
self.tts_model.speaker_manager.init_speaker_encoder(self.encoder_checkpoint, self.encoder_config) + def _set_speaker_encoder_paths_from_tts_config(self): """Set the encoder paths from the tts model config for models with speaker encoders.""" if hasattr(self.tts_config, "model_args") and hasattr( @@ -169,11 +173,13 @@ class Synthesizer(object): def tts( self, - text: str, + text: str = "", speaker_name: str = "", language_name: str = "", speaker_wav: Union[str, List[str]] = None, style_wav=None, + reference_wav=None, + reference_speaker_name=None, ) -> List[int]: """🐸 TTS magic. Run all the models and generate speech. @@ -183,15 +189,23 @@ class Synthesizer(object): language_name (str, optional): language id for multi-language models. Defaults to "". speaker_wav (Union[str, List[str]], optional): path to the speaker wav. Defaults to None. style_wav ([type], optional): style waveform for GST. Defaults to None. - + reference_wav ([type], optional): reference waveform for voice conversion. Defaults to None. + reference_speaker_name ([type], optional): spekaer id of reference waveform. Defaults to None. Returns: List[int]: [description] """ start_time = time.time() wavs = [] - sens = self.split_into_sentences(text) - print(" > Text splitted to sentences.") - print(sens) + + if not text and not reference_wav: + raise ValueError( + "You need to define either `text` (for sythesis) or a `reference_wav` (for voice conversion) to use the Coqui TTS API." + ) + + if text: + sens = self.split_into_sentences(text) + print(" > Text splitted to sentences.") + print(sens) # handle multi-speaker speaker_embedding = None @@ -199,8 +213,8 @@ class Synthesizer(object): if self.tts_speakers_file or hasattr(self.tts_model.speaker_manager, "speaker_ids"): if speaker_name and isinstance(speaker_name, str): if self.tts_config.use_d_vector_file: - # get the speaker embedding from the saved d_vectors. - speaker_embedding = self.tts_model.speaker_manager.get_d_vectors_by_speaker(speaker_name)[0] + # get the average speaker embedding from the saved d_vectors. + speaker_embedding = self.tts_model.speaker_manager.get_mean_d_vector(speaker_name, num_samples=None, randomize=False) speaker_embedding = np.array(speaker_embedding)[None, :] # [1 x embedding_dim] else: # get speaker idx from the speaker name @@ -209,7 +223,7 @@ class Synthesizer(object): elif not speaker_name and not speaker_wav: raise ValueError( " [!] Look like you use a multi-speaker model. " - "You need to define either a `speaker_name` or a `style_wav` to use a multi-speaker model." + "You need to define either a `speaker_name` or a `speaker_wav` to use a multi-speaker model." 
) else: speaker_embedding = None @@ -246,22 +260,83 @@ class Synthesizer(object): use_gl = self.vocoder_model is None - for sen in sens: - # synthesize voice - outputs = synthesis( - model=self.tts_model, - text=sen, - CONFIG=self.tts_config, - use_cuda=self.use_cuda, - speaker_id=speaker_id, - language_id=language_id, - style_wav=style_wav, - use_griffin_lim=use_gl, - d_vector=speaker_embedding, - ) - waveform = outputs["wav"] - mel_postnet_spec = outputs["outputs"]["model_outputs"][0].detach().cpu().numpy() + if not reference_wav: + for sen in sens: + # synthesize voice + outputs = synthesis( + model=self.tts_model, + text=sen, + CONFIG=self.tts_config, + use_cuda=self.use_cuda, + speaker_id=speaker_id, + language_id=language_id, + style_wav=style_wav, + use_griffin_lim=use_gl, + d_vector=speaker_embedding, + ) + waveform = outputs["wav"] + mel_postnet_spec = outputs["outputs"]["model_outputs"][0].detach().cpu().numpy() + if not use_gl: + # denormalize tts output based on tts audio config + mel_postnet_spec = self.tts_model.ap.denormalize(mel_postnet_spec.T).T + device_type = "cuda" if self.use_cuda else "cpu" + # renormalize spectrogram based on vocoder config + vocoder_input = self.vocoder_ap.normalize(mel_postnet_spec.T) + # compute scale factor for possible sample rate mismatch + scale_factor = [ + 1, + self.vocoder_config["audio"]["sample_rate"] / self.tts_model.ap.sample_rate, + ] + if scale_factor[1] != 1: + print(" > interpolating tts model output.") + vocoder_input = interpolate_vocoder_input(scale_factor, vocoder_input) + else: + vocoder_input = torch.tensor(vocoder_input).unsqueeze(0) # pylint: disable=not-callable + # run vocoder model + # [1, T, C] + waveform = self.vocoder_model.inference(vocoder_input.to(device_type)) + if self.use_cuda and not use_gl: + waveform = waveform.cpu() + if not use_gl: + waveform = waveform.numpy() + waveform = waveform.squeeze() + + # trim silence + if self.tts_config.audio["do_trim_silence"] is True: + waveform = trim_silence(waveform, self.tts_model.ap) + + wavs += list(waveform) + wavs += [0] * 10000 + else: + # get the speaker embedding or speaker id for the reference wav file + reference_speaker_embedding = None + reference_speaker_id = None + if self.tts_speakers_file or hasattr(self.tts_model.speaker_manager, "speaker_ids"): + if reference_speaker_name and isinstance(reference_speaker_name, str): + if self.tts_config.use_d_vector_file: + # get the speaker embedding from the saved d_vectors. 
+ reference_speaker_embedding = self.tts_model.speaker_manager.get_d_vectors_by_speaker(reference_speaker_name)[0] + reference_speaker_embedding = np.array(reference_speaker_embedding)[None, :] # [1 x embedding_dim] + else: + # get speaker idx from the speaker name + reference_speaker_id = self.tts_model.speaker_manager.speaker_ids[reference_speaker_name] + else: + reference_speaker_embedding = self.tts_model.speaker_manager.compute_d_vector_from_clip(reference_wav) + + outputs = transfer_voice( + model=self.tts_model, + CONFIG=self.tts_config, + use_cuda=self.use_cuda, + reference_wav=reference_wav, + speaker_id=speaker_id, + d_vector=speaker_embedding, + use_griffin_lim=use_gl, + reference_speaker_id=reference_speaker_id, + reference_d_vector=reference_speaker_embedding + ) + waveform = outputs if not use_gl: + mel_postnet_spec = outputs[0].detach().cpu().numpy() # denormalize tts output based on tts audio config mel_postnet_spec = self.tts_model.ap.denormalize(mel_postnet_spec.T).T device_type = "cuda" if self.use_cuda else "cpu" @@ -280,18 +355,11 @@ class Synthesizer(object): # run vocoder model # [1, T, C] waveform = self.vocoder_model.inference(vocoder_input.to(device_type)) - if self.use_cuda and not use_gl: + if self.use_cuda: waveform = waveform.cpu() if not use_gl: waveform = waveform.numpy() - waveform = waveform.squeeze() - - # trim silence - if self.tts_config.audio["do_trim_silence"] is True: - waveform = trim_silence(waveform, self.tts_model.ap) - - wavs += list(waveform) - wavs += [0] * 10000 + wavs = waveform.squeeze() # compute stats process_time = time.time() - start_time From b0be825d9244ab4be1d4610913f74589bc5f1e49 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Fri, 11 Mar 2022 10:40:20 +0100 Subject: [PATCH 09/38] Update issue template (#1370) * Add bug_report template * Fix typos --- .github/ISSUE_TEMPLATE/bug_report.md | 58 ------------------ .github/ISSUE_TEMPLATE/bug_report.yaml | 85 ++++++++++++++++++++++++++ 2 files changed, 85 insertions(+), 58 deletions(-) delete mode 100644 .github/ISSUE_TEMPLATE/bug_report.md create mode 100644 .github/ISSUE_TEMPLATE/bug_report.yaml diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md deleted file mode 100644 index 133346f6..00000000 --- a/.github/ISSUE_TEMPLATE/bug_report.md +++ /dev/null @@ -1,58 +0,0 @@ ---- -name: 🐛 Bug report -about: Create a bug report to help 🐸 improve -title: '[Bug] ' -labels: bug -assignees: '' - ---- - -## 🐛 Description - - - -### To Reproduce - - - -### Expected behavior - - - -### Environment - - - -- 🐸TTS Version (e.g., 1.3.0): -- PyTorch Version (e.g., 1.8) -- Python version: -- OS (e.g., Linux): -- CUDA/cuDNN version: -- GPU models and configuration: -- How you installed PyTorch (`conda`, `pip`, source): -- Any other relevant information: - -### Additional context - - diff --git a/.github/ISSUE_TEMPLATE/bug_report.yaml b/.github/ISSUE_TEMPLATE/bug_report.yaml new file mode 100644 index 00000000..34cde7e8 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug_report.yaml @@ -0,0 +1,85 @@ +name: "🐛 Bug report" +description: Create a bug report to help 🐸 improve +title: '[Bug] ' +labels: [ "bug" ] +body: + - type: markdown + attributes: + value: | + Welcome to the 🐸TTS! Thanks for taking the time to fill out this bug report! + + - type: textarea + id: bug-description + attributes: + label: Describe the bug + description: A clear and concise description of what the bug is. If you intend to submit a PR for this issue, tell us in the description. 
Thanks! + placeholder: Bug description + validations: + required: true + + - type: textarea + id: reproduction + attributes: + label: To Reproduce + description: | + Please share your code to reproduce the error. + + Issues are fixed faster if you can provide a working example. + + The best place for sharing code is colab. https://colab.research.google.com/ + So we can directly run your code and reproduce the issue. + + In the worse case, provide steps to reproduce the behavior. + + 1. Run the following command '...' + 2. ... + 3. See error + placeholder: Reproduction + validations: + required: true + + - type: textarea + id: expected-behavior + attributes: + label: Expected behavior + description: "Write down what the expected behaviour" + + - type: textarea + id: logs + attributes: + label: Logs + description: "Please include the relevant logs if you can." + render: shell + + - type: textarea + id: system-info + attributes: + label: Environment + description: | + You can either run `TTS/bin/collect_env_info.py` + + ```bash + wget https://raw.githubusercontent.com/coqui-ai/TTS/main/TTS/bin/collect_env_info.py + python collect_env_info.py + ``` + + or fill in the fields below manually. + render: shell + placeholder: | + - 🐸TTS Version (e.g., 1.3.0): + - PyTorch Version (e.g., 1.8) + - Python version: + - OS (e.g., Linux): + - CUDA/cuDNN version: + - GPU models and configuration: + - How you installed PyTorch (`conda`, `pip`, source): + - Any other relevant information: + validations: + required: true + - type: textarea + id: context + attributes: + label: Additional context + description: Add any other context about the problem here. + validations: + required: false From 36e9ea2f97395bf6e4395557fe5c80260edf62d1 Mon Sep 17 00:00:00 2001 From: Edresson Casanova Date: Fri, 11 Mar 2022 06:43:31 -0300 Subject: [PATCH 10/38] Open bible dataset formatter (#1365) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Add support for voice conversion inference * Cache d_vectors_by_speaker for fast inference using a bigger speakers.json * Rebase bug fix * Use the average d-vector for inference * Fix the bug in find unique chars script * Add OpenBible formatter Co-authored-by: Eren Gölge --- TTS/bin/find_unique_chars.py | 2 +- TTS/tts/datasets/formatters.py | 24 +++++++++++++++++++++++- 2 files changed, 24 insertions(+), 2 deletions(-) diff --git a/TTS/bin/find_unique_chars.py b/TTS/bin/find_unique_chars.py index 4689dcad..ea169748 100644 --- a/TTS/bin/find_unique_chars.py +++ b/TTS/bin/find_unique_chars.py @@ -29,7 +29,7 @@ def main(): items = train_items + eval_items - texts = "".join(item[0] for item in items) + texts = "".join(item["text"] for item in items) chars = set(texts) lower_chars = filter(lambda c: c.islower(), chars) chars_force_lower = [c.lower() for c in chars] diff --git a/TTS/tts/datasets/formatters.py b/TTS/tts/datasets/formatters.py index ac3080c3..573a5deb 100644 --- a/TTS/tts/datasets/formatters.py +++ b/TTS/tts/datasets/formatters.py @@ -348,7 +348,29 @@ def vctk_old(root_path, meta_files=None, wavs_path="wav48", ignored_speakers=Non with open(meta_file, "r", encoding="utf-8") as file_text: text = file_text.readlines()[0] wav_file = os.path.join(root_path, wavs_path, speaker_id, file_id + ".wav") - items.append({"text": text, "audio_file": wav_file, "speaker_name": "VCTK_" + speaker_id}) + items.append({"text": text, "audio_file": wav_file, "speaker_name": "VCTK_old_" + speaker_id}) + return items + + +def open_bible(root_path, 
meta_files="train", ignore_digits_sentences=True, ignored_speakers=None): + """ToDo: Refer the paper when available""" + items = [] + split_dir = meta_files + meta_files = glob(f"{os.path.join(root_path, split_dir)}/**/*.txt", recursive=True) + for meta_file in meta_files: + _, speaker_id, txt_file = os.path.relpath(meta_file, root_path).split(os.sep) + file_id = txt_file.split(".")[0] + # ignore speakers + if isinstance(ignored_speakers, list): + if speaker_id in ignored_speakers: + continue + with open(meta_file, "r", encoding="utf-8") as file_text: + text = file_text.readline().replace("\n", "") + # ignore sentences that contains digits + if ignore_digits_sentences and any(map(str.isdigit, text)): + continue + wav_file = os.path.join(root_path, split_dir, speaker_id, file_id + ".flac") + items.append({"text": text, "audio_file": wav_file, "speaker_name": "OB_" + speaker_id}) return items From f81892483d720688005dab723e990e90a990f8a0 Mon Sep 17 00:00:00 2001 From: Edresson Casanova Date: Fri, 11 Mar 2022 10:43:40 -0300 Subject: [PATCH 11/38] REBASED: Transform Speaker Encoder in a Generic Encoder and Implement Emotion Encoder training support (#1349) * Rename Speaker encoder module to encoder * Add a generic emotion dataset formatter * Transform the Speaker Encoder dataset to a generic dataset and create emotion encoder config * Add class map in emotion config * Add Base encoder config * Add evaluation encoder script * Fix the bug in plot_embeddings * Enable Weight decay for encoder training * Add argumnet to disable storage * Add Perfect Sampler and remove storage * Add evaluation during encoder training * Fix lint checks * Remove useless config parameter * Active evaluation in speaker encoder test and use multispeaker dataset for this test * Unit tests fixs * Remove useless tests for speedup the aux_tests * Use get_optimizer in Encoder * Add BaseEncoder Class * Fix the unitests * Add Perfect Batch Sampler unit test * Add compute encoder accuracy in a function --- TTS/bin/compute_embeddings.py | 22 +- TTS/bin/eval_encoder.py | 88 ++ TTS/bin/train_encoder.py | 334 +++--- TTS/config/__init__.py | 2 +- TTS/{speaker_encoder => encoder}/README.md | 0 TTS/{speaker_encoder => encoder}/__init__.py | 0 .../configs/base_encoder_config.py} | 31 +- TTS/encoder/configs/emotion_encoder_config.py | 12 + TTS/encoder/configs/speaker_encoder_config.py | 11 + TTS/encoder/dataset.py | 149 +++ TTS/{speaker_encoder => encoder}/losses.py | 5 + TTS/encoder/models/base_encoder.py | 145 +++ TTS/encoder/models/lstm.py | 99 ++ .../models/resnet.py | 91 +- .../requirements.txt | 0 .../utils/__init__.py | 0 .../utils/generic_utils.py | 52 +- TTS/{speaker_encoder => encoder}/utils/io.py | 0 .../utils/prepare_voxceleb.py | 0 TTS/encoder/utils/samplers.py | 102 ++ .../utils/training.py | 0 .../utils/visual.py | 14 +- TTS/speaker_encoder/configs/config.json | 118 --- .../configs/config_resnet_angleproto.json | 956 ----------------- .../config_resnet_softmax_angleproto.json | 957 ------------------ TTS/speaker_encoder/dataset.py | 243 ----- TTS/speaker_encoder/models/lstm.py | 189 ---- TTS/speaker_encoder/umap.png | Bin 24616 -> 0 bytes TTS/tts/configs/shared_configs.py | 2 +- TTS/tts/datasets/formatters.py | 20 + TTS/tts/utils/speakers.py | 4 +- tests/aux_tests/test_speaker_encoder.py | 6 +- tests/aux_tests/test_speaker_encoder_train.py | 30 +- tests/aux_tests/test_speaker_manager.py | 4 +- tests/data_tests/test_samplers.py | 49 + tests/inputs/test_glow_tts.json | 4 +- tests/inputs/test_speaker_encoder_config.json | 4 
+- tests/inputs/test_tacotron2_config.json | 4 +- tests/inputs/test_tacotron_config.json | 4 +- tests/tts_tests/test_vits.py | 2 +- 40 files changed, 962 insertions(+), 2791 deletions(-) create mode 100644 TTS/bin/eval_encoder.py rename TTS/{speaker_encoder => encoder}/README.md (100%) rename TTS/{speaker_encoder => encoder}/__init__.py (100%) rename TTS/{speaker_encoder/speaker_encoder_config.py => encoder/configs/base_encoder_config.py} (66%) create mode 100644 TTS/encoder/configs/emotion_encoder_config.py create mode 100644 TTS/encoder/configs/speaker_encoder_config.py create mode 100644 TTS/encoder/dataset.py rename TTS/{speaker_encoder => encoder}/losses.py (97%) create mode 100644 TTS/encoder/models/base_encoder.py create mode 100644 TTS/encoder/models/lstm.py rename TTS/{speaker_encoder => encoder}/models/resnet.py (67%) rename TTS/{speaker_encoder => encoder}/requirements.txt (100%) rename TTS/{speaker_encoder => encoder}/utils/__init__.py (100%) rename TTS/{speaker_encoder => encoder}/utils/generic_utils.py (80%) rename TTS/{speaker_encoder => encoder}/utils/io.py (100%) rename TTS/{speaker_encoder => encoder}/utils/prepare_voxceleb.py (100%) create mode 100644 TTS/encoder/utils/samplers.py rename TTS/{speaker_encoder => encoder}/utils/training.py (100%) rename TTS/{speaker_encoder => encoder}/utils/visual.py (69%) delete mode 100644 TTS/speaker_encoder/configs/config.json delete mode 100644 TTS/speaker_encoder/configs/config_resnet_angleproto.json delete mode 100644 TTS/speaker_encoder/configs/config_resnet_softmax_angleproto.json delete mode 100644 TTS/speaker_encoder/dataset.py delete mode 100644 TTS/speaker_encoder/models/lstm.py delete mode 100644 TTS/speaker_encoder/umap.png diff --git a/TTS/bin/compute_embeddings.py b/TTS/bin/compute_embeddings.py index 50817154..68571fb4 100644 --- a/TTS/bin/compute_embeddings.py +++ b/TTS/bin/compute_embeddings.py @@ -42,33 +42,35 @@ c_dataset = load_config(args.config_dataset_path) meta_data_train, meta_data_eval = load_tts_samples(c_dataset.datasets, eval_split=args.eval) wav_files = meta_data_train + meta_data_eval -speaker_manager = SpeakerManager( +encoder_manager = SpeakerManager( encoder_model_path=args.model_path, encoder_config_path=args.config_path, d_vectors_file_path=args.old_file, use_cuda=args.use_cuda, ) +class_name_key = encoder_manager.speaker_encoder_config.class_name_key + # compute speaker embeddings speaker_mapping = {} for idx, wav_file in enumerate(tqdm(wav_files)): - if isinstance(wav_file, list): - speaker_name = wav_file[2] - wav_file = wav_file[1] + if isinstance(wav_file, dict): + class_name = wav_file[class_name_key] + wav_file = wav_file["audio_file"] else: - speaker_name = None + class_name = None wav_file_name = os.path.basename(wav_file) - if args.old_file is not None and wav_file_name in speaker_manager.clip_ids: + if args.old_file is not None and wav_file_name in encoder_manager.clip_ids: # get the embedding from the old file - embedd = speaker_manager.get_d_vector_by_clip(wav_file_name) + embedd = encoder_manager.get_d_vector_by_clip(wav_file_name) else: # extract the embedding - embedd = speaker_manager.compute_d_vector_from_clip(wav_file) + embedd = encoder_manager.compute_d_vector_from_clip(wav_file) # create speaker_mapping if target dataset is defined speaker_mapping[wav_file_name] = {} - speaker_mapping[wav_file_name]["name"] = speaker_name + speaker_mapping[wav_file_name]["name"] = class_name speaker_mapping[wav_file_name]["embedding"] = embedd if speaker_mapping: @@ -81,5 +83,5 @@ if 
speaker_mapping: os.makedirs(os.path.dirname(mapping_file_path), exist_ok=True) # pylint: disable=W0212 - speaker_manager._save_json(mapping_file_path, speaker_mapping) + encoder_manager._save_json(mapping_file_path, speaker_mapping) print("Speaker embeddings saved at:", mapping_file_path) diff --git a/TTS/bin/eval_encoder.py b/TTS/bin/eval_encoder.py new file mode 100644 index 00000000..a03bfd82 --- /dev/null +++ b/TTS/bin/eval_encoder.py @@ -0,0 +1,88 @@ +import argparse +import torch +from argparse import RawTextHelpFormatter + +from tqdm import tqdm + +from TTS.config import load_config +from TTS.tts.datasets import load_tts_samples +from TTS.tts.utils.speakers import SpeakerManager + +def compute_encoder_accuracy(dataset_items, encoder_manager): + + class_name_key = encoder_manager.speaker_encoder_config.class_name_key + map_classid_to_classname = getattr(encoder_manager.speaker_encoder_config, 'map_classid_to_classname', None) + + class_acc_dict = {} + + # compute embeddings for all wav_files + for item in tqdm(dataset_items): + class_name = item[class_name_key] + wav_file = item["audio_file"] + + # extract the embedding + embedd = encoder_manager.compute_d_vector_from_clip(wav_file) + if encoder_manager.speaker_encoder_criterion is not None and map_classid_to_classname is not None: + embedding = torch.FloatTensor(embedd).unsqueeze(0) + if encoder_manager.use_cuda: + embedding = embedding.cuda() + + class_id = encoder_manager.speaker_encoder_criterion.softmax.inference(embedding).item() + predicted_label = map_classid_to_classname[str(class_id)] + else: + predicted_label = None + + if class_name is not None and predicted_label is not None: + is_equal = int(class_name == predicted_label) + if class_name not in class_acc_dict: + class_acc_dict[class_name] = [is_equal] + else: + class_acc_dict[class_name].append(is_equal) + else: + raise RuntimeError("Error: class_name or/and predicted_label are None") + + acc_avg = 0 + for key, values in class_acc_dict.items(): + acc = sum(values)/len(values) + print("Class", key, "Accuracy:", acc) + acc_avg += acc + + print("Average Accuracy:", acc_avg/len(class_acc_dict)) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="""Compute the accuracy of the encoder.\n\n""" + """ + Example runs: + python TTS/bin/eval_encoder.py emotion_encoder_model.pth.tar emotion_encoder_config.json dataset_config.json + """, + formatter_class=RawTextHelpFormatter, + ) + parser.add_argument("model_path", type=str, help="Path to model checkpoint file.") + parser.add_argument( + "config_path", + type=str, + help="Path to model config file.", + ) + + parser.add_argument( + "config_dataset_path", + type=str, + help="Path to dataset config file.", + ) + parser.add_argument("--use_cuda", type=bool, help="flag to set cuda.", default=True) + parser.add_argument("--eval", type=bool, help="compute eval.", default=True) + + args = parser.parse_args() + + c_dataset = load_config(args.config_dataset_path) + + meta_data_train, meta_data_eval = load_tts_samples(c_dataset.datasets, eval_split=args.eval) + items = meta_data_train + meta_data_eval + + enc_manager = SpeakerManager( + encoder_model_path=args.model_path, encoder_config_path=args.config_path, use_cuda=args.use_cuda + ) + + compute_encoder_accuracy(items, enc_manager) diff --git a/TTS/bin/train_encoder.py b/TTS/bin/train_encoder.py index b7424698..af3e6ec4 100644 --- a/TTS/bin/train_encoder.py +++ b/TTS/bin/train_encoder.py @@ -10,16 +10,16 @@ import torch from torch.utils.data import DataLoader 
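The per-class bookkeeping in compute_encoder_accuracy above amounts to a macro-averaged (balanced) accuracy: every class contributes equally to the final number, no matter how many clips it has. A minimal standalone sketch of the same computation, using made-up class names and predictions:

    # Sketch of the macro-averaged accuracy computed in eval_encoder.py.
    # The (ground truth, prediction) pairs below are hypothetical.
    from collections import defaultdict

    pairs = [
        ("happy", "happy"), ("happy", "sad"),
        ("sad", "sad"), ("sad", "sad"),
    ]

    class_acc = defaultdict(list)
    for truth, pred in pairs:
        class_acc[truth].append(int(truth == pred))

    per_class = {name: sum(hits) / len(hits) for name, hits in class_acc.items()}
    macro_acc = sum(per_class.values()) / len(per_class)
    print(per_class)  # {'happy': 0.5, 'sad': 1.0}
    print(macro_acc)  # 0.75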
from trainer.torch import NoamLR -from TTS.speaker_encoder.dataset import SpeakerEncoderDataset -from TTS.speaker_encoder.losses import AngleProtoLoss, GE2ELoss, SoftmaxAngleProtoLoss -from TTS.speaker_encoder.utils.generic_utils import save_best_model, setup_speaker_encoder_model -from TTS.speaker_encoder.utils.training import init_training -from TTS.speaker_encoder.utils.visual import plot_embeddings +from TTS.encoder.dataset import EncoderDataset +from TTS.encoder.utils.generic_utils import save_best_model, save_checkpoint, setup_speaker_encoder_model +from TTS.encoder.utils.samplers import PerfectBatchSampler +from TTS.encoder.utils.training import init_training +from TTS.encoder.utils.visual import plot_embeddings from TTS.tts.datasets import load_tts_samples from TTS.utils.audio import AudioProcessor -from TTS.utils.generic_utils import count_parameters, remove_experiment_folder, set_init_dict -from TTS.utils.io import load_fsspec -from TTS.utils.radam import RAdam +from TTS.utils.generic_utils import count_parameters, remove_experiment_folder +from TTS.utils.io import copy_model_files +from trainer.trainer_utils import get_optimizer from TTS.utils.training import check_update torch.backends.cudnn.enabled = True @@ -32,164 +32,238 @@ print(" > Number of GPUs: ", num_gpus) def setup_loader(ap: AudioProcessor, is_val: bool = False, verbose: bool = False): + num_utter_per_class = c.num_utter_per_class if not is_val else c.eval_num_utter_per_class + num_classes_in_batch = c.num_classes_in_batch if not is_val else c.eval_num_classes_in_batch + + dataset = EncoderDataset( + c, + ap, + meta_data_eval if is_val else meta_data_train, + voice_len=c.voice_len, + num_utter_per_class=num_utter_per_class, + num_classes_in_batch=num_classes_in_batch, + verbose=verbose, + augmentation_config=c.audio_augmentation if not is_val else None, + use_torch_spec=c.model_params.get("use_torch_spec", False), + ) + # get classes list + classes = dataset.get_class_list() + + sampler = PerfectBatchSampler( + dataset.items, + classes, + batch_size=num_classes_in_batch*num_utter_per_class, # total batch size + num_classes_in_batch=num_classes_in_batch, + num_gpus=1, + shuffle=not is_val, + drop_last=True) + + if len(classes) < num_classes_in_batch: + if is_val: + raise RuntimeError(f"config.eval_num_classes_in_batch ({num_classes_in_batch}) need to be <= {len(classes)} (Number total of Classes in the Eval dataset) !") + raise RuntimeError(f"config.num_classes_in_batch ({num_classes_in_batch}) need to be <= {len(classes)} (Number total of Classes in the Train dataset) !") + + # set the classes to avoid get wrong class_id when the number of training and eval classes are not equal if is_val: - loader = None - else: - dataset = SpeakerEncoderDataset( - ap, - meta_data_eval if is_val else meta_data_train, - voice_len=c.voice_len, - num_utter_per_speaker=c.num_utters_per_speaker, - num_speakers_in_batch=c.num_speakers_in_batch, - skip_speakers=c.skip_speakers, - storage_size=c.storage["storage_size"], - sample_from_storage_p=c.storage["sample_from_storage_p"], - verbose=verbose, - augmentation_config=c.audio_augmentation, - use_torch_spec=c.model_params.get("use_torch_spec", False), - ) + dataset.set_classes(train_classes) - # sampler = DistributedSampler(dataset) if num_gpus > 1 else None - loader = DataLoader( - dataset, - batch_size=c.num_speakers_in_batch, - shuffle=False, - num_workers=c.num_loader_workers, - collate_fn=dataset.collate_fn, - ) - return loader, dataset.get_num_speakers() + loader = DataLoader( + 
dataset, + num_workers=c.num_loader_workers, + batch_sampler=sampler, + collate_fn=dataset.collate_fn, + ) + return loader, classes, dataset.get_map_classid_to_classname() -def train(model, optimizer, scheduler, criterion, data_loader, global_step): +def evaluation(model, criterion, data_loader, global_step): + eval_loss = 0 + for _, data in enumerate(data_loader): + with torch.no_grad(): + # setup input data + inputs, labels = data + + # agroup samples of each class in the batch. perfect sampler produces [3,2,1,3,2,1] we need [3,3,2,2,1,1] + labels = torch.transpose(labels.view(c.eval_num_utter_per_class, c.eval_num_classes_in_batch), 0, 1).reshape(labels.shape) + inputs = torch.transpose(inputs.view(c.eval_num_utter_per_class, c.eval_num_classes_in_batch, -1), 0, 1).reshape(inputs.shape) + + # dispatch data to GPU + if use_cuda: + inputs = inputs.cuda(non_blocking=True) + labels = labels.cuda(non_blocking=True) + + # forward pass model + outputs = model(inputs) + + # loss computation + loss = criterion(outputs.view(c.eval_num_classes_in_batch, outputs.shape[0] // c.eval_num_classes_in_batch, -1), labels) + + eval_loss += loss.item() + + eval_avg_loss = eval_loss/len(data_loader) + # save stats + dashboard_logger.eval_stats(global_step, {"loss": eval_avg_loss}) + # plot the last batch in the evaluation + figures = { + "UMAP Plot": plot_embeddings(outputs.detach().cpu().numpy(), c.num_classes_in_batch), + } + dashboard_logger.eval_figures(global_step, figures) + return eval_avg_loss + +def train(model, optimizer, scheduler, criterion, data_loader, eval_data_loader, global_step): model.train() - epoch_time = 0 best_loss = float("inf") - avg_loss = 0 - avg_loss_all = 0 avg_loader_time = 0 end_time = time.time() + for epoch in range(c.epochs): + tot_loss = 0 + epoch_time = 0 + for _, data in enumerate(data_loader): + start_time = time.time() - for _, data in enumerate(data_loader): - start_time = time.time() + # setup input data + inputs, labels = data + # agroup samples of each class in the batch. 
perfect sampler produces [3,2,1,3,2,1] we need [3,3,2,2,1,1] + labels = torch.transpose(labels.view(c.num_utter_per_class, c.num_classes_in_batch), 0, 1).reshape(labels.shape) + inputs = torch.transpose(inputs.view(c.num_utter_per_class, c.num_classes_in_batch, -1), 0, 1).reshape(inputs.shape) + # ToDo: move it to a unit test + # labels_converted = torch.transpose(labels.view(c.num_utter_per_class, c.num_classes_in_batch), 0, 1).reshape(labels.shape) + # inputs_converted = torch.transpose(inputs.view(c.num_utter_per_class, c.num_classes_in_batch, -1), 0, 1).reshape(inputs.shape) + # idx = 0 + # for j in range(0, c.num_classes_in_batch, 1): + # for i in range(j, len(labels), c.num_classes_in_batch): + # if not torch.all(labels[i].eq(labels_converted[idx])) or not torch.all(inputs[i].eq(inputs_converted[idx])): + # print("Invalid") + # print(labels) + # exit() + # idx += 1 + # labels = labels_converted + # inputs = inputs_converted - # setup input data - inputs, labels = data - loader_time = time.time() - end_time - global_step += 1 + loader_time = time.time() - end_time + global_step += 1 - # setup lr - if c.lr_decay: - scheduler.step() - optimizer.zero_grad() + # setup lr + if c.lr_decay: + scheduler.step() + optimizer.zero_grad() - # dispatch data to GPU - if use_cuda: - inputs = inputs.cuda(non_blocking=True) - labels = labels.cuda(non_blocking=True) + # dispatch data to GPU + if use_cuda: + inputs = inputs.cuda(non_blocking=True) + labels = labels.cuda(non_blocking=True) - # forward pass model - outputs = model(inputs) + # forward pass model + outputs = model(inputs) - # loss computation - loss = criterion(outputs.view(c.num_speakers_in_batch, outputs.shape[0] // c.num_speakers_in_batch, -1), labels) - loss.backward() - grad_norm, _ = check_update(model, c.grad_clip) - optimizer.step() + # loss computation + loss = criterion(outputs.view(c.num_classes_in_batch, outputs.shape[0] // c.num_classes_in_batch, -1), labels) + loss.backward() + grad_norm, _ = check_update(model, c.grad_clip) + optimizer.step() - step_time = time.time() - start_time - epoch_time += step_time + step_time = time.time() - start_time + epoch_time += step_time - # Averaged Loss and Averaged Loader Time - avg_loss = 0.01 * loss.item() + 0.99 * avg_loss if avg_loss != 0 else loss.item() - num_loader_workers = c.num_loader_workers if c.num_loader_workers > 0 else 1 - avg_loader_time = ( - 1 / num_loader_workers * loader_time + (num_loader_workers - 1) / num_loader_workers * avg_loader_time - if avg_loader_time != 0 - else loader_time - ) - current_lr = optimizer.param_groups[0]["lr"] + # acumulate the total epoch loss + tot_loss += loss.item() - if global_step % c.steps_plot_stats == 0: - # Plot Training Epoch Stats - train_stats = { - "loss": avg_loss, - "lr": current_lr, - "grad_norm": grad_norm, - "step_time": step_time, - "avg_loader_time": avg_loader_time, - } - dashboard_logger.train_epoch_stats(global_step, train_stats) - figures = { - # FIXME: not constant - "UMAP Plot": plot_embeddings(outputs.detach().cpu().numpy(), 10), - } - dashboard_logger.train_figures(global_step, figures) - - if global_step % c.print_step == 0: - print( - " | > Step:{} Loss:{:.5f} AvgLoss:{:.5f} GradNorm:{:.5f} " - "StepTime:{:.2f} LoaderTime:{:.2f} AvGLoaderTime:{:.2f} LR:{:.6f}".format( - global_step, loss.item(), avg_loss, grad_norm, step_time, loader_time, avg_loader_time, current_lr - ), - flush=True, + # Averaged Loader Time + num_loader_workers = c.num_loader_workers if c.num_loader_workers > 0 else 1 + avg_loader_time = ( + 1 / 
num_loader_workers * loader_time + (num_loader_workers - 1) / num_loader_workers * avg_loader_time + if avg_loader_time != 0 + else loader_time ) - avg_loss_all += avg_loss + current_lr = optimizer.param_groups[0]["lr"] - if global_step >= c.max_train_step or global_step % c.save_step == 0: - # save best model only - best_loss = save_best_model(model, optimizer, criterion, avg_loss, best_loss, OUT_PATH, global_step) - avg_loss_all = 0 - if global_step >= c.max_train_step: - break + if global_step % c.steps_plot_stats == 0: + # Plot Training Epoch Stats + train_stats = { + "loss": loss.item(), + "lr": current_lr, + "grad_norm": grad_norm, + "step_time": step_time, + "avg_loader_time": avg_loader_time, + } + dashboard_logger.train_epoch_stats(global_step, train_stats) + figures = { + "UMAP Plot": plot_embeddings(outputs.detach().cpu().numpy(), c.num_classes_in_batch), + } + dashboard_logger.train_figures(global_step, figures) - end_time = time.time() + if global_step % c.print_step == 0: + print( + " | > Step:{} Loss:{:.5f} GradNorm:{:.5f} " + "StepTime:{:.2f} LoaderTime:{:.2f} AvGLoaderTime:{:.2f} LR:{:.6f}".format( + global_step, loss.item(), grad_norm, step_time, loader_time, avg_loader_time, current_lr + ), + flush=True, + ) - return avg_loss, global_step + if global_step % c.save_step == 0: + # save model + save_checkpoint(model, optimizer, criterion, loss.item(), OUT_PATH, global_step, epoch) + + end_time = time.time() + + print("") + print( + ">>> Epoch:{} AvgLoss: {:.5f} GradNorm:{:.5f} " + "EpochTime:{:.2f} AvGLoaderTime:{:.2f} ".format( + epoch, tot_loss/len(data_loader), grad_norm, epoch_time, avg_loader_time + ), + flush=True, + ) + # evaluation + if c.run_eval: + model.eval() + eval_loss = evaluation(model, criterion, eval_data_loader, global_step) + print("\n\n") + print("--> EVAL PERFORMANCE") + print( + " | > Epoch:{} AvgLoss: {:.5f} ".format( + epoch, eval_loss + ), + flush=True, + ) + # save the best checkpoint + best_loss = save_best_model(model, optimizer, criterion, eval_loss, best_loss, OUT_PATH, global_step, epoch) + model.train() + + return best_loss, global_step def main(args): # pylint: disable=redefined-outer-name # pylint: disable=global-variable-undefined global meta_data_train global meta_data_eval + global train_classes ap = AudioProcessor(**c.audio) model = setup_speaker_encoder_model(c) - optimizer = RAdam(model.parameters(), lr=c.lr) + optimizer = get_optimizer(c.optimizer, c.optimizer_params, c.lr, model) # pylint: disable=redefined-outer-name - meta_data_train, meta_data_eval = load_tts_samples(c.datasets, eval_split=False) + meta_data_train, meta_data_eval = load_tts_samples(c.datasets, eval_split=True) - data_loader, num_speakers = setup_loader(ap, is_val=False, verbose=True) - - if c.loss == "ge2e": - criterion = GE2ELoss(loss_method="softmax") - elif c.loss == "angleproto": - criterion = AngleProtoLoss() - elif c.loss == "softmaxproto": - criterion = SoftmaxAngleProtoLoss(c.model_params["proj_dim"], num_speakers) + train_data_loader, train_classes, map_classid_to_classname = setup_loader(ap, is_val=False, verbose=True) + if c.run_eval: + eval_data_loader, _, _ = setup_loader(ap, is_val=True, verbose=True) else: - raise Exception("The %s not is a loss supported" % c.loss) + eval_data_loader = None + + num_classes = len(train_classes) + criterion = model.get_criterion(c, num_classes) + + if c.loss == "softmaxproto" and c.model != "speaker_encoder": + c.map_classid_to_classname = map_classid_to_classname + copy_model_files(c, OUT_PATH) if 
args.restore_path: - checkpoint = load_fsspec(args.restore_path) - try: - model.load_state_dict(checkpoint["model"]) - - if "criterion" in checkpoint: - criterion.load_state_dict(checkpoint["criterion"]) - - except (KeyError, RuntimeError): - print(" > Partial model initialization.") - model_dict = model.state_dict() - model_dict = set_init_dict(model_dict, checkpoint["model"], c) - model.load_state_dict(model_dict) - del model_dict - for group in optimizer.param_groups: - group["lr"] = c.lr - - print(" > Model restored from step %d" % checkpoint["step"], flush=True) - args.restore_step = checkpoint["step"] + criterion, args.restore_step = model.load_checkpoint(c, args.restore_path, eval=False, use_cuda=use_cuda, criterion=criterion) + print(" > Model restored from step %d" % args.restore_step, flush=True) else: args.restore_step = 0 @@ -206,7 +280,7 @@ def main(args): # pylint: disable=redefined-outer-name criterion.cuda() global_step = args.restore_step - _, global_step = train(model, optimizer, scheduler, criterion, data_loader, global_step) + _, global_step = train(model, optimizer, scheduler, criterion, train_data_loader, eval_data_loader, global_step) if __name__ == "__main__": diff --git a/TTS/config/__init__.py b/TTS/config/__init__.py index 5c905295..6b0778c5 100644 --- a/TTS/config/__init__.py +++ b/TTS/config/__init__.py @@ -37,7 +37,7 @@ def register_config(model_name: str) -> Coqpit: """ config_class = None config_name = model_name + "_config" - paths = ["TTS.tts.configs", "TTS.vocoder.configs", "TTS.speaker_encoder"] + paths = ["TTS.tts.configs", "TTS.vocoder.configs", "TTS.encoder.configs"] for path in paths: try: config_class = find_module(path, config_name) diff --git a/TTS/speaker_encoder/README.md b/TTS/encoder/README.md similarity index 100% rename from TTS/speaker_encoder/README.md rename to TTS/encoder/README.md diff --git a/TTS/speaker_encoder/__init__.py b/TTS/encoder/__init__.py similarity index 100% rename from TTS/speaker_encoder/__init__.py rename to TTS/encoder/__init__.py diff --git a/TTS/speaker_encoder/speaker_encoder_config.py b/TTS/encoder/configs/base_encoder_config.py similarity index 66% rename from TTS/speaker_encoder/speaker_encoder_config.py rename to TTS/encoder/configs/base_encoder_config.py index 8212acc7..02b88d66 100644 --- a/TTS/speaker_encoder/speaker_encoder_config.py +++ b/TTS/encoder/configs/base_encoder_config.py @@ -7,10 +7,10 @@ from TTS.config.shared_configs import BaseAudioConfig, BaseDatasetConfig, BaseTr @dataclass -class SpeakerEncoderConfig(BaseTrainingConfig): - """Defines parameters for Speaker Encoder model.""" +class BaseEncoderConfig(BaseTrainingConfig): + """Defines parameters for a Generic Encoder model.""" - model: str = "speaker_encoder" + model: str = None audio: BaseAudioConfig = field(default_factory=BaseAudioConfig) datasets: List[BaseDatasetConfig] = field(default_factory=lambda: [BaseDatasetConfig()]) # model params @@ -27,34 +27,33 @@ class SpeakerEncoderConfig(BaseTrainingConfig): audio_augmentation: Dict = field(default_factory=lambda: {}) - storage: Dict = field( - default_factory=lambda: { - "sample_from_storage_p": 0.66, # the probability with which we'll sample from the DataSet in-memory storage - "storage_size": 15, # the size of the in-memory storage with respect to a single batch - } - ) - # training params - max_train_step: int = 1000000 # end training when number of training steps reaches this value. 
+ epochs: int = 10000 loss: str = "angleproto" grad_clip: float = 3.0 lr: float = 0.0001 + optimizer: str = "radam" + optimizer_params: Dict = field(default_factory=lambda: { + "betas": [0.9, 0.999], + "weight_decay": 0 + }) lr_decay: bool = False warmup_steps: int = 4000 - wd: float = 1e-6 # logging params tb_model_param_stats: bool = False steps_plot_stats: int = 10 - checkpoint: bool = True save_step: int = 1000 print_step: int = 20 + run_eval: bool = False # data loader - num_speakers_in_batch: int = MISSING - num_utters_per_speaker: int = MISSING + num_classes_in_batch: int = MISSING + num_utter_per_class: int = MISSING + eval_num_classes_in_batch: int = None + eval_num_utter_per_class: int = None + num_loader_workers: int = MISSING - skip_speakers: bool = False voice_len: float = 1.6 def check_values(self): diff --git a/TTS/encoder/configs/emotion_encoder_config.py b/TTS/encoder/configs/emotion_encoder_config.py new file mode 100644 index 00000000..5eda2671 --- /dev/null +++ b/TTS/encoder/configs/emotion_encoder_config.py @@ -0,0 +1,12 @@ +from dataclasses import asdict, dataclass + +from TTS.encoder.configs.base_encoder_config import BaseEncoderConfig + + +@dataclass +class EmotionEncoderConfig(BaseEncoderConfig): + """Defines parameters for Emotion Encoder model.""" + + model: str = "emotion_encoder" + map_classid_to_classname: dict = None + class_name_key: str = "emotion_name" diff --git a/TTS/encoder/configs/speaker_encoder_config.py b/TTS/encoder/configs/speaker_encoder_config.py new file mode 100644 index 00000000..6dceb002 --- /dev/null +++ b/TTS/encoder/configs/speaker_encoder_config.py @@ -0,0 +1,11 @@ +from dataclasses import asdict, dataclass + +from TTS.encoder.configs.base_encoder_config import BaseEncoderConfig + + +@dataclass +class SpeakerEncoderConfig(BaseEncoderConfig): + """Defines parameters for Speaker Encoder model.""" + + model: str = "speaker_encoder" + class_name_key: str = "speaker_name" diff --git a/TTS/encoder/dataset.py b/TTS/encoder/dataset.py new file mode 100644 index 00000000..a4db021b --- /dev/null +++ b/TTS/encoder/dataset.py @@ -0,0 +1,149 @@ +import random + +import torch +from torch.utils.data import Dataset + +from TTS.encoder.utils.generic_utils import AugmentWAV + +class EncoderDataset(Dataset): + def __init__( + self, + config, + ap, + meta_data, + voice_len=1.6, + num_classes_in_batch=64, + num_utter_per_class=10, + verbose=False, + augmentation_config=None, + use_torch_spec=None, + ): + """ + Args: + ap (TTS.tts.utils.AudioProcessor): audio processor object. + meta_data (list): list of dataset instances. + seq_len (int): voice segment length in seconds. + verbose (bool): print diagnostic information. 
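The two config subclasses above only pin down model and class_name_key; the rest of the "generic encoder" behaviour follows from that key, since the formatters now return dicts and the dataset and evaluation code look the class label up via config.class_name_key. A small sketch of the idea, with hypothetical items and file names:

    # Hypothetical dataset items, shaped like the dicts the formatters now return.
    speaker_item = {"text": "hello", "audio_file": "clip_001.wav", "speaker_name": "VCTK_p225"}
    emotion_item = {"text": "hello", "audio_file": "clip_002.wav", "emotion_name": "happy"}

    # SpeakerEncoderConfig.class_name_key -> "speaker_name"
    # EmotionEncoderConfig.class_name_key -> "emotion_name"
    for item, key in [(speaker_item, "speaker_name"), (emotion_item, "emotion_name")]:
        print(item[key])  # VCTK_p225, then happy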
+ """ + super().__init__() + self.config = config + self.items = meta_data + self.sample_rate = ap.sample_rate + self.seq_len = int(voice_len * self.sample_rate) + self.num_utter_per_class = num_utter_per_class + self.ap = ap + self.verbose = verbose + self.use_torch_spec = use_torch_spec + self.classes, self.items = self.__parse_items() + + self.classname_to_classid = {key: i for i, key in enumerate(self.classes)} + + # Data Augmentation + self.augmentator = None + self.gaussian_augmentation_config = None + if augmentation_config: + self.data_augmentation_p = augmentation_config["p"] + if self.data_augmentation_p and ("additive" in augmentation_config or "rir" in augmentation_config): + self.augmentator = AugmentWAV(ap, augmentation_config) + + if "gaussian" in augmentation_config.keys(): + self.gaussian_augmentation_config = augmentation_config["gaussian"] + + if self.verbose: + print("\n > DataLoader initialization") + print(f" | > Classes per Batch: {num_classes_in_batch}") + print(f" | > Number of instances : {len(self.items)}") + print(f" | > Sequence length: {self.seq_len}") + print(f" | > Num Classes: {len(self.classes)}") + print(f" | > Classes: {self.classes}") + + + def load_wav(self, filename): + audio = self.ap.load_wav(filename, sr=self.ap.sample_rate) + return audio + + def __parse_items(self): + class_to_utters = {} + for item in self.items: + path_ = item["audio_file"] + class_name = item[self.config.class_name_key] + if class_name in class_to_utters.keys(): + class_to_utters[class_name].append(path_) + else: + class_to_utters[class_name] = [ + path_, + ] + + # skip classes with number of samples >= self.num_utter_per_class + class_to_utters = { + k: v for (k, v) in class_to_utters.items() if len(v) >= self.num_utter_per_class + } + + classes = list(class_to_utters.keys()) + classes.sort() + + new_items = [] + for item in self.items: + path_ = item["audio_file"] + class_name = item["emotion_name"] if self.config.model == "emotion_encoder" else item["speaker_name"] + # ignore filtered classes + if class_name not in classes: + continue + # ignore small audios + if self.load_wav(path_).shape[0] - self.seq_len <= 0: + continue + + new_items.append({"wav_file_path": path_, "class_name": class_name}) + + return classes, new_items + + def __len__(self): + return len(self.items) + + def get_num_classes(self): + return len(self.classes) + + def get_class_list(self): + return self.classes + def set_classes(self, classes): + self.classes = classes + self.classname_to_classid = {key: i for i, key in enumerate(self.classes)} + + + def get_map_classid_to_classname(self): + return dict((c_id, c_n) for c_n, c_id in self.classname_to_classid.items()) + + def __getitem__(self, idx): + return self.items[idx] + + def collate_fn(self, batch): + # get the batch class_ids + labels = [] + feats = [] + for item in batch: + utter_path = item["wav_file_path"] + class_name = item["class_name"] + + # get classid + class_id = self.classname_to_classid[class_name] + # load wav file + wav = self.load_wav(utter_path) + offset = random.randint(0, wav.shape[0] - self.seq_len) + wav = wav[offset : offset + self.seq_len] + + if self.augmentator is not None and self.data_augmentation_p: + if random.random() < self.data_augmentation_p: + wav = self.augmentator.apply_one(wav) + + if not self.use_torch_spec: + mel = self.ap.melspectrogram(wav) + feats.append(torch.FloatTensor(mel)) + else: + feats.append(torch.FloatTensor(wav)) + + labels.append(class_id) + + feats = torch.stack(feats) + labels = 
torch.LongTensor(labels) + + return feats, labels diff --git a/TTS/speaker_encoder/losses.py b/TTS/encoder/losses.py similarity index 97% rename from TTS/speaker_encoder/losses.py rename to TTS/encoder/losses.py index 8ba917b7..de65d8d6 100644 --- a/TTS/speaker_encoder/losses.py +++ b/TTS/encoder/losses.py @@ -189,6 +189,11 @@ class SoftmaxLoss(nn.Module): return L + def inference(self, embedding): + x = self.fc(embedding) + activations = torch.nn.functional.softmax(x, dim=1).squeeze(0) + class_id = torch.argmax(activations) + return class_id class SoftmaxAngleProtoLoss(nn.Module): """ diff --git a/TTS/encoder/models/base_encoder.py b/TTS/encoder/models/base_encoder.py new file mode 100644 index 00000000..c35c636d --- /dev/null +++ b/TTS/encoder/models/base_encoder.py @@ -0,0 +1,145 @@ +import torch +import torchaudio +import numpy as np +from torch import nn + +from TTS.utils.io import load_fsspec +from TTS.encoder.losses import AngleProtoLoss, GE2ELoss, SoftmaxAngleProtoLoss +from TTS.utils.generic_utils import set_init_dict +from coqpit import Coqpit + +class PreEmphasis(nn.Module): + def __init__(self, coefficient=0.97): + super().__init__() + self.coefficient = coefficient + self.register_buffer("filter", torch.FloatTensor([-self.coefficient, 1.0]).unsqueeze(0).unsqueeze(0)) + + def forward(self, x): + assert len(x.size()) == 2 + + x = torch.nn.functional.pad(x.unsqueeze(1), (1, 0), "reflect") + return torch.nn.functional.conv1d(x, self.filter).squeeze(1) + +class BaseEncoder(nn.Module): + """Base `encoder` class. Every new `encoder` model must inherit this. + + It defines common `encoder` specific functions. + """ + + # pylint: disable=W0102 + def __init__(self): + super(BaseEncoder, self).__init__() + + def get_torch_mel_spectrogram_class(self, audio_config): + return torch.nn.Sequential( + PreEmphasis(audio_config["preemphasis"]), + # TorchSTFT( + # n_fft=audio_config["fft_size"], + # hop_length=audio_config["hop_length"], + # win_length=audio_config["win_length"], + # sample_rate=audio_config["sample_rate"], + # window="hamming_window", + # mel_fmin=0.0, + # mel_fmax=None, + # use_htk=True, + # do_amp_to_db=False, + # n_mels=audio_config["num_mels"], + # power=2.0, + # use_mel=True, + # mel_norm=None, + # ) + torchaudio.transforms.MelSpectrogram( + sample_rate=audio_config["sample_rate"], + n_fft=audio_config["fft_size"], + win_length=audio_config["win_length"], + hop_length=audio_config["hop_length"], + window_fn=torch.hamming_window, + n_mels=audio_config["num_mels"], + ) + ) + + @torch.no_grad() + def inference(self, x, l2_norm=True): + return self.forward(x, l2_norm) + + @torch.no_grad() + def compute_embedding(self, x, num_frames=250, num_eval=10, return_mean=True, l2_norm=True): + """ + Generate embeddings for a batch of utterances + x: 1xTxD + """ + # map to the waveform size + if self.use_torch_spec: + num_frames = num_frames * self.audio_config["hop_length"] + + max_len = x.shape[1] + + if max_len < num_frames: + num_frames = max_len + + offsets = np.linspace(0, max_len - num_frames, num=num_eval) + + frames_batch = [] + for offset in offsets: + offset = int(offset) + end_offset = int(offset + num_frames) + frames = x[:, offset:end_offset] + frames_batch.append(frames) + + frames_batch = torch.cat(frames_batch, dim=0) + embeddings = self.inference(frames_batch, l2_norm=l2_norm) + + if return_mean: + embeddings = torch.mean(embeddings, dim=0, keepdim=True) + return embeddings + + def get_criterion(self, c: Coqpit, num_classes=None): + if c.loss == "ge2e": + criterion = 
GE2ELoss(loss_method="softmax") + elif c.loss == "angleproto": + criterion = AngleProtoLoss() + elif c.loss == "softmaxproto": + criterion = SoftmaxAngleProtoLoss(c.model_params["proj_dim"], num_classes) + else: + raise Exception("The %s not is a loss supported" % c.loss) + return criterion + + def load_checkpoint(self, config: Coqpit, checkpoint_path: str, eval: bool = False, use_cuda: bool = False, criterion=None): + state = load_fsspec(checkpoint_path, map_location=torch.device("cpu")) + try: + self.load_state_dict(state["model"]) + except (KeyError, RuntimeError) as error: + # If eval raise the error + if eval: + raise error + + print(" > Partial model initialization.") + model_dict = self.state_dict() + model_dict = set_init_dict(model_dict, state["model"], c) + self.load_state_dict(model_dict) + del model_dict + + # load the criterion for restore_path + if criterion is not None and "criterion" in state: + try: + criterion.load_state_dict(state["criterion"]) + except (KeyError, RuntimeError) as error: + print(" > Criterion load ignored because of:", error) + + # instance and load the criterion for the encoder classifier in inference time + if eval and criterion is None and "criterion" in state and getattr(config, 'map_classid_to_classname', None) is not None: + criterion = self.get_criterion(config, len(config.map_classid_to_classname)) + criterion.load_state_dict(state["criterion"]) + + if use_cuda: + self.cuda() + if criterion is not None: + criterion = criterion.cuda() + + if eval: + self.eval() + assert not self.training + + if not eval: + return criterion, state["step"] + return criterion diff --git a/TTS/encoder/models/lstm.py b/TTS/encoder/models/lstm.py new file mode 100644 index 00000000..51852b5b --- /dev/null +++ b/TTS/encoder/models/lstm.py @@ -0,0 +1,99 @@ +import torch +from torch import nn + +from TTS.encoder.models.base_encoder import BaseEncoder + + +class LSTMWithProjection(nn.Module): + def __init__(self, input_size, hidden_size, proj_size): + super().__init__() + self.input_size = input_size + self.hidden_size = hidden_size + self.proj_size = proj_size + self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True) + self.linear = nn.Linear(hidden_size, proj_size, bias=False) + + def forward(self, x): + self.lstm.flatten_parameters() + o, (_, _) = self.lstm(x) + return self.linear(o) + + +class LSTMWithoutProjection(nn.Module): + def __init__(self, input_dim, lstm_dim, proj_dim, num_lstm_layers): + super().__init__() + self.lstm = nn.LSTM(input_size=input_dim, hidden_size=lstm_dim, num_layers=num_lstm_layers, batch_first=True) + self.linear = nn.Linear(lstm_dim, proj_dim, bias=True) + self.relu = nn.ReLU() + + def forward(self, x): + _, (hidden, _) = self.lstm(x) + return self.relu(self.linear(hidden[-1])) + + +class LSTMSpeakerEncoder(BaseEncoder): + def __init__( + self, + input_dim, + proj_dim=256, + lstm_dim=768, + num_lstm_layers=3, + use_lstm_with_projection=True, + use_torch_spec=False, + audio_config=None, + ): + super().__init__() + self.use_lstm_with_projection = use_lstm_with_projection + self.use_torch_spec = use_torch_spec + self.audio_config = audio_config + self.proj_dim = proj_dim + + layers = [] + # choise LSTM layer + if use_lstm_with_projection: + layers.append(LSTMWithProjection(input_dim, lstm_dim, proj_dim)) + for _ in range(num_lstm_layers - 1): + layers.append(LSTMWithProjection(proj_dim, lstm_dim, proj_dim)) + self.layers = nn.Sequential(*layers) + else: + self.layers = LSTMWithoutProjection(input_dim, lstm_dim, proj_dim, num_lstm_layers) 
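Once the forward pass further below is in place, a quick shape check of the LSTM encoder can be run with the constructor defaults shown here; the batch size, number of mel bands, and frame count in this sketch are made-up values:

    import torch
    from TTS.encoder.models.lstm import LSTMSpeakerEncoder

    # Hypothetical batch: 4 mel-spectrogram clips, 80 mel bands, 100 frames each.
    model = LSTMSpeakerEncoder(input_dim=80, proj_dim=256, lstm_dim=768, num_lstm_layers=3)
    x = torch.randn(4, 80, 100)   # (N, D_spec, T), as expected when use_torch_spec=False
    d = model(x)                  # forward() L2-normalises by default
    print(d.shape)                # torch.Size([4, 256])
    print(d.norm(dim=1))          # each row has (approximately) unit norm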
+ + self.instancenorm = nn.InstanceNorm1d(input_dim) + + if self.use_torch_spec: + self.torch_spec = self.get_torch_mel_spectrogram_class(audio_config) + else: + self.torch_spec = None + + self._init_layers() + + def _init_layers(self): + for name, param in self.layers.named_parameters(): + if "bias" in name: + nn.init.constant_(param, 0.0) + elif "weight" in name: + nn.init.xavier_normal_(param) + + def forward(self, x, l2_norm=True): + """Forward pass of the model. + + Args: + x (Tensor): Raw waveform signal or spectrogram frames. If input is a waveform, `torch_spec` must be `True` + to compute the spectrogram on-the-fly. + l2_norm (bool): Whether to L2-normalize the outputs. + + Shapes: + - x: :math:`(N, 1, T_{in})` or :math:`(N, D_{spec}, T_{in})` + """ + with torch.no_grad(): + with torch.cuda.amp.autocast(enabled=False): + if self.use_torch_spec: + x.squeeze_(1) + x = self.torch_spec(x) + x = self.instancenorm(x).transpose(1, 2) + d = self.layers(x) + if self.use_lstm_with_projection: + d = d[:, -1] + if l2_norm: + d = torch.nn.functional.normalize(d, p=2, dim=1) + return d diff --git a/TTS/speaker_encoder/models/resnet.py b/TTS/encoder/models/resnet.py similarity index 67% rename from TTS/speaker_encoder/models/resnet.py rename to TTS/encoder/models/resnet.py index a799fc52..c4ba9537 100644 --- a/TTS/speaker_encoder/models/resnet.py +++ b/TTS/encoder/models/resnet.py @@ -1,24 +1,8 @@ -import numpy as np import torch -import torchaudio from torch import nn # from TTS.utils.audio import TorchSTFT -from TTS.utils.io import load_fsspec - - -class PreEmphasis(nn.Module): - def __init__(self, coefficient=0.97): - super().__init__() - self.coefficient = coefficient - self.register_buffer("filter", torch.FloatTensor([-self.coefficient, 1.0]).unsqueeze(0).unsqueeze(0)) - - def forward(self, x): - assert len(x.size()) == 2 - - x = torch.nn.functional.pad(x.unsqueeze(1), (1, 0), "reflect") - return torch.nn.functional.conv1d(x, self.filter).squeeze(1) - +from TTS.encoder.models.base_encoder import BaseEncoder class SELayer(nn.Module): def __init__(self, channel, reduction=8): @@ -71,7 +55,7 @@ class SEBasicBlock(nn.Module): return out -class ResNetSpeakerEncoder(nn.Module): +class ResNetSpeakerEncoder(BaseEncoder): """Implementation of the model H/ASP without batch normalization in speaker embedding. 
This model was proposed in: https://arxiv.org/abs/2009.14153 Adapted from: https://github.com/clovaai/voxceleb_trainer """ @@ -110,32 +94,7 @@ class ResNetSpeakerEncoder(nn.Module): self.instancenorm = nn.InstanceNorm1d(input_dim) if self.use_torch_spec: - self.torch_spec = torch.nn.Sequential( - PreEmphasis(audio_config["preemphasis"]), - # TorchSTFT( - # n_fft=audio_config["fft_size"], - # hop_length=audio_config["hop_length"], - # win_length=audio_config["win_length"], - # sample_rate=audio_config["sample_rate"], - # window="hamming_window", - # mel_fmin=0.0, - # mel_fmax=None, - # use_htk=True, - # do_amp_to_db=False, - # n_mels=audio_config["num_mels"], - # power=2.0, - # use_mel=True, - # mel_norm=None, - # ) - torchaudio.transforms.MelSpectrogram( - sample_rate=audio_config["sample_rate"], - n_fft=audio_config["fft_size"], - win_length=audio_config["win_length"], - hop_length=audio_config["hop_length"], - window_fn=torch.hamming_window, - n_mels=audio_config["num_mels"], - ), - ) + self.torch_spec = self.get_torch_mel_spectrogram_class(audio_config) else: self.torch_spec = None @@ -238,47 +197,3 @@ class ResNetSpeakerEncoder(nn.Module): if l2_norm: x = torch.nn.functional.normalize(x, p=2, dim=1) return x - - @torch.no_grad() - def inference(self, x, l2_norm=False): - return self.forward(x, l2_norm) - - @torch.no_grad() - def compute_embedding(self, x, num_frames=250, num_eval=10, return_mean=True, l2_norm=True): - """ - Generate embeddings for a batch of utterances - x: 1xTxD - """ - # map to the waveform size - if self.use_torch_spec: - num_frames = num_frames * self.audio_config["hop_length"] - - max_len = x.shape[1] - - if max_len < num_frames: - num_frames = max_len - - offsets = np.linspace(0, max_len - num_frames, num=num_eval) - - frames_batch = [] - for offset in offsets: - offset = int(offset) - end_offset = int(offset + num_frames) - frames = x[:, offset:end_offset] - frames_batch.append(frames) - - frames_batch = torch.cat(frames_batch, dim=0) - embeddings = self.inference(frames_batch, l2_norm=l2_norm) - - if return_mean: - embeddings = torch.mean(embeddings, dim=0, keepdim=True) - return embeddings - - def load_checkpoint(self, config: dict, checkpoint_path: str, eval: bool = False, use_cuda: bool = False): - state = load_fsspec(checkpoint_path, map_location=torch.device("cpu")) - self.load_state_dict(state["model"]) - if use_cuda: - self.cuda() - if eval: - self.eval() - assert not self.training diff --git a/TTS/speaker_encoder/requirements.txt b/TTS/encoder/requirements.txt similarity index 100% rename from TTS/speaker_encoder/requirements.txt rename to TTS/encoder/requirements.txt diff --git a/TTS/speaker_encoder/utils/__init__.py b/TTS/encoder/utils/__init__.py similarity index 100% rename from TTS/speaker_encoder/utils/__init__.py rename to TTS/encoder/utils/__init__.py diff --git a/TTS/speaker_encoder/utils/generic_utils.py b/TTS/encoder/utils/generic_utils.py similarity index 80% rename from TTS/speaker_encoder/utils/generic_utils.py rename to TTS/encoder/utils/generic_utils.py index 4ab4e923..17f1c3d9 100644 --- a/TTS/speaker_encoder/utils/generic_utils.py +++ b/TTS/encoder/utils/generic_utils.py @@ -3,60 +3,15 @@ import glob import os import random import re -from multiprocessing import Manager import numpy as np from scipy import signal -from TTS.speaker_encoder.models.lstm import LSTMSpeakerEncoder -from TTS.speaker_encoder.models.resnet import ResNetSpeakerEncoder +from TTS.encoder.models.lstm import LSTMSpeakerEncoder +from TTS.encoder.models.resnet import 
ResNetSpeakerEncoder from TTS.utils.io import save_fsspec -class Storage(object): - def __init__(self, maxsize, storage_batchs, num_speakers_in_batch, num_threads=8): - # use multiprocessing for threading safe - self.storage = Manager().list() - self.maxsize = maxsize - self.num_speakers_in_batch = num_speakers_in_batch - self.num_threads = num_threads - self.ignore_last_batch = False - - if storage_batchs >= 3: - self.ignore_last_batch = True - - # used for fast random sample - self.safe_storage_size = self.maxsize - self.num_threads - if self.ignore_last_batch: - self.safe_storage_size -= self.num_speakers_in_batch - - def __len__(self): - return len(self.storage) - - def full(self): - return len(self.storage) >= self.maxsize - - def append(self, item): - # if storage is full, remove an item - if self.full(): - self.storage.pop(0) - - self.storage.append(item) - - def get_random_sample(self): - # safe storage size considering all threads remove one item from storage in same time - storage_size = len(self.storage) - self.num_threads - - if self.ignore_last_batch: - storage_size -= self.num_speakers_in_batch - - return self.storage[random.randint(0, storage_size)] - - def get_random_sample_fast(self): - """Call this method only when storage is full""" - return self.storage[random.randint(0, self.safe_storage_size)] - - class AugmentWAV(object): def __init__(self, ap, augmentation_config): @@ -209,7 +164,7 @@ def save_checkpoint(model, optimizer, criterion, model_loss, out_path, current_s save_fsspec(state, checkpoint_path) -def save_best_model(model, optimizer, criterion, model_loss, best_loss, out_path, current_step): +def save_best_model(model, optimizer, criterion, model_loss, best_loss, out_path, current_step, epoch): if model_loss < best_loss: new_state_dict = model.state_dict() state = { @@ -217,6 +172,7 @@ def save_best_model(model, optimizer, criterion, model_loss, best_loss, out_path "optimizer": optimizer.state_dict(), "criterion": criterion.state_dict(), "step": current_step, + "epoch": epoch, "loss": model_loss, "date": datetime.date.today().strftime("%B %d, %Y"), } diff --git a/TTS/speaker_encoder/utils/io.py b/TTS/encoder/utils/io.py similarity index 100% rename from TTS/speaker_encoder/utils/io.py rename to TTS/encoder/utils/io.py diff --git a/TTS/speaker_encoder/utils/prepare_voxceleb.py b/TTS/encoder/utils/prepare_voxceleb.py similarity index 100% rename from TTS/speaker_encoder/utils/prepare_voxceleb.py rename to TTS/encoder/utils/prepare_voxceleb.py diff --git a/TTS/encoder/utils/samplers.py b/TTS/encoder/utils/samplers.py new file mode 100644 index 00000000..947f5da0 --- /dev/null +++ b/TTS/encoder/utils/samplers.py @@ -0,0 +1,102 @@ +import random +from torch.utils.data.sampler import Sampler, SubsetRandomSampler + + +class SubsetSampler(Sampler): + """ + Samples elements sequentially from a given list of indices. + + Args: + indices (list): a sequence of indices + """ + + def __init__(self, indices): + super().__init__(indices) + self.indices = indices + + def __iter__(self): + return (self.indices[i] for i in range(len(self.indices))) + + def __len__(self): + return len(self.indices) + + +class PerfectBatchSampler(Sampler): + """ + Samples a mini-batch of indices for a balanced class batching + + Args: + dataset_items(list): dataset items to sample from. + classes (list): list of classes of dataset_items to sample from. + batch_size (int): total number of samples to be sampled in a mini-batch. + num_gpus (int): number of GPU in the data parallel mode. 
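PerfectBatchSampler yields each mini-batch class-interleaved: with, say, 3 classes per batch and 2 utterances per class the labels come out as [c0, c1, c2, c0, c1, c2]. The train() and evaluation() loops above then regroup the batch so that all utterances of a class are contiguous before the loss views it as (num_classes_in_batch, num_utter_per_class, -1). A small sketch of that regrouping, with made-up sizes and features:

    import torch

    # Hypothetical batch: 3 classes per batch, 2 utterances per class,
    # laid out class-interleaved as the sampler produces them.
    num_classes_in_batch, num_utter_per_class = 3, 2
    labels = torch.tensor([0, 1, 2, 0, 1, 2])
    feats = torch.arange(6 * 4, dtype=torch.float32).view(6, 4)  # (batch, feat_dim)

    # Same transpose trick as in train()/evaluation().
    labels_g = torch.transpose(
        labels.view(num_utter_per_class, num_classes_in_batch), 0, 1
    ).reshape(labels.shape)
    feats_g = torch.transpose(
        feats.view(num_utter_per_class, num_classes_in_batch, -1), 0, 1
    ).reshape(feats.shape)

    print(labels_g)  # tensor([0, 0, 1, 1, 2, 2])
    loss_input = feats_g.view(num_classes_in_batch, feats_g.shape[0] // num_classes_in_batch, -1)
    print(loss_input.shape)  # torch.Size([3, 2, 4])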
+ shuffle (bool): if True, samples randomly, otherwise samples sequentially. + drop_last (bool): if True, drops last incomplete batch. + """ + + def __init__(self, dataset_items, classes, batch_size, num_classes_in_batch, num_gpus=1, shuffle=True, drop_last=False, label_key="class_name"): + super().__init__(dataset_items) + assert batch_size % (num_classes_in_batch * num_gpus) == 0, ( + 'Batch size must be divisible by number of classes times the number of data parallel devices (if enabled).') + + label_indices = {} + for idx, item in enumerate(dataset_items): + label = item[label_key] + if label not in label_indices.keys(): + label_indices[label] = [idx] + else: + label_indices[label].append(idx) + + if shuffle: + self._samplers = [SubsetRandomSampler(label_indices[key]) for key in classes] + else: + self._samplers = [SubsetSampler(label_indices[key]) for key in classes] + + self._batch_size = batch_size + self._drop_last = drop_last + self._dp_devices = num_gpus + self._num_classes_in_batch = num_classes_in_batch + + def __iter__(self): + + batch = [] + if self._num_classes_in_batch != len(self._samplers): + valid_samplers_idx = random.sample(range(len(self._samplers)), self._num_classes_in_batch) + else: + valid_samplers_idx = None + + iters = [iter(s) for s in self._samplers] + done = False + + while True: + b = [] + for i, it in enumerate(iters): + if valid_samplers_idx is not None and i not in valid_samplers_idx: + continue + idx = next(it, None) + if idx is None: + done = True + break + b.append(idx) + if done: + break + batch += b + if len(batch) == self._batch_size: + yield batch + batch = [] + if valid_samplers_idx is not None: + valid_samplers_idx = random.sample(range(len(self._samplers)), self._num_classes_in_batch) + + if not self._drop_last: + if len(batch) > 0: + groups = len(batch) // self._num_classes_in_batch + if groups % self._dp_devices == 0: + yield batch + else: + batch = batch[:(groups // self._dp_devices) * self._dp_devices * self._num_classes_in_batch] + if len(batch) > 0: + yield batch + + def __len__(self): + class_batch_size = self._batch_size // self._num_classes_in_batch + return min(((len(s) + class_batch_size - 1) // class_batch_size) for s in self._samplers) diff --git a/TTS/speaker_encoder/utils/training.py b/TTS/encoder/utils/training.py similarity index 100% rename from TTS/speaker_encoder/utils/training.py rename to TTS/encoder/utils/training.py diff --git a/TTS/speaker_encoder/utils/visual.py b/TTS/encoder/utils/visual.py similarity index 69% rename from TTS/speaker_encoder/utils/visual.py rename to TTS/encoder/utils/visual.py index 4f40f68c..f2db2f3f 100644 --- a/TTS/speaker_encoder/utils/visual.py +++ b/TTS/encoder/utils/visual.py @@ -29,14 +29,18 @@ colormap = ( ) -def plot_embeddings(embeddings, num_utter_per_speaker): - embeddings = embeddings[: 10 * num_utter_per_speaker] +def plot_embeddings(embeddings, num_classes_in_batch): + num_utter_per_class = embeddings.shape[0] // num_classes_in_batch + + # if necessary get just the first 10 classes + if num_classes_in_batch > 10: + num_classes_in_batch = 10 + embeddings = embeddings[: num_classes_in_batch * num_utter_per_class] + model = umap.UMAP() projection = model.fit_transform(embeddings) - num_speakers = embeddings.shape[0] // num_utter_per_speaker - ground_truth = np.repeat(np.arange(num_speakers), num_utter_per_speaker) + ground_truth = np.repeat(np.arange(num_classes_in_batch), num_utter_per_class) colors = [colormap[i] for i in ground_truth] - fig, ax = plt.subplots(figsize=(16, 10)) _ = 
ax.scatter(projection[:, 0], projection[:, 1], c=colors) plt.gca().set_aspect("equal", "datalim") diff --git a/TTS/speaker_encoder/configs/config.json b/TTS/speaker_encoder/configs/config.json deleted file mode 100644 index 30d83e51..00000000 --- a/TTS/speaker_encoder/configs/config.json +++ /dev/null @@ -1,118 +0,0 @@ - -{ - "model_name": "lstm", - "run_name": "mueller91", - "run_description": "train speaker encoder with voxceleb1, voxceleb2 and libriSpeech ", - "audio":{ - // Audio processing parameters - "num_mels": 40, // size of the mel spec frame. - "fft_size": 400, // number of stft frequency levels. Size of the linear spectogram frame. - "sample_rate": 16000, // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled. - "win_length": 400, // stft window length in ms. - "hop_length": 160, // stft window hop-lengh in ms. - "frame_length_ms": null, // stft window length in ms.If null, 'win_length' is used. - "frame_shift_ms": null, // stft window hop-lengh in ms. If null, 'hop_length' is used. - "preemphasis": 0.98, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis. - "min_level_db": -100, // normalization range - "ref_level_db": 20, // reference level db, theoretically 20db is the sound of air. - "power": 1.5, // value to sharpen wav signals after GL algorithm. - "griffin_lim_iters": 60,// #griffin-lim iterations. 30-60 is a good range. Larger the value, slower the generation. - // Normalization parameters - "signal_norm": true, // normalize the spec values in range [0, 1] - "symmetric_norm": true, // move normalization to range [-1, 1] - "max_norm": 4.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm] - "clip_norm": true, // clip normalized values into the range. - "mel_fmin": 0.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!! - "mel_fmax": 8000.0, // maximum freq level for mel-spec. Tune for dataset!! - "do_trim_silence": true, // enable trimming of slience of audio as you load it. LJspeech (false), TWEB (false), Nancy (true) - "trim_db": 60, // threshold for timming silence. Set this according to your dataset. - "stats_path": null // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based notmalization is used and other normalization params are ignored - }, - "reinit_layers": [], - "loss": "angleproto", // "ge2e" to use Generalized End-to-End loss and "angleproto" to use Angular Prototypical loss (new SOTA) - "grad_clip": 3.0, // upper limit for gradients for clipping. - "epochs": 1000, // total number of epochs to train. - "lr": 0.0001, // Initial learning rate. If Noam decay is active, maximum learning rate. - "lr_decay": false, // if true, Noam learning rate decaying is applied through training. - "warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr" - "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging. - "steps_plot_stats": 10, // number of steps to plot embeddings. - "num_speakers_in_batch": 64, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'. 
- "num_utters_per_speaker": 10, // - "skip_speakers": false, // skip speakers with samples less than "num_utters_per_speaker" - - "voice_len": 1.6, // number of seconds for each training instance - "num_loader_workers": 8, // number of training data loader processes. Don't set it too big. 4-8 are good values. - "wd": 0.000001, // Weight decay weight. - "checkpoint": true, // If true, it saves checkpoints per "save_step" - "save_step": 1000, // Number of training steps expected to save traning stats and checkpoints. - "print_step": 20, // Number of steps to log traning on console. - "output_path": "../../MozillaTTSOutput/checkpoints/voxceleb_librispeech/speaker_encoder/", // DATASET-RELATED: output path for all training outputs. - "model": { - "input_dim": 40, - "proj_dim": 256, - "lstm_dim": 768, - "num_lstm_layers": 3, - "use_lstm_with_projection": true - }, - - "audio_augmentation": { - "p": 0, - //add a gaussian noise to the data in order to increase robustness - "gaussian":{ // as the insertion of Gaussian noise is quick to be calculated, we added it after loading the wav file, this way, even audios that were reused with the cache can receive this noise - "p": 1, // propability of apply this method, 0 is disable - "min_amplitude": 0.0, - "max_amplitude": 1e-5 - } - }, - "storage": { - "sample_from_storage_p": 0.66, // the probability with which we'll sample from the DataSet in-memory storage - "storage_size": 15, // the size of the in-memory storage with respect to a single batch - "additive_noise": 1e-5 // add very small gaussian noise to the data in order to increase robustness - }, - "datasets": - [ - { - "name": "vctk_slim", - "path": "../../../audio-datasets/en/VCTK-Corpus/", - "meta_file_train": null, - "meta_file_val": null - }, - { - "name": "libri_tts", - "path": "../../../audio-datasets/en/LibriTTS/train-clean-100", - "meta_file_train": null, - "meta_file_val": null - }, - { - "name": "libri_tts", - "path": "../../../audio-datasets/en/LibriTTS/train-clean-360", - "meta_file_train": null, - "meta_file_val": null - }, - { - "name": "libri_tts", - "path": "../../../audio-datasets/en/LibriTTS/train-other-500", - "meta_file_train": null, - "meta_file_val": null - }, - { - "name": "voxceleb1", - "path": "../../../audio-datasets/en/voxceleb1/", - "meta_file_train": null, - "meta_file_val": null - }, - { - "name": "voxceleb2", - "path": "../../../audio-datasets/en/voxceleb2/", - "meta_file_train": null, - "meta_file_val": null - }, - { - "name": "common_voice", - "path": "../../../audio-datasets/en/MozillaCommonVoice", - "meta_file_train": "train.tsv", - "meta_file_val": "test.tsv" - } - ] -} \ No newline at end of file diff --git a/TTS/speaker_encoder/configs/config_resnet_angleproto.json b/TTS/speaker_encoder/configs/config_resnet_angleproto.json deleted file mode 100644 index c26d29ce..00000000 --- a/TTS/speaker_encoder/configs/config_resnet_angleproto.json +++ /dev/null @@ -1,956 +0,0 @@ -{ - "model": "speaker_encoder", - "run_name": "speaker_encoder", - "run_description": "resnet speaker encoder trained with commonvoice all languages dev and train, Voxceleb 1 dev and Voxceleb 2 dev", - // AUDIO PARAMETERS - "audio":{ - // Audio processing parameters - "num_mels": 80, // size of the mel spec frame. - "fft_size": 1024, // number of stft frequency levels. Size of the linear spectogram frame. - "sample_rate": 16000, // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled. - "win_length": 1024, // stft window length in ms. 
- "hop_length": 256, // stft window hop-lengh in ms. - "frame_length_ms": null, // stft window length in ms.If null, 'win_length' is used. - "frame_shift_ms": null, // stft window hop-lengh in ms. If null, 'hop_length' is used. - "preemphasis": 0.98, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis. - "min_level_db": -100, // normalization range - "ref_level_db": 20, // reference level db, theoretically 20db is the sound of air. - "power": 1.5, // value to sharpen wav signals after GL algorithm. - "griffin_lim_iters": 60,// #griffin-lim iterations. 30-60 is a good range. Larger the value, slower the generation. - "stft_pad_mode": "reflect", - // Normalization parameters - "signal_norm": true, // normalize the spec values in range [0, 1] - "symmetric_norm": true, // move normalization to range [-1, 1] - "max_norm": 4.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm] - "clip_norm": true, // clip normalized values into the range. - "mel_fmin": 0.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!! - "mel_fmax": 8000.0, // maximum freq level for mel-spec. Tune for dataset!! - "spec_gain": 20.0, - "do_trim_silence": false, // enable trimming of slience of audio as you load it. LJspeech (false), TWEB (false), Nancy (true) - "trim_db": 60, // threshold for timming silence. Set this according to your dataset. - "stats_path": null // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based notmalization is used and other normalization params are ignored - }, - "reinit_layers": [], - - "loss": "angleproto", // "ge2e" to use Generalized End-to-End loss, "angleproto" to use Angular Prototypical loss and "softmaxproto" to use Softmax with Angular Prototypical loss - "grad_clip": 3.0, // upper limit for gradients for clipping. - "max_train_step": 1000000, // total number of steps to train. - "lr": 0.0001, // Initial learning rate. If Noam decay is active, maximum learning rate. - "lr_decay": false, // if true, Noam learning rate decaying is applied through training. - "warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr" - "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging. - "steps_plot_stats": 100, // number of steps to plot embeddings. - - // Speakers config - "num_speakers_in_batch": 200, // Batch size for training. - "num_utters_per_speaker": 2, // - "skip_speakers": true, // skip speakers with samples less than "num_utters_per_speaker" - "voice_len": 2, // number of seconds for each training instance - - "num_loader_workers": 4, // number of training data loader processes. Don't set it too big. 4-8 are good values. - "wd": 0.000001, // Weight decay weight. - "checkpoint": true, // If true, it saves checkpoints per "save_step" - "save_step": 1000, // Number of training steps expected to save the best checkpoints in training. - "print_step": 50, // Number of steps to log traning on console. - "output_path": "../checkpoints/speaker_encoder/angleproto/resnet_voxceleb1_and_voxceleb2-and-common-voice-all-using-angleproto/", // DATASET-RELATED: output path for all training outputs. 
- - "audio_augmentation": { - "p": 0.5, // propability of apply this method, 0 is disable rir and additive noise augmentation - "rir":{ - "rir_path": "/workspace/store/ecasanova/ComParE/RIRS_NOISES/simulated_rirs/", - "conv_mode": "full" - }, - "additive":{ - "sounds_path": "/workspace/store/ecasanova/ComParE/musan/", - // list of each of the directories in your data augmentation, if a directory is in "sounds_path" but is not listed here it will be ignored - "speech":{ - "min_snr_in_db": 13, - "max_snr_in_db": 20, - "min_num_noises": 2, - "max_num_noises": 3 - }, - "noise":{ - "min_snr_in_db": 0, - "max_snr_in_db": 15, - "min_num_noises": 1, - "max_num_noises": 1 - }, - "music":{ - "min_snr_in_db": 5, - "max_snr_in_db": 15, - "min_num_noises": 1, - "max_num_noises": 1 - } - }, - //add a gaussian noise to the data in order to increase robustness - "gaussian":{ // as the insertion of Gaussian noise is quick to be calculated, we added it after loading the wav file, this way, even audios that were reused with the cache can receive this noise - "p": 0.5, // propability of apply this method, 0 is disable - "min_amplitude": 0.0, - "max_amplitude": 1e-5 - } - }, - "model_params": { - "model_name": "resnet", - "input_dim": 80, - "proj_dim": 512 - }, - "storage": { - "sample_from_storage_p": 0.5, // the probability with which we'll sample from the DataSet in-memory storage - "storage_size": 35 // the size of the in-memory storage with respect to a single batch - }, - "datasets": - [ - { - "name": "voxceleb2", - "path": "/workspace/scratch/ecasanova/datasets/VoxCeleb/vox2_dev_aac/", - "meta_file_train": null, - "meta_file_val": null - }, - { - "name": "voxceleb1", - "path": "/workspace/scratch/ecasanova/datasets/VoxCeleb/vox1_dev_wav/", - "meta_file_train": null, - "meta_file_val": null - }, - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/fi", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/fi", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/zh-CN", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/zh-CN", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/rm-sursilv", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/rm-sursilv", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/lt", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/lt", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ka", - "meta_file_train": "train.tsv", - "meta_file_val": 
null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ka", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/sv-SE", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/sv-SE", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/pl", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/pl", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ru", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ru", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/mn", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/mn", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/nl", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/nl", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/sl", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/sl", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/es", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/es", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/pt", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/pt", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/hi", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": 
"/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/hi", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ja", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ja", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ia", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ia", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/br", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/br", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/id", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/id", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/dv", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/dv", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ta", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ta", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/or", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/or", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/zh-HK", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/zh-HK", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/de", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/de", - 
"meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/uk", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/uk", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/en", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/en", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/fa", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/fa", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/vi", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/vi", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ab", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ab", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/sah", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/sah", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/vot", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/vot", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/fr", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/fr", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/tr", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/tr", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - 
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/lg", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/lg", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/mt", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/mt", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/rw", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/rw", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/hu", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/hu", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/rm-vallader", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/rm-vallader", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/el", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/el", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/tt", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/tt", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/zh-TW", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/zh-TW", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/et", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/et", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": 
"/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/fy-NL", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/fy-NL", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/cs", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/cs", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/as", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/as", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ro", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ro", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/eo", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/eo", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/pa-IN", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/pa-IN", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/th", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/th", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/it", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/it", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ga-IE", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ga-IE", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": 
"/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/cnh", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/cnh", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ky", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ky", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ar", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ar", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/eu", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/eu", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ca", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ca", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/kab", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/kab", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/cy", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/cy", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/cv", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/cv", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/hsb", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/hsb", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/lv", - 
"meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/lv", - "meta_file_train": "dev.tsv", - "meta_file_val": null - } - - ] -} \ No newline at end of file diff --git a/TTS/speaker_encoder/configs/config_resnet_softmax_angleproto.json b/TTS/speaker_encoder/configs/config_resnet_softmax_angleproto.json deleted file mode 100644 index ccbd751a..00000000 --- a/TTS/speaker_encoder/configs/config_resnet_softmax_angleproto.json +++ /dev/null @@ -1,957 +0,0 @@ - -{ - "model": "speaker_encoder", - "run_name": "speaker_encoder", - "run_description": "resnet speaker encoder trained with commonvoice all languages dev and train, Voxceleb 1 dev and Voxceleb 2 dev", - // AUDIO PARAMETERS - "audio":{ - // Audio processing parameters - "num_mels": 80, // size of the mel spec frame. - "fft_size": 1024, // number of stft frequency levels. Size of the linear spectogram frame. - "sample_rate": 16000, // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled. - "win_length": 1024, // stft window length in ms. - "hop_length": 256, // stft window hop-lengh in ms. - "frame_length_ms": null, // stft window length in ms.If null, 'win_length' is used. - "frame_shift_ms": null, // stft window hop-lengh in ms. If null, 'hop_length' is used. - "preemphasis": 0.98, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis. - "min_level_db": -100, // normalization range - "ref_level_db": 20, // reference level db, theoretically 20db is the sound of air. - "power": 1.5, // value to sharpen wav signals after GL algorithm. - "griffin_lim_iters": 60,// #griffin-lim iterations. 30-60 is a good range. Larger the value, slower the generation. - "stft_pad_mode": "reflect", - // Normalization parameters - "signal_norm": true, // normalize the spec values in range [0, 1] - "symmetric_norm": true, // move normalization to range [-1, 1] - "max_norm": 4.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm] - "clip_norm": true, // clip normalized values into the range. - "mel_fmin": 0.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!! - "mel_fmax": 8000.0, // maximum freq level for mel-spec. Tune for dataset!! - "spec_gain": 20.0, - "do_trim_silence": false, // enable trimming of slience of audio as you load it. LJspeech (false), TWEB (false), Nancy (true) - "trim_db": 60, // threshold for timming silence. Set this according to your dataset. - "stats_path": null // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based notmalization is used and other normalization params are ignored - }, - "reinit_layers": [], - - "loss": "softmaxproto", // "ge2e" to use Generalized End-to-End loss, "angleproto" to use Angular Prototypical loss and "softmaxproto" to use Softmax with Angular Prototypical loss - "grad_clip": 3.0, // upper limit for gradients for clipping. - "max_train_step": 1000000, // total number of steps to train. - "lr": 0.0001, // Initial learning rate. If Noam decay is active, maximum learning rate. - "lr_decay": false, // if true, Noam learning rate decaying is applied through training. - "warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr" - "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. 
Might be memory consuming, but good for debugging. - "steps_plot_stats": 100, // number of steps to plot embeddings. - - // Speakers config - "num_speakers_in_batch": 200, // Batch size for training. - "num_utters_per_speaker": 2, // - "skip_speakers": true, // skip speakers with samples less than "num_utters_per_speaker" - "voice_len": 2, // number of seconds for each training instance - - "num_loader_workers": 8, // number of training data loader processes. Don't set it too big. 4-8 are good values. - "wd": 0.000001, // Weight decay weight. - "checkpoint": true, // If true, it saves checkpoints per "save_step" - "save_step": 1000, // Number of training steps expected to save the best checkpoints in training. - "print_step": 50, // Number of steps to log traning on console. - "output_path": "../../../checkpoints/speaker_encoder/resnet_voxceleb1_and_voxceleb2-and-common-voice-all/", // DATASET-RELATED: output path for all training outputs. - - "audio_augmentation": { - "p": 0.5, // propability of apply this method, 0 is disable rir and additive noise augmentation - "rir":{ - "rir_path": "/workspace/store/ecasanova/ComParE/RIRS_NOISES/simulated_rirs/", - "conv_mode": "full" - }, - "additive":{ - "sounds_path": "/workspace/store/ecasanova/ComParE/musan/", - // list of each of the directories in your data augmentation, if a directory is in "sounds_path" but is not listed here it will be ignored - "speech":{ - "min_snr_in_db": 13, - "max_snr_in_db": 20, - "min_num_noises": 2, - "max_num_noises": 3 - }, - "noise":{ - "min_snr_in_db": 0, - "max_snr_in_db": 15, - "min_num_noises": 1, - "max_num_noises": 1 - }, - "music":{ - "min_snr_in_db": 5, - "max_snr_in_db": 15, - "min_num_noises": 1, - "max_num_noises": 1 - } - }, - //add a gaussian noise to the data in order to increase robustness - "gaussian":{ // as the insertion of Gaussian noise is quick to be calculated, we added it after loading the wav file, this way, even audios that were reused with the cache can receive this noise - "p": 0.5, // propability of apply this method, 0 is disable - "min_amplitude": 0.0, - "max_amplitude": 1e-5 - } - }, - "model_params": { - "model_name": "resnet", - "input_dim": 80, - "proj_dim": 512 - }, - "storage": { - "sample_from_storage_p": 0.66, // the probability with which we'll sample from the DataSet in-memory storage - "storage_size": 35 // the size of the in-memory storage with respect to a single batch - }, - "datasets": - [ - { - "name": "voxceleb2", - "path": "/workspace/scratch/ecasanova/datasets/VoxCeleb/vox2_dev_aac/", - "meta_file_train": null, - "meta_file_val": null - }, - { - "name": "voxceleb1", - "path": "/workspace/scratch/ecasanova/datasets/VoxCeleb/vox1_dev_wav/", - "meta_file_train": null, - "meta_file_val": null - }, - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/fi", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/fi", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/zh-CN", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/zh-CN", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": 
"common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/rm-sursilv", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/rm-sursilv", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/lt", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/lt", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ka", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ka", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/sv-SE", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/sv-SE", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/pl", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/pl", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ru", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ru", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/mn", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/mn", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/nl", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/nl", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/sl", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/sl", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": 
"/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/es", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/es", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/pt", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/pt", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/hi", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/hi", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ja", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ja", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ia", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ia", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/br", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/br", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/id", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/id", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/dv", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/dv", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ta", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ta", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/or", - 
"meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/or", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/zh-HK", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/zh-HK", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/de", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/de", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/uk", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/uk", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/en", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/en", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/fa", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/fa", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/vi", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/vi", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ab", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ab", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/sah", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/sah", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/vot", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": 
"common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/vot", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/fr", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/fr", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/tr", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/tr", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/lg", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/lg", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/mt", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/mt", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/rw", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/rw", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/hu", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/hu", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/rm-vallader", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/rm-vallader", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/el", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/el", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/tt", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": 
"/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/tt", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/zh-TW", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/zh-TW", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/et", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/et", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/fy-NL", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/fy-NL", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/cs", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/cs", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/as", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/as", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ro", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ro", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/eo", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/eo", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/pa-IN", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/pa-IN", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/th", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": 
"/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/th", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/it", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/it", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ga-IE", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ga-IE", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/cnh", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/cnh", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ky", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ky", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ar", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ar", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/eu", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/eu", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ca", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ca", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/kab", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/kab", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/cy", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/cy", 
- "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/cv", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/cv", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/hsb", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/hsb", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/lv", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/lv", - "meta_file_train": "dev.tsv", - "meta_file_val": null - } - - ] -} \ No newline at end of file diff --git a/TTS/speaker_encoder/dataset.py b/TTS/speaker_encoder/dataset.py deleted file mode 100644 index 07fa9246..00000000 --- a/TTS/speaker_encoder/dataset.py +++ /dev/null @@ -1,243 +0,0 @@ -import random - -import numpy as np -import torch -from torch.utils.data import Dataset - -from TTS.speaker_encoder.utils.generic_utils import AugmentWAV, Storage - - -class SpeakerEncoderDataset(Dataset): - def __init__( - self, - ap, - meta_data, - voice_len=1.6, - num_speakers_in_batch=64, - storage_size=1, - sample_from_storage_p=0.5, - num_utter_per_speaker=10, - skip_speakers=False, - verbose=False, - augmentation_config=None, - use_torch_spec=None, - ): - """ - Args: - ap (TTS.tts.utils.AudioProcessor): audio processor object. - meta_data (list): list of dataset instances. - seq_len (int): voice segment length in seconds. - verbose (bool): print diagnostic information. 
- """ - super().__init__() - self.items = meta_data - self.sample_rate = ap.sample_rate - self.seq_len = int(voice_len * self.sample_rate) - self.num_speakers_in_batch = num_speakers_in_batch - self.num_utter_per_speaker = num_utter_per_speaker - self.skip_speakers = skip_speakers - self.ap = ap - self.verbose = verbose - self.use_torch_spec = use_torch_spec - self.__parse_items() - storage_max_size = storage_size * num_speakers_in_batch - self.storage = Storage( - maxsize=storage_max_size, storage_batchs=storage_size, num_speakers_in_batch=num_speakers_in_batch - ) - self.sample_from_storage_p = float(sample_from_storage_p) - - speakers_aux = list(self.speakers) - speakers_aux.sort() - self.speakerid_to_classid = {key: i for i, key in enumerate(speakers_aux)} - - # Augmentation - self.augmentator = None - self.gaussian_augmentation_config = None - if augmentation_config: - self.data_augmentation_p = augmentation_config["p"] - if self.data_augmentation_p and ("additive" in augmentation_config or "rir" in augmentation_config): - self.augmentator = AugmentWAV(ap, augmentation_config) - - if "gaussian" in augmentation_config.keys(): - self.gaussian_augmentation_config = augmentation_config["gaussian"] - - if self.verbose: - print("\n > DataLoader initialization") - print(f" | > Speakers per Batch: {num_speakers_in_batch}") - print(f" | > Storage Size: {storage_max_size} instances, each with {num_utter_per_speaker} utters") - print(f" | > Sample_from_storage_p : {self.sample_from_storage_p}") - print(f" | > Number of instances : {len(self.items)}") - print(f" | > Sequence length: {self.seq_len}") - print(f" | > Num speakers: {len(self.speakers)}") - - def load_wav(self, filename): - audio = self.ap.load_wav(filename, sr=self.ap.sample_rate) - return audio - - def __parse_items(self): - self.speaker_to_utters = {} - for i in self.items: - path_ = i["audio_file"] - speaker_ = i["speaker_name"] - if speaker_ in self.speaker_to_utters.keys(): - self.speaker_to_utters[speaker_].append(path_) - else: - self.speaker_to_utters[speaker_] = [ - path_, - ] - - if self.skip_speakers: - self.speaker_to_utters = { - k: v for (k, v) in self.speaker_to_utters.items() if len(v) >= self.num_utter_per_speaker - } - - self.speakers = [k for (k, v) in self.speaker_to_utters.items()] - - def __len__(self): - return int(1e10) - - def get_num_speakers(self): - return len(self.speakers) - - def __sample_speaker(self, ignore_speakers=None): - speaker = random.sample(self.speakers, 1)[0] - # if list of speakers_id is provide make sure that it's will be ignored - if ignore_speakers and self.speakerid_to_classid[speaker] in ignore_speakers: - while True: - speaker = random.sample(self.speakers, 1)[0] - if self.speakerid_to_classid[speaker] not in ignore_speakers: - break - - if self.num_utter_per_speaker > len(self.speaker_to_utters[speaker]): - utters = random.choices(self.speaker_to_utters[speaker], k=self.num_utter_per_speaker) - else: - utters = random.sample(self.speaker_to_utters[speaker], self.num_utter_per_speaker) - return speaker, utters - - def __sample_speaker_utterances(self, speaker): - """ - Sample all M utterances for the given speaker. 
- """ - wavs = [] - labels = [] - for _ in range(self.num_utter_per_speaker): - # TODO:dummy but works - while True: - # remove speakers that have num_utter less than 2 - if len(self.speaker_to_utters[speaker]) > 1: - utter = random.sample(self.speaker_to_utters[speaker], 1)[0] - else: - if speaker in self.speakers: - self.speakers.remove(speaker) - - speaker, _ = self.__sample_speaker() - continue - - wav = self.load_wav(utter) - if wav.shape[0] - self.seq_len > 0: - break - - if utter in self.speaker_to_utters[speaker]: - self.speaker_to_utters[speaker].remove(utter) - - if self.augmentator is not None and self.data_augmentation_p: - if random.random() < self.data_augmentation_p: - wav = self.augmentator.apply_one(wav) - - wavs.append(wav) - labels.append(self.speakerid_to_classid[speaker]) - return wavs, labels - - def __getitem__(self, idx): - speaker, _ = self.__sample_speaker() - speaker_id = self.speakerid_to_classid[speaker] - return speaker, speaker_id - - def __load_from_disk_and_storage(self, speaker): - # don't sample from storage, but from HDD - wavs_, labels_ = self.__sample_speaker_utterances(speaker) - # put the newly loaded item into storage - self.storage.append((wavs_, labels_)) - return wavs_, labels_ - - def collate_fn(self, batch): - # get the batch speaker_ids - batch = np.array(batch) - speakers_id_in_batch = set(batch[:, 1].astype(np.int32)) - - labels = [] - feats = [] - speakers = set() - - for speaker, speaker_id in batch: - speaker_id = int(speaker_id) - - # ensure that an speaker appears only once in the batch - if speaker_id in speakers: - - # remove current speaker - if speaker_id in speakers_id_in_batch: - speakers_id_in_batch.remove(speaker_id) - - speaker, _ = self.__sample_speaker(ignore_speakers=speakers_id_in_batch) - speaker_id = self.speakerid_to_classid[speaker] - speakers_id_in_batch.add(speaker_id) - - if random.random() < self.sample_from_storage_p and self.storage.full(): - # sample from storage (if full) - wavs_, labels_ = self.storage.get_random_sample_fast() - - # force choose the current speaker or other not in batch - # It's necessary for ideal training with AngleProto and GE2E losses - if labels_[0] in speakers_id_in_batch and labels_[0] != speaker_id: - attempts = 0 - while True: - wavs_, labels_ = self.storage.get_random_sample_fast() - if labels_[0] == speaker_id or labels_[0] not in speakers_id_in_batch: - break - - attempts += 1 - # Try 5 times after that load from disk - if attempts >= 5: - wavs_, labels_ = self.__load_from_disk_and_storage(speaker) - break - else: - # don't sample from storage, but from HDD - wavs_, labels_ = self.__load_from_disk_and_storage(speaker) - - # append speaker for control - speakers.add(labels_[0]) - - # remove current speaker and append other - if speaker_id in speakers_id_in_batch: - speakers_id_in_batch.remove(speaker_id) - - speakers_id_in_batch.add(labels_[0]) - - # get a random subset of each of the wavs and extract mel spectrograms. 
- feats_ = [] - for wav in wavs_: - offset = random.randint(0, wav.shape[0] - self.seq_len) - wav = wav[offset : offset + self.seq_len] - # add random gaussian noise - if self.gaussian_augmentation_config and self.gaussian_augmentation_config["p"]: - if random.random() < self.gaussian_augmentation_config["p"]: - wav += np.random.normal( - self.gaussian_augmentation_config["min_amplitude"], - self.gaussian_augmentation_config["max_amplitude"], - size=len(wav), - ) - - if not self.use_torch_spec: - mel = self.ap.melspectrogram(wav) - feats_.append(torch.FloatTensor(mel)) - else: - feats_.append(torch.FloatTensor(wav)) - - labels.append(torch.LongTensor(labels_)) - feats.extend(feats_) - - feats = torch.stack(feats) - labels = torch.stack(labels) - - return feats, labels diff --git a/TTS/speaker_encoder/models/lstm.py b/TTS/speaker_encoder/models/lstm.py deleted file mode 100644 index ec394cdb..00000000 --- a/TTS/speaker_encoder/models/lstm.py +++ /dev/null @@ -1,189 +0,0 @@ -import numpy as np -import torch -import torchaudio -from torch import nn - -from TTS.speaker_encoder.models.resnet import PreEmphasis -from TTS.utils.io import load_fsspec - - -class LSTMWithProjection(nn.Module): - def __init__(self, input_size, hidden_size, proj_size): - super().__init__() - self.input_size = input_size - self.hidden_size = hidden_size - self.proj_size = proj_size - self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True) - self.linear = nn.Linear(hidden_size, proj_size, bias=False) - - def forward(self, x): - self.lstm.flatten_parameters() - o, (_, _) = self.lstm(x) - return self.linear(o) - - -class LSTMWithoutProjection(nn.Module): - def __init__(self, input_dim, lstm_dim, proj_dim, num_lstm_layers): - super().__init__() - self.lstm = nn.LSTM(input_size=input_dim, hidden_size=lstm_dim, num_layers=num_lstm_layers, batch_first=True) - self.linear = nn.Linear(lstm_dim, proj_dim, bias=True) - self.relu = nn.ReLU() - - def forward(self, x): - _, (hidden, _) = self.lstm(x) - return self.relu(self.linear(hidden[-1])) - - -class LSTMSpeakerEncoder(nn.Module): - def __init__( - self, - input_dim, - proj_dim=256, - lstm_dim=768, - num_lstm_layers=3, - use_lstm_with_projection=True, - use_torch_spec=False, - audio_config=None, - ): - super().__init__() - self.use_lstm_with_projection = use_lstm_with_projection - self.use_torch_spec = use_torch_spec - self.audio_config = audio_config - self.proj_dim = proj_dim - - layers = [] - # choise LSTM layer - if use_lstm_with_projection: - layers.append(LSTMWithProjection(input_dim, lstm_dim, proj_dim)) - for _ in range(num_lstm_layers - 1): - layers.append(LSTMWithProjection(proj_dim, lstm_dim, proj_dim)) - self.layers = nn.Sequential(*layers) - else: - self.layers = LSTMWithoutProjection(input_dim, lstm_dim, proj_dim, num_lstm_layers) - - self.instancenorm = nn.InstanceNorm1d(input_dim) - - if self.use_torch_spec: - self.torch_spec = torch.nn.Sequential( - PreEmphasis(audio_config["preemphasis"]), - # TorchSTFT( - # n_fft=audio_config["fft_size"], - # hop_length=audio_config["hop_length"], - # win_length=audio_config["win_length"], - # sample_rate=audio_config["sample_rate"], - # window="hamming_window", - # mel_fmin=0.0, - # mel_fmax=None, - # use_htk=True, - # do_amp_to_db=False, - # n_mels=audio_config["num_mels"], - # power=2.0, - # use_mel=True, - # mel_norm=None, - # ) - torchaudio.transforms.MelSpectrogram( - sample_rate=audio_config["sample_rate"], - n_fft=audio_config["fft_size"], - win_length=audio_config["win_length"], - 
hop_length=audio_config["hop_length"], - window_fn=torch.hamming_window, - n_mels=audio_config["num_mels"], - ), - ) - else: - self.torch_spec = None - - self._init_layers() - - def _init_layers(self): - for name, param in self.layers.named_parameters(): - if "bias" in name: - nn.init.constant_(param, 0.0) - elif "weight" in name: - nn.init.xavier_normal_(param) - - def forward(self, x, l2_norm=True): - """Forward pass of the model. - - Args: - x (Tensor): Raw waveform signal or spectrogram frames. If input is a waveform, `torch_spec` must be `True` - to compute the spectrogram on-the-fly. - l2_norm (bool): Whether to L2-normalize the outputs. - - Shapes: - - x: :math:`(N, 1, T_{in})` or :math:`(N, D_{spec}, T_{in})` - """ - with torch.no_grad(): - with torch.cuda.amp.autocast(enabled=False): - if self.use_torch_spec: - x.squeeze_(1) - x = self.torch_spec(x) - x = self.instancenorm(x).transpose(1, 2) - d = self.layers(x) - if self.use_lstm_with_projection: - d = d[:, -1] - if l2_norm: - d = torch.nn.functional.normalize(d, p=2, dim=1) - return d - - @torch.no_grad() - def inference(self, x, l2_norm=True): - d = self.forward(x, l2_norm=l2_norm) - return d - - def compute_embedding(self, x, num_frames=250, num_eval=10, return_mean=True): - """ - Generate embeddings for a batch of utterances - x: 1xTxD - """ - max_len = x.shape[1] - - if max_len < num_frames: - num_frames = max_len - - offsets = np.linspace(0, max_len - num_frames, num=num_eval) - - frames_batch = [] - for offset in offsets: - offset = int(offset) - end_offset = int(offset + num_frames) - frames = x[:, offset:end_offset] - frames_batch.append(frames) - - frames_batch = torch.cat(frames_batch, dim=0) - embeddings = self.inference(frames_batch) - - if return_mean: - embeddings = torch.mean(embeddings, dim=0, keepdim=True) - - return embeddings - - def batch_compute_embedding(self, x, seq_lens, num_frames=160, overlap=0.5): - """ - Generate embeddings for a batch of utterances - x: BxTxD - """ - num_overlap = num_frames * overlap - max_len = x.shape[1] - embed = None - num_iters = seq_lens / (num_frames - num_overlap) - cur_iter = 0 - for offset in range(0, max_len, num_frames - num_overlap): - cur_iter += 1 - end_offset = min(x.shape[1], offset + num_frames) - frames = x[:, offset:end_offset] - if embed is None: - embed = self.inference(frames) - else: - embed[cur_iter <= num_iters, :] += self.inference(frames[cur_iter <= num_iters, :, :]) - return embed / num_iters - - # pylint: disable=unused-argument, redefined-builtin - def load_checkpoint(self, config: dict, checkpoint_path: str, eval: bool = False, use_cuda: bool = False): - state = load_fsspec(checkpoint_path, map_location=torch.device("cpu")) - self.load_state_dict(state["model"]) - if use_cuda: - self.cuda() - if eval: - self.eval() - assert not self.training diff --git a/TTS/speaker_encoder/umap.png b/TTS/speaker_encoder/umap.png deleted file mode 100644 index ca8aefeac8cbe616983b35e968c9c9133eb41ede..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 24616 zcma&OWmHvR*EYHd38hm)K^g(+PU#To?(UH8P`bNAx=TuGBhuZufh{229h*1{pZER7 z8Q*z+9L8n<8`fI)J@1*sWp0s;sGcY^{CyptYL9146v zb(Gd}27xfco0vDH=KF8G)!m|%zo zhVni_tl}HSUCdBeK@86scs*+DW^Dg^Kum!uHnw7%FGC!G_LA}rxKJ*maPc~`jd@$( zm~4A%YfEvdSU!{E`uag^+%sqUtx$=4qCt+DH2p_$aq*-iJQFx%}u%%7MmRmsBq|+m^jsS|6cgM48rP*K#e%& zucMxY(aa8iJ?5W`uDEW59DXjD*5s4yxmi?@Mme35W3JL|nbQ0c|0-BCqnMWz!=x9n z_d0taVb2#KFd(0fD|1wn_2a28ZPS-a@R}nJKR-m%OIBF$W}Z;TT@(LYkdwReecC3A 
zXwf<;r0W9HyL$N(7sRQTt9M8e{{uUX%PjJ8+6Tk`77z z=4W7@f(b+Sh*Ym?gU^BmpzG$1CyMC?d*B6OVP(2d04hk5us2W+V{1u~*@?GI*;7$-#Y|p5=oM!$sCQubEj`>Yn!KUiSPxQBhHW-CNBFG=0u3^__#T v`K&)xvQ7+L3|h!*T71O+;zRzrLbi<(A List[List[str]]: # pylint: disable=unused-argument """Normalizes the Baker meta data file to TTS format diff --git a/TTS/tts/utils/speakers.py b/TTS/tts/utils/speakers.py index c15a3abf..1a5da94a 100644 --- a/TTS/tts/utils/speakers.py +++ b/TTS/tts/utils/speakers.py @@ -9,7 +9,7 @@ import torch from coqpit import Coqpit from TTS.config import get_from_config_or_model_args_with_default, load_config -from TTS.speaker_encoder.utils.generic_utils import setup_speaker_encoder_model +from TTS.encoder.utils.generic_utils import setup_speaker_encoder_model from TTS.utils.audio import AudioProcessor @@ -269,7 +269,7 @@ class SpeakerManager: """ self.speaker_encoder_config = load_config(config_path) self.speaker_encoder = setup_speaker_encoder_model(self.speaker_encoder_config) - self.speaker_encoder.load_checkpoint(config_path, model_path, eval=True, use_cuda=self.use_cuda) + self.speaker_encoder_criterion = self.speaker_encoder.load_checkpoint(self.speaker_encoder_config, model_path, eval=True, use_cuda=self.use_cuda) self.speaker_encoder_ap = AudioProcessor(**self.speaker_encoder_config.audio) def compute_d_vector_from_clip(self, wav_file: Union[str, List[str]]) -> list: diff --git a/tests/aux_tests/test_speaker_encoder.py b/tests/aux_tests/test_speaker_encoder.py index 97b3b92f..f2875cc1 100644 --- a/tests/aux_tests/test_speaker_encoder.py +++ b/tests/aux_tests/test_speaker_encoder.py @@ -3,9 +3,9 @@ import unittest import torch as T from tests import get_tests_input_path -from TTS.speaker_encoder.losses import AngleProtoLoss, GE2ELoss, SoftmaxAngleProtoLoss -from TTS.speaker_encoder.models.lstm import LSTMSpeakerEncoder -from TTS.speaker_encoder.models.resnet import ResNetSpeakerEncoder +from TTS.encoder.losses import AngleProtoLoss, GE2ELoss, SoftmaxAngleProtoLoss +from TTS.encoder.models.lstm import LSTMSpeakerEncoder +from TTS.encoder.models.resnet import ResNetSpeakerEncoder file_path = get_tests_input_path() diff --git a/tests/aux_tests/test_speaker_encoder_train.py b/tests/aux_tests/test_speaker_encoder_train.py index 7901fe5a..d9d6d71e 100644 --- a/tests/aux_tests/test_speaker_encoder_train.py +++ b/tests/aux_tests/test_speaker_encoder_train.py @@ -4,14 +4,14 @@ import shutil from tests import get_device_id, get_tests_output_path, run_cli from TTS.config.shared_configs import BaseAudioConfig -from TTS.speaker_encoder.speaker_encoder_config import SpeakerEncoderConfig +from TTS.encoder.configs.speaker_encoder_config import SpeakerEncoderConfig def run_test_train(): command = ( f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_encoder.py --config_path {config_path} " f"--coqpit.output_path {output_path} " - "--coqpit.datasets.0.name ljspeech " + "--coqpit.datasets.0.name ljspeech_test " "--coqpit.datasets.0.meta_file_train metadata.csv " "--coqpit.datasets.0.meta_file_val metadata.csv " "--coqpit.datasets.0.path tests/data/ljspeech " @@ -24,17 +24,21 @@ output_path = os.path.join(get_tests_output_path(), "train_outputs") config = SpeakerEncoderConfig( batch_size=4, - num_speakers_in_batch=1, - num_utters_per_speaker=10, - num_loader_workers=0, - max_train_step=2, + num_classes_in_batch=4, + num_utter_per_class=2, + eval_num_classes_in_batch=4, + eval_num_utter_per_class=2, + num_loader_workers=1, + epochs=1, print_step=1, - save_step=1, + save_step=2, print_eval=True, + run_eval=True, 
audio=BaseAudioConfig(num_mels=80), ) config.audio.do_trim_silence = True config.audio.trim_db = 60 +config.loss = "ge2e" config.save_json(config_path) print(config) @@ -69,14 +73,14 @@ run_cli(command_train) shutil.rmtree(continue_path) # test model with ge2e loss function -config.loss = "ge2e" -config.save_json(config_path) -run_test_train() +# config.loss = "ge2e" +# config.save_json(config_path) +# run_test_train() # test model with angleproto loss function -config.loss = "angleproto" -config.save_json(config_path) -run_test_train() +# config.loss = "angleproto" +# config.save_json(config_path) +# run_test_train() # test model with softmaxproto loss function config.loss = "softmaxproto" diff --git a/tests/aux_tests/test_speaker_manager.py b/tests/aux_tests/test_speaker_manager.py index fff49b13..5fafb56a 100644 --- a/tests/aux_tests/test_speaker_manager.py +++ b/tests/aux_tests/test_speaker_manager.py @@ -6,8 +6,8 @@ import torch from tests import get_tests_input_path from TTS.config import load_config -from TTS.speaker_encoder.utils.generic_utils import setup_speaker_encoder_model -from TTS.speaker_encoder.utils.io import save_checkpoint +from TTS.encoder.utils.generic_utils import setup_speaker_encoder_model +from TTS.encoder.utils.io import save_checkpoint from TTS.tts.utils.speakers import SpeakerManager from TTS.utils.audio import AudioProcessor diff --git a/tests/data_tests/test_samplers.py b/tests/data_tests/test_samplers.py index 12152fb8..c888c629 100644 --- a/tests/data_tests/test_samplers.py +++ b/tests/data_tests/test_samplers.py @@ -8,6 +8,7 @@ from TTS.config.shared_configs import BaseDatasetConfig from TTS.tts.datasets import load_tts_samples from TTS.tts.utils.languages import get_language_balancer_weights from TTS.tts.utils.speakers import get_speaker_balancer_weights +from TTS.encoder.utils.samplers import PerfectBatchSampler # Fixing random state to avoid random fails torch.manual_seed(0) @@ -82,3 +83,51 @@ class TestSamplers(unittest.TestCase): spk2 += 1 assert is_balanced(spk1, spk2), "Speaker Weighted sampler is supposed to be balanced" + + def test_perfect_sampler(self): # pylint: disable=no-self-use + classes = set() + for item in train_samples: + classes.add(item["speaker_name"]) + + sampler = PerfectBatchSampler( + train_samples, + classes, + batch_size=2 * 3, # total batch size + num_classes_in_batch=2, + label_key="speaker_name", + shuffle=False, + drop_last=True) + batchs = functools.reduce(lambda a, b: a + b, [list(sampler) for i in range(100)]) + for batch in batchs: + spk1, spk2 = 0, 0 + # for in each batch + for index in batch: + if train_samples[index]["speaker_name"] == "ljspeech-0": + spk1 += 1 + else: + spk2 += 1 + assert spk1 == spk2, "PerfectBatchSampler is supposed to be perfectly balanced" + + def test_perfect_sampler_shuffle(self): # pylint: disable=no-self-use + classes = set() + for item in train_samples: + classes.add(item["speaker_name"]) + + sampler = PerfectBatchSampler( + train_samples, + classes, + batch_size=2 * 3, # total batch size + num_classes_in_batch=2, + label_key="speaker_name", + shuffle=True, + drop_last=False) + batchs = functools.reduce(lambda a, b: a + b, [list(sampler) for i in range(100)]) + for batch in batchs: + spk1, spk2 = 0, 0 + # for in each batch + for index in batch: + if train_samples[index]["speaker_name"] == "ljspeech-0": + spk1 += 1 + else: + spk2 += 1 + assert spk1 == spk2, "PerfectBatchSampler is supposed to be perfectly balanced" diff --git a/tests/inputs/test_glow_tts.json 
b/tests/inputs/test_glow_tts.json index 6dd86057..64b09828 100644 --- a/tests/inputs/test_glow_tts.json +++ b/tests/inputs/test_glow_tts.json @@ -66,8 +66,8 @@ "use_mas": false, // use Monotonic Alignment Search if true. Otherwise use pre-computed attention alignments. // TRAINING - "batch_size": 2, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'. - "eval_batch_size":1, + "batch_size": 8, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'. + "eval_batch_size": 8, "r": 1, // Number of decoder frames to predict per iteration. Set the initial values if gradual training is enabled. "loss_masking": true, // enable / disable loss masking against the sequence padding. "data_dep_init_iter": 1, diff --git a/tests/inputs/test_speaker_encoder_config.json b/tests/inputs/test_speaker_encoder_config.json index 09a2f6a4..bfcc17ab 100644 --- a/tests/inputs/test_speaker_encoder_config.json +++ b/tests/inputs/test_speaker_encoder_config.json @@ -36,8 +36,8 @@ "warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr" "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging. "steps_plot_stats": 10, // number of steps to plot embeddings. - "num_speakers_in_batch": 64, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'. - "num_utters_per_speaker": 10, // + "num_classes_in_batch": 64, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'. + "num_utter_per_class": 10, // "num_loader_workers": 8, // number of training data loader processes. Don't set it too big. 4-8 are good values. "wd": 0.000001, // Weight decay weight. "checkpoint": true, // If true, it saves checkpoints per "save_step" diff --git a/tests/inputs/test_tacotron2_config.json b/tests/inputs/test_tacotron2_config.json index 6c82891d..69b23560 100644 --- a/tests/inputs/test_tacotron2_config.json +++ b/tests/inputs/test_tacotron2_config.json @@ -61,8 +61,8 @@ "reinit_layers": [], // give a list of layer names to restore from the given checkpoint. If not defined, it reloads all heuristically matching layers. // TRAINING - "batch_size": 1, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'. - "eval_batch_size":1, + "batch_size": 8, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'. + "eval_batch_size": 8, "r": 7, // Number of decoder frames to predict per iteration. Set the initial values if gradual training is enabled. "gradual_training": [[0, 7, 4], [1, 5, 2]], //set gradual training steps [first_step, r, batch_size]. If it is null, gradual training is disabled. For Tacotron, you might need to reduce the 'batch_size' as you proceeed. "loss_masking": true, // enable / disable loss masking against the sequence padding. diff --git a/tests/inputs/test_tacotron_config.json b/tests/inputs/test_tacotron_config.json index b60ed35e..90e07fc7 100644 --- a/tests/inputs/test_tacotron_config.json +++ b/tests/inputs/test_tacotron_config.json @@ -61,8 +61,8 @@ "reinit_layers": [], // give a list of layer names to restore from the given checkpoint. If not defined, it reloads all heuristically matching layers. 
// TRAINING - "batch_size": 1, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'. - "eval_batch_size":1, + "batch_size": 8, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'. + "eval_batch_size": 8, "r": 7, // Number of decoder frames to predict per iteration. Set the initial values if gradual training is enabled. "gradual_training": [[0, 7, 4], [1, 5, 2]], //set gradual training steps [first_step, r, batch_size]. If it is null, gradual training is disabled. For Tacotron, you might need to reduce the 'batch_size' as you proceeed. "loss_masking": true, // enable / disable loss masking against the sequence padding. diff --git a/tests/tts_tests/test_vits.py b/tests/tts_tests/test_vits.py index 384234e5..81d2ebbd 100644 --- a/tests/tts_tests/test_vits.py +++ b/tests/tts_tests/test_vits.py @@ -7,7 +7,7 @@ from trainer.logging.tensorboard_logger import TensorboardLogger from tests import assertHasAttr, assertHasNotAttr, get_tests_data_path, get_tests_input_path, get_tests_output_path from TTS.config import load_config -from TTS.speaker_encoder.utils.generic_utils import setup_speaker_encoder_model +from TTS.encoder.utils.generic_utils import setup_speaker_encoder_model from TTS.tts.configs.vits_config import VitsConfig from TTS.tts.models.vits import Vits, VitsArgs, amp_to_db, db_to_amp, load_audio, spec_to_mel, wav_to_mel, wav_to_spec from TTS.tts.utils.speakers import SpeakerManager From 24b57f6a0e45b2b6b502f8bb679ff506494c2e47 Mon Sep 17 00:00:00 2001 From: WeberJulian Date: Wed, 16 Mar 2022 11:51:37 +0100 Subject: [PATCH 12/38] Fix typo workflow text (#1403) --- .github/workflows/text_tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/text_tests.yml b/.github/workflows/text_tests.yml index e06a25ad..66197e0b 100644 --- a/.github/workflows/text_tests.yml +++ b/.github/workflows/text_tests.yml @@ -1,4 +1,4 @@ -name: tts-tests +name: text-tests on: push: From f40b833659fa7ab7b99dfdfe54314674edc949c3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Wed, 16 Mar 2022 12:05:17 +0100 Subject: [PATCH 13/38] Add CITATION.cff (#1404) --- CITATION.cff | 20 ++++++++++++++++++++ MANIFEST.in | 1 + 2 files changed, 21 insertions(+) create mode 100644 CITATION.cff diff --git a/CITATION.cff b/CITATION.cff new file mode 100644 index 00000000..6b0c8f19 --- /dev/null +++ b/CITATION.cff @@ -0,0 +1,20 @@ +cff-version: 1.2.0 +message: "If you want to cite 🐸💬, feel free to use this (but only if you loved it 😊)" +title: "Coqui TTS" +abstract: "A deep learning toolkit for Text-to-Speech, battle-tested in research and production" +date-released: 2021-01-01 +authors: + - family-names: "Eren" + given-names: "Gölge" + - name: "The Coqui TTS Team" +version: 1.4 +doi: 10.5281/zenodo.6334862 +license: "MPL-2.0" +url: "https://www.coqui.ai" +repository-code: "https://github.com/coqui-ai/TTS" +keywords: + - machine learning + - deep learning + - artificial intelligence + - text to speech + - TTS \ No newline at end of file diff --git a/MANIFEST.in b/MANIFEST.in index 0d8b4b4c..82ecadcb 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,6 +1,7 @@ include README.md include LICENSE.txt include requirements.*.txt +include *.cff include requirements.txt include TTS/VERSION recursive-include TTS *.json From 690c96ed28fabafe587d0da63f1fbb5037a27083 Mon Sep 17 00:00:00 2001 From: WeberJulian Date: Wed, 16 Mar 2022 12:13:22 
+0100 Subject: [PATCH 14/38] Fix default phonemizer for ja and zh (#1399) --- TTS/tts/utils/text/phonemizers/__init__.py | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/TTS/tts/utils/text/phonemizers/__init__.py b/TTS/tts/utils/text/phonemizers/__init__.py index 5dc117c4..90a526a7 100644 --- a/TTS/tts/utils/text/phonemizers/__init__.py +++ b/TTS/tts/utils/text/phonemizers/__init__.py @@ -12,16 +12,9 @@ GRUUT_LANGS = list(Gruut.supported_languages()) # Dict setting default phonemizers for each language -DEF_LANG_TO_PHONEMIZER = { - "ja-jp": JA_JP_Phonemizer.name(), - "zh-cn": ZH_CN_Phonemizer.name(), -} - - # Add Gruut languages _ = [Gruut.name()] * len(GRUUT_LANGS) -_new_dict = dict(list(zip(GRUUT_LANGS, _))) -DEF_LANG_TO_PHONEMIZER.update(_new_dict) +DEF_LANG_TO_PHONEMIZER = dict(list(zip(GRUUT_LANGS, _))) # Add ESpeak languages and override any existing ones @@ -29,8 +22,10 @@ _ = [ESpeak.name()] * len(ESPEAK_LANGS) _new_dict = dict(list(zip(list(ESPEAK_LANGS), _))) DEF_LANG_TO_PHONEMIZER.update(_new_dict) +# Force default for some languages DEF_LANG_TO_PHONEMIZER["en"] = DEF_LANG_TO_PHONEMIZER["en-us"] - +DEF_LANG_TO_PHONEMIZER["ja-jp"] = JA_JP_Phonemizer.name() +DEF_LANG_TO_PHONEMIZER["zh-cn"] = ZH_CN_Phonemizer.name() def get_phonemizer_by_name(name: str, **kwargs) -> BasePhonemizer: """Initiate a phonemizer by name From 0870a4faa2fd36e95aafb9fcdf6b31d43b6fa6d4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Wed, 16 Mar 2022 12:13:55 +0100 Subject: [PATCH 15/38] Make style (#1405) --- TTS/bin/distribute.py | 2 +- TTS/bin/eval_encoder.py | 9 +-- TTS/bin/synthesize.py | 17 ++++- TTS/bin/train_encoder.py | 57 ++++++++++----- TTS/encoder/configs/base_encoder_config.py | 5 +- TTS/encoder/dataset.py | 8 +-- TTS/encoder/losses.py | 1 + TTS/encoder/models/base_encoder.py | 69 +++++++++++-------- TTS/encoder/models/resnet.py | 1 + TTS/encoder/utils/samplers.py | 20 ++++-- TTS/tts/models/base_tts.py | 6 +- TTS/tts/models/vits.py | 13 +++- TTS/tts/utils/speakers.py | 4 +- TTS/tts/utils/synthesis.py | 8 +-- TTS/utils/synthesizer.py | 42 ++++++----- recipes/ljspeech/hifigan/train_hifigan.py | 7 +- .../train_multiband_melgan.py | 7 +- .../tacotron2-DDC/train_tacotron_ddc.py | 7 +- recipes/ljspeech/univnet/train.py | 7 +- .../multilingual/vits_tts/train_vits_tts.py | 12 +--- tests/data_tests/test_samplers.py | 21 +++--- 21 files changed, 184 insertions(+), 139 deletions(-) diff --git a/TTS/bin/distribute.py b/TTS/bin/distribute.py index 97e2f0e3..b5552e32 100644 --- a/TTS/bin/distribute.py +++ b/TTS/bin/distribute.py @@ -35,7 +35,7 @@ def main(): command += unargs command.append("") - # run processes + # run a processes per GPU processes = [] for i in range(num_gpus): my_env = os.environ.copy() diff --git a/TTS/bin/eval_encoder.py b/TTS/bin/eval_encoder.py index a03bfd82..de9e5865 100644 --- a/TTS/bin/eval_encoder.py +++ b/TTS/bin/eval_encoder.py @@ -1,17 +1,18 @@ import argparse -import torch from argparse import RawTextHelpFormatter +import torch from tqdm import tqdm from TTS.config import load_config from TTS.tts.datasets import load_tts_samples from TTS.tts.utils.speakers import SpeakerManager + def compute_encoder_accuracy(dataset_items, encoder_manager): class_name_key = encoder_manager.speaker_encoder_config.class_name_key - map_classid_to_classname = getattr(encoder_manager.speaker_encoder_config, 'map_classid_to_classname', None) + map_classid_to_classname = getattr(encoder_manager.speaker_encoder_config, "map_classid_to_classname", None) 
class_acc_dict = {} @@ -43,11 +44,11 @@ def compute_encoder_accuracy(dataset_items, encoder_manager): acc_avg = 0 for key, values in class_acc_dict.items(): - acc = sum(values)/len(values) + acc = sum(values) / len(values) print("Class", key, "Accuracy:", acc) acc_avg += acc - print("Average Accuracy:", acc_avg/len(class_acc_dict)) + print("Average Accuracy:", acc_avg / len(class_acc_dict)) if __name__ == "__main__": diff --git a/TTS/bin/synthesize.py b/TTS/bin/synthesize.py index fe31c510..8b3f53db 100755 --- a/TTS/bin/synthesize.py +++ b/TTS/bin/synthesize.py @@ -210,7 +210,13 @@ If you don't specify any models, then it uses LJSpeech based English model. args = parser.parse_args() # print the description if either text or list_models is not set - if not args.text and not args.list_models and not args.list_speaker_idxs and not args.list_language_idxs and not args.reference_wav: + if ( + not args.text + and not args.list_models + and not args.list_speaker_idxs + and not args.list_language_idxs + and not args.reference_wav + ): parser.parse_args(["-h"]) # load model manager @@ -296,7 +302,14 @@ If you don't specify any models, then it uses LJSpeech based English model. print(" > Text: {}".format(args.text)) # kick it - wav = synthesizer.tts(args.text, args.speaker_idx, args.language_idx, args.speaker_wav, reference_wav=args.reference_wav, reference_speaker_name=args.reference_speaker_idx) + wav = synthesizer.tts( + args.text, + args.speaker_idx, + args.language_idx, + args.speaker_wav, + reference_wav=args.reference_wav, + reference_speaker_name=args.reference_speaker_idx, + ) # save the results print(" > Saving output to {}".format(args.out_path)) diff --git a/TTS/bin/train_encoder.py b/TTS/bin/train_encoder.py index af3e6ec4..b8d38bac 100644 --- a/TTS/bin/train_encoder.py +++ b/TTS/bin/train_encoder.py @@ -9,6 +9,7 @@ import traceback import torch from torch.utils.data import DataLoader from trainer.torch import NoamLR +from trainer.trainer_utils import get_optimizer from TTS.encoder.dataset import EncoderDataset from TTS.encoder.utils.generic_utils import save_best_model, save_checkpoint, setup_speaker_encoder_model @@ -19,7 +20,6 @@ from TTS.tts.datasets import load_tts_samples from TTS.utils.audio import AudioProcessor from TTS.utils.generic_utils import count_parameters, remove_experiment_folder from TTS.utils.io import copy_model_files -from trainer.trainer_utils import get_optimizer from TTS.utils.training import check_update torch.backends.cudnn.enabled = True @@ -52,16 +52,21 @@ def setup_loader(ap: AudioProcessor, is_val: bool = False, verbose: bool = False sampler = PerfectBatchSampler( dataset.items, classes, - batch_size=num_classes_in_batch*num_utter_per_class, # total batch size + batch_size=num_classes_in_batch * num_utter_per_class, # total batch size num_classes_in_batch=num_classes_in_batch, num_gpus=1, shuffle=not is_val, - drop_last=True) + drop_last=True, + ) if len(classes) < num_classes_in_batch: if is_val: - raise RuntimeError(f"config.eval_num_classes_in_batch ({num_classes_in_batch}) need to be <= {len(classes)} (Number total of Classes in the Eval dataset) !") - raise RuntimeError(f"config.num_classes_in_batch ({num_classes_in_batch}) need to be <= {len(classes)} (Number total of Classes in the Train dataset) !") + raise RuntimeError( + f"config.eval_num_classes_in_batch ({num_classes_in_batch}) need to be <= {len(classes)} (Number total of Classes in the Eval dataset) !" 
+ ) + raise RuntimeError( + f"config.num_classes_in_batch ({num_classes_in_batch}) need to be <= {len(classes)} (Number total of Classes in the Train dataset) !" + ) # set the classes to avoid get wrong class_id when the number of training and eval classes are not equal if is_val: @@ -76,6 +81,7 @@ def setup_loader(ap: AudioProcessor, is_val: bool = False, verbose: bool = False return loader, classes, dataset.get_map_classid_to_classname() + def evaluation(model, criterion, data_loader, global_step): eval_loss = 0 for _, data in enumerate(data_loader): @@ -84,8 +90,12 @@ def evaluation(model, criterion, data_loader, global_step): inputs, labels = data # agroup samples of each class in the batch. perfect sampler produces [3,2,1,3,2,1] we need [3,3,2,2,1,1] - labels = torch.transpose(labels.view(c.eval_num_utter_per_class, c.eval_num_classes_in_batch), 0, 1).reshape(labels.shape) - inputs = torch.transpose(inputs.view(c.eval_num_utter_per_class, c.eval_num_classes_in_batch, -1), 0, 1).reshape(inputs.shape) + labels = torch.transpose( + labels.view(c.eval_num_utter_per_class, c.eval_num_classes_in_batch), 0, 1 + ).reshape(labels.shape) + inputs = torch.transpose( + inputs.view(c.eval_num_utter_per_class, c.eval_num_classes_in_batch, -1), 0, 1 + ).reshape(inputs.shape) # dispatch data to GPU if use_cuda: @@ -96,20 +106,23 @@ def evaluation(model, criterion, data_loader, global_step): outputs = model(inputs) # loss computation - loss = criterion(outputs.view(c.eval_num_classes_in_batch, outputs.shape[0] // c.eval_num_classes_in_batch, -1), labels) + loss = criterion( + outputs.view(c.eval_num_classes_in_batch, outputs.shape[0] // c.eval_num_classes_in_batch, -1), labels + ) eval_loss += loss.item() - eval_avg_loss = eval_loss/len(data_loader) + eval_avg_loss = eval_loss / len(data_loader) # save stats dashboard_logger.eval_stats(global_step, {"loss": eval_avg_loss}) # plot the last batch in the evaluation figures = { - "UMAP Plot": plot_embeddings(outputs.detach().cpu().numpy(), c.num_classes_in_batch), + "UMAP Plot": plot_embeddings(outputs.detach().cpu().numpy(), c.num_classes_in_batch), } dashboard_logger.eval_figures(global_step, figures) return eval_avg_loss + def train(model, optimizer, scheduler, criterion, data_loader, eval_data_loader, global_step): model.train() best_loss = float("inf") @@ -124,8 +137,12 @@ def train(model, optimizer, scheduler, criterion, data_loader, eval_data_loader, # setup input data inputs, labels = data # agroup samples of each class in the batch. 
perfect sampler produces [3,2,1,3,2,1] we need [3,3,2,2,1,1] - labels = torch.transpose(labels.view(c.num_utter_per_class, c.num_classes_in_batch), 0, 1).reshape(labels.shape) - inputs = torch.transpose(inputs.view(c.num_utter_per_class, c.num_classes_in_batch, -1), 0, 1).reshape(inputs.shape) + labels = torch.transpose(labels.view(c.num_utter_per_class, c.num_classes_in_batch), 0, 1).reshape( + labels.shape + ) + inputs = torch.transpose(inputs.view(c.num_utter_per_class, c.num_classes_in_batch, -1), 0, 1).reshape( + inputs.shape + ) # ToDo: move it to a unit test # labels_converted = torch.transpose(labels.view(c.num_utter_per_class, c.num_classes_in_batch), 0, 1).reshape(labels.shape) # inputs_converted = torch.transpose(inputs.view(c.num_utter_per_class, c.num_classes_in_batch, -1), 0, 1).reshape(inputs.shape) @@ -157,7 +174,9 @@ def train(model, optimizer, scheduler, criterion, data_loader, eval_data_loader, outputs = model(inputs) # loss computation - loss = criterion(outputs.view(c.num_classes_in_batch, outputs.shape[0] // c.num_classes_in_batch, -1), labels) + loss = criterion( + outputs.view(c.num_classes_in_batch, outputs.shape[0] // c.num_classes_in_batch, -1), labels + ) loss.backward() grad_norm, _ = check_update(model, c.grad_clip) optimizer.step() @@ -211,7 +230,7 @@ def train(model, optimizer, scheduler, criterion, data_loader, eval_data_loader, print( ">>> Epoch:{} AvgLoss: {:.5f} GradNorm:{:.5f} " "EpochTime:{:.2f} AvGLoaderTime:{:.2f} ".format( - epoch, tot_loss/len(data_loader), grad_norm, epoch_time, avg_loader_time + epoch, tot_loss / len(data_loader), grad_norm, epoch_time, avg_loader_time ), flush=True, ) @@ -222,10 +241,8 @@ def train(model, optimizer, scheduler, criterion, data_loader, eval_data_loader, print("\n\n") print("--> EVAL PERFORMANCE") print( - " | > Epoch:{} AvgLoss: {:.5f} ".format( - epoch, eval_loss - ), - flush=True, + " | > Epoch:{} AvgLoss: {:.5f} ".format(epoch, eval_loss), + flush=True, ) # save the best checkpoint best_loss = save_best_model(model, optimizer, criterion, eval_loss, best_loss, OUT_PATH, global_step, epoch) @@ -262,7 +279,9 @@ def main(args): # pylint: disable=redefined-outer-name copy_model_files(c, OUT_PATH) if args.restore_path: - criterion, args.restore_step = model.load_checkpoint(c, args.restore_path, eval=False, use_cuda=use_cuda, criterion=criterion) + criterion, args.restore_step = model.load_checkpoint( + c, args.restore_path, eval=False, use_cuda=use_cuda, criterion=criterion + ) print(" > Model restored from step %d" % args.restore_step, flush=True) else: args.restore_step = 0 diff --git a/TTS/encoder/configs/base_encoder_config.py b/TTS/encoder/configs/base_encoder_config.py index 02b88d66..ebbaa045 100644 --- a/TTS/encoder/configs/base_encoder_config.py +++ b/TTS/encoder/configs/base_encoder_config.py @@ -33,10 +33,7 @@ class BaseEncoderConfig(BaseTrainingConfig): grad_clip: float = 3.0 lr: float = 0.0001 optimizer: str = "radam" - optimizer_params: Dict = field(default_factory=lambda: { - "betas": [0.9, 0.999], - "weight_decay": 0 - }) + optimizer_params: Dict = field(default_factory=lambda: {"betas": [0.9, 0.999], "weight_decay": 0}) lr_decay: bool = False warmup_steps: int = 4000 diff --git a/TTS/encoder/dataset.py b/TTS/encoder/dataset.py index a4db021b..582b1fe9 100644 --- a/TTS/encoder/dataset.py +++ b/TTS/encoder/dataset.py @@ -5,6 +5,7 @@ from torch.utils.data import Dataset from TTS.encoder.utils.generic_utils import AugmentWAV + class EncoderDataset(Dataset): def __init__( self, @@ -57,7 +58,6 @@ class 
EncoderDataset(Dataset): print(f" | > Num Classes: {len(self.classes)}") print(f" | > Classes: {self.classes}") - def load_wav(self, filename): audio = self.ap.load_wav(filename, sr=self.ap.sample_rate) return audio @@ -75,9 +75,7 @@ class EncoderDataset(Dataset): ] # skip classes with number of samples >= self.num_utter_per_class - class_to_utters = { - k: v for (k, v) in class_to_utters.items() if len(v) >= self.num_utter_per_class - } + class_to_utters = {k: v for (k, v) in class_to_utters.items() if len(v) >= self.num_utter_per_class} classes = list(class_to_utters.keys()) classes.sort() @@ -105,11 +103,11 @@ class EncoderDataset(Dataset): def get_class_list(self): return self.classes + def set_classes(self, classes): self.classes = classes self.classname_to_classid = {key: i for i, key in enumerate(self.classes)} - def get_map_classid_to_classname(self): return dict((c_id, c_n) for c_n, c_id in self.classname_to_classid.items()) diff --git a/TTS/encoder/losses.py b/TTS/encoder/losses.py index de65d8d6..5b5aa0fc 100644 --- a/TTS/encoder/losses.py +++ b/TTS/encoder/losses.py @@ -195,6 +195,7 @@ class SoftmaxLoss(nn.Module): class_id = torch.argmax(activations) return class_id + class SoftmaxAngleProtoLoss(nn.Module): """ Implementation of the Softmax AnglePrototypical loss as defined in https://arxiv.org/abs/2009.14153 diff --git a/TTS/encoder/models/base_encoder.py b/TTS/encoder/models/base_encoder.py index c35c636d..ac7d7dd5 100644 --- a/TTS/encoder/models/base_encoder.py +++ b/TTS/encoder/models/base_encoder.py @@ -1,12 +1,13 @@ +import numpy as np import torch import torchaudio -import numpy as np +from coqpit import Coqpit from torch import nn -from TTS.utils.io import load_fsspec from TTS.encoder.losses import AngleProtoLoss, GE2ELoss, SoftmaxAngleProtoLoss from TTS.utils.generic_utils import set_init_dict -from coqpit import Coqpit +from TTS.utils.io import load_fsspec + class PreEmphasis(nn.Module): def __init__(self, coefficient=0.97): @@ -20,6 +21,7 @@ class PreEmphasis(nn.Module): x = torch.nn.functional.pad(x.unsqueeze(1), (1, 0), "reflect") return torch.nn.functional.conv1d(x, self.filter).squeeze(1) + class BaseEncoder(nn.Module): """Base `encoder` class. Every new `encoder` model must inherit this. 
@@ -32,31 +34,31 @@ class BaseEncoder(nn.Module): def get_torch_mel_spectrogram_class(self, audio_config): return torch.nn.Sequential( - PreEmphasis(audio_config["preemphasis"]), - # TorchSTFT( - # n_fft=audio_config["fft_size"], - # hop_length=audio_config["hop_length"], - # win_length=audio_config["win_length"], - # sample_rate=audio_config["sample_rate"], - # window="hamming_window", - # mel_fmin=0.0, - # mel_fmax=None, - # use_htk=True, - # do_amp_to_db=False, - # n_mels=audio_config["num_mels"], - # power=2.0, - # use_mel=True, - # mel_norm=None, - # ) - torchaudio.transforms.MelSpectrogram( - sample_rate=audio_config["sample_rate"], - n_fft=audio_config["fft_size"], - win_length=audio_config["win_length"], - hop_length=audio_config["hop_length"], - window_fn=torch.hamming_window, - n_mels=audio_config["num_mels"], - ) - ) + PreEmphasis(audio_config["preemphasis"]), + # TorchSTFT( + # n_fft=audio_config["fft_size"], + # hop_length=audio_config["hop_length"], + # win_length=audio_config["win_length"], + # sample_rate=audio_config["sample_rate"], + # window="hamming_window", + # mel_fmin=0.0, + # mel_fmax=None, + # use_htk=True, + # do_amp_to_db=False, + # n_mels=audio_config["num_mels"], + # power=2.0, + # use_mel=True, + # mel_norm=None, + # ) + torchaudio.transforms.MelSpectrogram( + sample_rate=audio_config["sample_rate"], + n_fft=audio_config["fft_size"], + win_length=audio_config["win_length"], + hop_length=audio_config["hop_length"], + window_fn=torch.hamming_window, + n_mels=audio_config["num_mels"], + ), + ) @torch.no_grad() def inference(self, x, l2_norm=True): @@ -104,7 +106,9 @@ class BaseEncoder(nn.Module): raise Exception("The %s not is a loss supported" % c.loss) return criterion - def load_checkpoint(self, config: Coqpit, checkpoint_path: str, eval: bool = False, use_cuda: bool = False, criterion=None): + def load_checkpoint( + self, config: Coqpit, checkpoint_path: str, eval: bool = False, use_cuda: bool = False, criterion=None + ): state = load_fsspec(checkpoint_path, map_location=torch.device("cpu")) try: self.load_state_dict(state["model"]) @@ -127,7 +131,12 @@ class BaseEncoder(nn.Module): print(" > Criterion load ignored because of:", error) # instance and load the criterion for the encoder classifier in inference time - if eval and criterion is None and "criterion" in state and getattr(config, 'map_classid_to_classname', None) is not None: + if ( + eval + and criterion is None + and "criterion" in state + and getattr(config, "map_classid_to_classname", None) is not None + ): criterion = self.get_criterion(config, len(config.map_classid_to_classname)) criterion.load_state_dict(state["criterion"]) diff --git a/TTS/encoder/models/resnet.py b/TTS/encoder/models/resnet.py index c4ba9537..84e9967f 100644 --- a/TTS/encoder/models/resnet.py +++ b/TTS/encoder/models/resnet.py @@ -4,6 +4,7 @@ from torch import nn # from TTS.utils.audio import TorchSTFT from TTS.encoder.models.base_encoder import BaseEncoder + class SELayer(nn.Module): def __init__(self, channel, reduction=8): super(SELayer, self).__init__() diff --git a/TTS/encoder/utils/samplers.py b/TTS/encoder/utils/samplers.py index 947f5da0..08256b34 100644 --- a/TTS/encoder/utils/samplers.py +++ b/TTS/encoder/utils/samplers.py @@ -1,4 +1,5 @@ import random + from torch.utils.data.sampler import Sampler, SubsetRandomSampler @@ -34,10 +35,21 @@ class PerfectBatchSampler(Sampler): drop_last (bool): if True, drops last incomplete batch. 
""" - def __init__(self, dataset_items, classes, batch_size, num_classes_in_batch, num_gpus=1, shuffle=True, drop_last=False, label_key="class_name"): + def __init__( + self, + dataset_items, + classes, + batch_size, + num_classes_in_batch, + num_gpus=1, + shuffle=True, + drop_last=False, + label_key="class_name", + ): super().__init__(dataset_items) - assert batch_size % (num_classes_in_batch * num_gpus) == 0, ( - 'Batch size must be divisible by number of classes times the number of data parallel devices (if enabled).') + assert ( + batch_size % (num_classes_in_batch * num_gpus) == 0 + ), "Batch size must be divisible by number of classes times the number of data parallel devices (if enabled)." label_indices = {} for idx, item in enumerate(dataset_items): @@ -93,7 +105,7 @@ class PerfectBatchSampler(Sampler): if groups % self._dp_devices == 0: yield batch else: - batch = batch[:(groups // self._dp_devices) * self._dp_devices * self._num_classes_in_batch] + batch = batch[: (groups // self._dp_devices) * self._dp_devices * self._num_classes_in_batch] if len(batch) > 0: yield batch diff --git a/TTS/tts/models/base_tts.py b/TTS/tts/models/base_tts.py index 222f8519..945c031f 100644 --- a/TTS/tts/models/base_tts.py +++ b/TTS/tts/models/base_tts.py @@ -7,15 +7,15 @@ import torch.distributed as dist from coqpit import Coqpit from torch import nn from torch.utils.data import DataLoader +from torch.utils.data.sampler import WeightedRandomSampler from trainer.torch import DistributedSampler, DistributedSamplerWrapper from TTS.model import BaseTrainerModel from TTS.tts.datasets.dataset import TTSDataset from TTS.tts.utils.languages import LanguageManager, get_language_balancer_weights -from TTS.tts.utils.speakers import SpeakerManager, get_speaker_manager, get_speaker_balancer_weights +from TTS.tts.utils.speakers import SpeakerManager, get_speaker_balancer_weights, get_speaker_manager from TTS.tts.utils.synthesis import synthesis from TTS.tts.utils.visual import plot_alignment, plot_spectrogram -from torch.utils.data.sampler import WeightedRandomSampler # pylint: skip-file @@ -258,7 +258,7 @@ class BaseTTS(BaseTrainerModel): # sampler for DDP if sampler is None: sampler = DistributedSampler(dataset) if num_gpus > 1 else None - else: # If a sampler is already defined use this sampler and DDP sampler together + else: # If a sampler is already defined use this sampler and DDP sampler together sampler = DistributedSamplerWrapper(sampler) if num_gpus > 1 else sampler return sampler diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py index 818b9a54..afadbadd 100644 --- a/TTS/tts/models/vits.py +++ b/TTS/tts/models/vits.py @@ -994,8 +994,11 @@ class Vits(BaseTTS): outputs = {"model_outputs": o, "alignments": attn.squeeze(1), "z": z, "z_p": z_p, "m_p": m_p, "logs_p": logs_p} return outputs + @torch.no_grad() - def inference_voice_conversion(self, reference_wav, speaker_id=None, d_vector=None, reference_speaker_id=None, reference_d_vector=None): + def inference_voice_conversion( + self, reference_wav, speaker_id=None, d_vector=None, reference_speaker_id=None, reference_d_vector=None + ): """Inference for voice conversion Args: @@ -1006,7 +1009,13 @@ class Vits(BaseTTS): reference_d_vector (Tensor): d_vector embedding of the reference_wav speaker. 
Tensor of shape `[B, C]` """ # compute spectrograms - y = wav_to_spec(reference_wav, self.config.audio.fft_size, self.config.audio.hop_length, self.config.audio.win_length, center=False).transpose(1, 2) + y = wav_to_spec( + reference_wav, + self.config.audio.fft_size, + self.config.audio.hop_length, + self.config.audio.win_length, + center=False, + ).transpose(1, 2) y_lengths = torch.tensor([y.size(-1)]).to(y.device) speaker_cond_src = reference_speaker_id if reference_speaker_id is not None else reference_d_vector speaker_cond_tgt = speaker_id if speaker_id is not None else d_vector diff --git a/TTS/tts/utils/speakers.py b/TTS/tts/utils/speakers.py index 1a5da94a..0227412d 100644 --- a/TTS/tts/utils/speakers.py +++ b/TTS/tts/utils/speakers.py @@ -269,7 +269,9 @@ class SpeakerManager: """ self.speaker_encoder_config = load_config(config_path) self.speaker_encoder = setup_speaker_encoder_model(self.speaker_encoder_config) - self.speaker_encoder_criterion = self.speaker_encoder.load_checkpoint(self.speaker_encoder_config, model_path, eval=True, use_cuda=self.use_cuda) + self.speaker_encoder_criterion = self.speaker_encoder.load_checkpoint( + self.speaker_encoder_config, model_path, eval=True, use_cuda=self.use_cuda + ) self.speaker_encoder_ap = AudioProcessor(**self.speaker_encoder_config.audio) def compute_d_vector_from_clip(self, wav_file: Union[str, List[str]]) -> list: diff --git a/TTS/tts/utils/synthesis.py b/TTS/tts/utils/synthesis.py index 582fb4f1..f9e13251 100644 --- a/TTS/tts/utils/synthesis.py +++ b/TTS/tts/utils/synthesis.py @@ -206,6 +206,7 @@ def synthesis( } return return_dict + def transfer_voice( model, CONFIG, @@ -269,12 +270,7 @@ def transfer_voice( _func = model.module.inference_voice_conversion else: _func = model.inference_voice_conversion - model_outputs = _func( - reference_wav, - speaker_id, - d_vector, - reference_speaker_id, - reference_d_vector) + model_outputs = _func(reference_wav, speaker_id, d_vector, reference_speaker_id, reference_d_vector) # convert outputs to numpy # plot results diff --git a/TTS/utils/synthesizer.py b/TTS/utils/synthesizer.py index 687794b4..2ea23adb 100644 --- a/TTS/utils/synthesizer.py +++ b/TTS/utils/synthesizer.py @@ -119,7 +119,7 @@ class Synthesizer(object): if use_cuda: self.tts_model.cuda() - if self.encoder_checkpoint and hasattr(self.tts_model, "speaker_manager"): + if self.encoder_checkpoint and hasattr(self.tts_model, "speaker_manager"): self.tts_model.speaker_manager.init_speaker_encoder(self.encoder_checkpoint, self.encoder_config) def _set_speaker_encoder_paths_from_tts_config(self): @@ -199,8 +199,8 @@ class Synthesizer(object): if not text and not reference_wav: raise ValueError( - "You need to define either `text` (for sythesis) or a `reference_wav` (for voice conversion) to use the Coqui TTS API." - ) + "You need to define either `text` (for sythesis) or a `reference_wav` (for voice conversion) to use the Coqui TTS API." + ) if text: sens = self.split_into_sentences(text) @@ -214,7 +214,9 @@ class Synthesizer(object): if speaker_name and isinstance(speaker_name, str): if self.tts_config.use_d_vector_file: # get the average speaker embedding from the saved d_vectors. 
- speaker_embedding = self.tts_model.speaker_manager.get_mean_d_vector(speaker_name, num_samples=None, randomize=False) + speaker_embedding = self.tts_model.speaker_manager.get_mean_d_vector( + speaker_name, num_samples=None, randomize=False + ) speaker_embedding = np.array(speaker_embedding)[None, :] # [1 x embedding_dim] else: # get speaker idx from the speaker name @@ -315,25 +317,31 @@ class Synthesizer(object): if reference_speaker_name and isinstance(reference_speaker_name, str): if self.tts_config.use_d_vector_file: # get the speaker embedding from the saved d_vectors. - reference_speaker_embedding = self.tts_model.speaker_manager.get_d_vectors_by_speaker(reference_speaker_name)[0] - reference_speaker_embedding = np.array(reference_speaker_embedding)[None, :] # [1 x embedding_dim] + reference_speaker_embedding = self.tts_model.speaker_manager.get_d_vectors_by_speaker( + reference_speaker_name + )[0] + reference_speaker_embedding = np.array(reference_speaker_embedding)[ + None, : + ] # [1 x embedding_dim] else: # get speaker idx from the speaker name reference_speaker_id = self.tts_model.speaker_manager.speaker_ids[reference_speaker_name] else: - reference_speaker_embedding = self.tts_model.speaker_manager.compute_d_vector_from_clip(reference_wav) + reference_speaker_embedding = self.tts_model.speaker_manager.compute_d_vector_from_clip( + reference_wav + ) outputs = transfer_voice( - model=self.tts_model, - CONFIG=self.tts_config, - use_cuda=self.use_cuda, - reference_wav=reference_wav, - speaker_id=speaker_id, - d_vector=speaker_embedding, - use_griffin_lim=use_gl, - reference_speaker_id=reference_speaker_id, - reference_d_vector=reference_speaker_embedding - ) + model=self.tts_model, + CONFIG=self.tts_config, + use_cuda=self.use_cuda, + reference_wav=reference_wav, + speaker_id=speaker_id, + d_vector=speaker_embedding, + use_griffin_lim=use_gl, + reference_speaker_id=reference_speaker_id, + reference_d_vector=reference_speaker_embedding, + ) waveform = outputs if not use_gl: mel_postnet_spec = outputs[0].detach().cpu().numpy() diff --git a/recipes/ljspeech/hifigan/train_hifigan.py b/recipes/ljspeech/hifigan/train_hifigan.py index 6a739009..b4cbae63 100644 --- a/recipes/ljspeech/hifigan/train_hifigan.py +++ b/recipes/ljspeech/hifigan/train_hifigan.py @@ -41,11 +41,6 @@ model = GAN(config, ap) # init the trainer and 🚀 trainer = Trainer( - TrainerArgs(), - config, - output_path, - model=model, - train_samples=train_samples, - eval_samples=eval_samples + TrainerArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples ) trainer.fit() diff --git a/recipes/ljspeech/multiband_melgan/train_multiband_melgan.py b/recipes/ljspeech/multiband_melgan/train_multiband_melgan.py index d5ca9a76..225f5a30 100644 --- a/recipes/ljspeech/multiband_melgan/train_multiband_melgan.py +++ b/recipes/ljspeech/multiband_melgan/train_multiband_melgan.py @@ -41,11 +41,6 @@ model = GAN(config, ap) # init the trainer and 🚀 trainer = Trainer( - TrainerArgs(), - config, - output_path, - model=model, - train_samples=train_samples, - eval_samples=eval_samples + TrainerArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples ) trainer.fit() diff --git a/recipes/ljspeech/tacotron2-DDC/train_tacotron_ddc.py b/recipes/ljspeech/tacotron2-DDC/train_tacotron_ddc.py index a0ff8b02..04e6150e 100644 --- a/recipes/ljspeech/tacotron2-DDC/train_tacotron_ddc.py +++ b/recipes/ljspeech/tacotron2-DDC/train_tacotron_ddc.py @@ -84,11 +84,6 @@ model = 
Tacotron2(config, ap, tokenizer, speaker_manager=None) # init the trainer and 🚀 trainer = Trainer( - TrainerArgs(), - config, - output_path, - model=model, - train_samples=train_samples, - eval_samples=eval_samples + TrainerArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples ) trainer.fit() diff --git a/recipes/ljspeech/univnet/train.py b/recipes/ljspeech/univnet/train.py index 592b9a76..81d2b889 100644 --- a/recipes/ljspeech/univnet/train.py +++ b/recipes/ljspeech/univnet/train.py @@ -40,11 +40,6 @@ model = GAN(config, ap) # init the trainer and 🚀 trainer = Trainer( - TrainerArgs(), - config, - output_path, - model=model, - train_samples=train_samples, - eval_samples=eval_samples + TrainerArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples ) trainer.fit() diff --git a/recipes/multilingual/vits_tts/train_vits_tts.py b/recipes/multilingual/vits_tts/train_vits_tts.py index c4ed0dda..26eb46be 100644 --- a/recipes/multilingual/vits_tts/train_vits_tts.py +++ b/recipes/multilingual/vits_tts/train_vits_tts.py @@ -6,12 +6,11 @@ from trainer import Trainer, TrainerArgs from TTS.config.shared_configs import BaseAudioConfig from TTS.tts.configs.shared_configs import BaseDatasetConfig from TTS.tts.configs.vits_config import VitsConfig -from TTS.tts.models.vits import CharactersConfig from TTS.tts.datasets import load_tts_samples -from TTS.tts.models.vits import Vits, VitsArgs +from TTS.tts.models.vits import CharactersConfig, Vits, VitsArgs from TTS.tts.utils.languages import LanguageManager -from TTS.tts.utils.text.tokenizer import TTSTokenizer from TTS.tts.utils.speakers import SpeakerManager +from TTS.tts.utils.text.tokenizer import TTSTokenizer from TTS.utils.audio import AudioProcessor output_path = os.path.dirname(os.path.abspath(__file__)) @@ -131,11 +130,6 @@ model = Vits(config, ap, tokenizer, speaker_manager, language_manager) # init the trainer and 🚀 trainer = Trainer( - TrainerArgs(), - config, - output_path, - model=model, - train_samples=train_samples, - eval_samples=eval_samples + TrainerArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples ) trainer.fit() diff --git a/tests/data_tests/test_samplers.py b/tests/data_tests/test_samplers.py index c888c629..42f1bfd5 100644 --- a/tests/data_tests/test_samplers.py +++ b/tests/data_tests/test_samplers.py @@ -1,14 +1,13 @@ import functools - import unittest import torch from TTS.config.shared_configs import BaseDatasetConfig +from TTS.encoder.utils.samplers import PerfectBatchSampler from TTS.tts.datasets import load_tts_samples from TTS.tts.utils.languages import get_language_balancer_weights from TTS.tts.utils.speakers import get_speaker_balancer_weights -from TTS.encoder.utils.samplers import PerfectBatchSampler # Fixing random state to avoid random fails torch.manual_seed(0) @@ -60,7 +59,9 @@ class TestSamplers(unittest.TestCase): assert not is_balanced(en, pt), "Random sampler is supposed to be unbalanced" def test_language_weighted_random_sampler(self): # pylint: disable=no-self-use - weighted_sampler = torch.utils.data.sampler.WeightedRandomSampler(get_language_balancer_weights(train_samples), len(train_samples)) + weighted_sampler = torch.utils.data.sampler.WeightedRandomSampler( + get_language_balancer_weights(train_samples), len(train_samples) + ) ids = functools.reduce(lambda a, b: a + b, [list(weighted_sampler) for i in range(100)]) en, pt = 0, 0 for index in ids: @@ -73,7 +74,9 @@ class 
TestSamplers(unittest.TestCase): def test_speaker_weighted_random_sampler(self): # pylint: disable=no-self-use - weighted_sampler = torch.utils.data.sampler.WeightedRandomSampler(get_speaker_balancer_weights(train_samples), len(train_samples)) + weighted_sampler = torch.utils.data.sampler.WeightedRandomSampler( + get_speaker_balancer_weights(train_samples), len(train_samples) + ) ids = functools.reduce(lambda a, b: a + b, [list(weighted_sampler) for i in range(100)]) spk1, spk2 = 0, 0 for index in ids: @@ -92,11 +95,12 @@ class TestSamplers(unittest.TestCase): sampler = PerfectBatchSampler( train_samples, classes, - batch_size=2 * 3, # total batch size + batch_size=2 * 3, # total batch size num_classes_in_batch=2, label_key="speaker_name", shuffle=False, - drop_last=True) + drop_last=True, + ) batchs = functools.reduce(lambda a, b: a + b, [list(sampler) for i in range(100)]) for batch in batchs: spk1, spk2 = 0, 0 @@ -116,11 +120,12 @@ class TestSamplers(unittest.TestCase): sampler = PerfectBatchSampler( train_samples, classes, - batch_size=2 * 3, # total batch size + batch_size=2 * 3, # total batch size num_classes_in_batch=2, label_key="speaker_name", shuffle=True, - drop_last=False) + drop_last=False, + ) batchs = functools.reduce(lambda a, b: a + b, [list(sampler) for i in range(100)]) for batch in batchs: spk1, spk2 = 0, 0 From fd56fabb21db87059c27c0d772e6948ffc129a14 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Wed, 16 Mar 2022 12:38:27 +0100 Subject: [PATCH 16/38] Fix #1380 (#1409) --- TTS/tts/datasets/formatters.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/TTS/tts/datasets/formatters.py b/TTS/tts/datasets/formatters.py index 3e963d0c..c13fcdb8 100644 --- a/TTS/tts/datasets/formatters.py +++ b/TTS/tts/datasets/formatters.py @@ -246,7 +246,7 @@ def libri_tts(root_path, meta_files=None, ignored_speakers=None): continue items.append({"text": text, "audio_file": wav_file, "speaker_name": f"LTTS_{speaker_name}"}) for item in items: - assert os.path.exists(item[1]), f" [!] wav files don't exist - {item[1]}" + assert os.path.exists(item["audio_file"]), f" [!] 
wav files don't exist - {item['audio_file']}" return items From c7f9ec07c86031126e6eddcbbb45bd906d0425e1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Fri, 18 Mar 2022 16:47:50 +0100 Subject: [PATCH 17/38] Hinge Gruut version to 2.2.3 (#1419) --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index e3871874..c3599220 100644 --- a/requirements.txt +++ b/requirements.txt @@ -33,6 +33,6 @@ pypinyin mecab-python3==1.0.3 unidic-lite==1.0.8 # gruut+supported langs -gruut[cs,de,es,fr,it,nl,pt,ru,sv]~=2.0.0 +gruut[cs,de,es,fr,it,nl,pt,ru,sv]==2.2.3 # others webrtcvad # for VAD From 2e6e8f651d1a8330f8bf6e5b19307d838f0708e1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Fri, 18 Mar 2022 16:48:24 +0100 Subject: [PATCH 18/38] Update CheckSpectrograms notebook (#1418) --- .../dataset_analysis/CheckSpectrograms.ipynb | 222 ++++++++++-------- 1 file changed, 126 insertions(+), 96 deletions(-) diff --git a/notebooks/dataset_analysis/CheckSpectrograms.ipynb b/notebooks/dataset_analysis/CheckSpectrograms.ipynb index 74ca51ab..47e5c4cf 100644 --- a/notebooks/dataset_analysis/CheckSpectrograms.ipynb +++ b/notebooks/dataset_analysis/CheckSpectrograms.ipynb @@ -3,6 +3,10 @@ { "cell_type": "code", "execution_count": null, + "metadata": { + "Collapsed": "false" + }, + "outputs": [], "source": [ "%matplotlib inline\n", "\n", @@ -12,21 +16,51 @@ "\n", "import IPython.display as ipd\n", "import glob" - ], - "outputs": [], - "metadata": { - "Collapsed": "false" - } + ] }, { "cell_type": "code", "execution_count": null, + "metadata": { + "Collapsed": "false" + }, + "outputs": [], "source": [ - "config_path = \"/home/erogol/gdrive/Projects/TTS/recipes/ljspeech/align_tts/config_transformer2.json\"\n", - "data_path = \"/home/erogol/gdrive/Datasets/LJSpeech-1.1/\"\n", - "\n", - "file_paths = glob.glob(data_path + \"/**/*.wav\", recursive=True)\n", - "CONFIG = load_config(config_path)\n", + "from TTS.config.shared_configs import BaseAudioConfig\n", + "CONFIG = BaseAudioConfig()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## ✍️ Set these values " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "data_path = \"/root/wav48_silence_trimmed/\"\n", + "file_ext = \".flac\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Read audio files" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "file_paths = glob.glob(data_path + f\"/**/*{file_ext}\", recursive=True)\n", "\n", "# Change this to the index of the desired file listed below\n", "sample_file_index = 10\n", @@ -35,44 +69,45 @@ "\n", "print(\"File list, by index:\")\n", "dict(enumerate(file_paths))" - ], - "outputs": [], - "metadata": { - "Collapsed": "false" - } + ] }, { "cell_type": "markdown", + "metadata": { + "Collapsed": "false" + }, "source": [ - "### Setup Audio Processor\n", + "## ✍️ Set Audio Processor\n", "Play with the AP parameters until you find a good fit with the synthesis speech below.\n", "\n", "The default values are loaded from your config.json file, so you only need to\n", "uncomment and modify values below that you'd like to tune." 
- ], - "metadata": { - "Collapsed": "false" - } + ] }, { "cell_type": "code", "execution_count": null, + "metadata": { + "Collapsed": "false" + }, + "outputs": [], "source": [ "tune_params={\n", - "# 'audio_processor': 'audio',\n", - "# 'num_mels': 80, # In general, you don't need to change this. \n", - "# 'fft_size': 1024, # In general, you don't need to change this.\n", - "# 'sample_rate': 22050, # This must match the sample rate of the dataset.\n", - "# 'hop_length': 256, # In general, you don't need to change this.\n", - "# 'win_length': 1024, # In general, you don't need to change this.\n", - "# 'preemphasis': 0.98, # In general, 0 gives better voice recovery but makes training harder. If your model does not train, try 0.97 - 0.99.\n", - "# 'min_level_db': -100,\n", - "# 'ref_level_db': 0, # The base DB; increase until all background noise is removed in the spectrogram, then lower until you hear better speech below.\n", - "# 'power': 1.5, # Change this value and listen to the synthesized voice. 1.2 - 1.5 are resonable values.\n", - "# 'griffin_lim_iters': 60, # Quality does not improve for values > 60\n", - "# 'mel_fmin': 0.0, # Adjust this and check mel-spectrogram-based voice synthesis below.\n", - "# 'mel_fmax': 8000.0, # Adjust this and check mel-spectrogram-based voice synthesis below.\n", - "# 'do_trim_silence': True # If you dataset has some silience at the beginning or end, this trims it. Check the AP.load_wav() below,if it causes any difference for the loaded audio file.\n", + " 'num_mels': 80, # In general, you don't need to change this. \n", + " 'fft_size': 2400, # In general, you don't need to change this.\n", + " 'frame_length_ms': 50, \n", + " 'frame_shift_ms': 12.5,\n", + " 'sample_rate': 48000, # This must match the sample rate of the dataset.\n", + " 'hop_length': None, # In general, you don't need to change this.\n", + " 'win_length': 1024, # In general, you don't need to change this.\n", + " 'preemphasis': 0.98, # In general, 0 gives better voice recovery but makes training harder. If your model does not train, try 0.97 - 0.99.\n", + " 'min_level_db': -100,\n", + " 'ref_level_db': 0, # The base DB; increase until all background noise is removed in the spectrogram, then lower until you hear better speech below.\n", + " 'power': 1.5, # Change this value and listen to the synthesized voice. 1.2 - 1.5 are resonable values.\n", + " 'griffin_lim_iters': 60, # Quality does not improve for values > 60\n", + " 'mel_fmin': 0.0, # Adjust this and check mel-spectrogram-based voice synthesis below.\n", + " 'mel_fmax': 8000.0, # Adjust this and check mel-spectrogram-based voice synthesis below.\n", + " 'do_trim_silence': True # If you dataset has some silience at the beginning or end, this trims it. 
Check the AP.load_wav() below,if it causes any difference for the loaded audio file.\n", "}\n", "\n", "# These options have to be forced off in order to avoid errors about the \n", @@ -86,59 +121,57 @@ "}\n", "\n", "# Override select parts of loaded config with parameters above\n", - "tuned_config = CONFIG.audio.copy()\n", + "tuned_config = CONFIG.copy()\n", "tuned_config.update(reset)\n", "tuned_config.update(tune_params)\n", "\n", "AP = AudioProcessor(**tuned_config);" - ], - "outputs": [], - "metadata": { - "Collapsed": "false" - } + ] }, { "cell_type": "markdown", - "source": [ - "### Check audio loading " - ], "metadata": { "Collapsed": "false" - } + }, + "source": [ + "### Check audio loading " + ] }, { "cell_type": "code", "execution_count": null, + "metadata": { + "Collapsed": "false" + }, + "outputs": [], "source": [ "wav = AP.load_wav(SAMPLE_FILE_PATH)\n", "ipd.Audio(data=wav, rate=AP.sample_rate) " - ], - "outputs": [], - "metadata": { - "Collapsed": "false" - } + ] }, { "cell_type": "markdown", - "source": [ - "### Generate Mel-Spectrogram and Re-synthesis with GL" - ], "metadata": { "Collapsed": "false" - } + }, + "source": [ + "### Generate Mel-Spectrogram and Re-synthesis with GL" + ] }, { "cell_type": "code", "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "AP.power = 1.5" - ], - "outputs": [], - "metadata": {} + ] }, { "cell_type": "code", "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "mel = AP.melspectrogram(wav)\n", "print(\"Max:\", mel.max())\n", @@ -148,24 +181,24 @@ "\n", "wav_gen = AP.inv_melspectrogram(mel)\n", "ipd.Audio(wav_gen, rate=AP.sample_rate)" - ], - "outputs": [], - "metadata": { - "Collapsed": "false" - } + ] }, { "cell_type": "markdown", - "source": [ - "### Generate Linear-Spectrogram and Re-synthesis with GL" - ], "metadata": { "Collapsed": "false" - } + }, + "source": [ + "### Generate Linear-Spectrogram and Re-synthesis with GL" + ] }, { "cell_type": "code", "execution_count": null, + "metadata": { + "Collapsed": "false" + }, + "outputs": [], "source": [ "spec = AP.spectrogram(wav)\n", "print(\"Max:\", spec.max())\n", @@ -175,26 +208,26 @@ "\n", "wav_gen = AP.inv_spectrogram(spec)\n", "ipd.Audio(wav_gen, rate=AP.sample_rate)" - ], - "outputs": [], - "metadata": { - "Collapsed": "false" - } + ] }, { "cell_type": "markdown", + "metadata": { + "Collapsed": "false" + }, "source": [ "### Compare values for a certain parameter\n", "\n", "Optimize your parameters by comparing different values per parameter at a time." 
- ], - "metadata": { - "Collapsed": "false" - } + ] }, { "cell_type": "code", "execution_count": null, + "metadata": { + "Collapsed": "false" + }, + "outputs": [], "source": [ "from librosa import display\n", "from matplotlib import pylab as plt\n", @@ -234,39 +267,39 @@ " val = values[idx]\n", " print(\" > {} = {}\".format(attribute, val))\n", " IPython.display.display(IPython.display.Audio(wav_gen, rate=AP.sample_rate))" - ], - "outputs": [], - "metadata": { - "Collapsed": "false" - } + ] }, { "cell_type": "code", "execution_count": null, + "metadata": { + "Collapsed": "false" + }, + "outputs": [], "source": [ "compare_values(\"preemphasis\", [0, 0.5, 0.97, 0.98, 0.99])" - ], - "outputs": [], - "metadata": { - "Collapsed": "false" - } + ] }, { "cell_type": "code", "execution_count": null, - "source": [ - "compare_values(\"ref_level_db\", [2, 5, 10, 15, 20, 25, 30, 35, 40, 1000])" - ], - "outputs": [], "metadata": { "Collapsed": "false" - } + }, + "outputs": [], + "source": [ + "compare_values(\"ref_level_db\", [2, 5, 10, 15, 20, 25, 30, 35, 40, 1000])" + ] } ], "metadata": { + "interpreter": { + "hash": "27648abe09795c3a768a281b31f7524fcf66a207e733f8ecda3a4e1fd1059fb0" + }, "kernelspec": { - "name": "python3", - "display_name": "Python 3.8.5 64-bit ('torch': conda)" + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" }, "language_info": { "codemirror_mode": { @@ -278,12 +311,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.5" - }, - "interpreter": { - "hash": "27648abe09795c3a768a281b31f7524fcf66a207e733f8ecda3a4e1fd1059fb0" + "version": "3.9.5" } }, "nbformat": 4, "nbformat_minor": 4 -} \ No newline at end of file +} From ccdc2300dc91ced60f93808eae56aef15e92cd96 Mon Sep 17 00:00:00 2001 From: Edresson Casanova Date: Tue, 22 Mar 2022 08:54:41 -0300 Subject: [PATCH 19/38] Add eval_split and eval_split_size in the call of load_tts_samples for all recipes (#1424) --- recipes/ljspeech/align_tts/train_aligntts.py | 2 +- recipes/ljspeech/fast_pitch/train_fast_pitch.py | 2 +- recipes/ljspeech/fast_speech/train_fast_speech.py | 2 +- recipes/ljspeech/glow_tts/train_glowtts.py | 2 +- recipes/ljspeech/speedy_speech/train_speedy_speech.py | 2 +- recipes/ljspeech/tacotron2-DCA/train_tacotron_dca.py | 2 +- recipes/ljspeech/tacotron2-DDC/train_tacotron_ddc.py | 2 +- recipes/ljspeech/vits_tts/train_vits.py | 2 +- recipes/multilingual/vits_tts/train_vits_tts.py | 2 +- recipes/vctk/fast_pitch/train_fast_pitch.py | 2 +- recipes/vctk/fast_speech/train_fast_speech.py | 2 +- recipes/vctk/glow_tts/train_glow_tts.py | 2 +- recipes/vctk/speedy_speech/train_speedy_speech.py | 2 +- recipes/vctk/tacotron-DDC/train_tacotron-DDC.py | 2 +- recipes/vctk/tacotron2-DDC/train_tacotron2-ddc.py | 2 +- recipes/vctk/tacotron2/train_tacotron2.py | 2 +- recipes/vctk/vits/train_vits.py | 2 +- 17 files changed, 17 insertions(+), 17 deletions(-) diff --git a/recipes/ljspeech/align_tts/train_aligntts.py b/recipes/ljspeech/align_tts/train_aligntts.py index f1b29025..d27d0fa1 100644 --- a/recipes/ljspeech/align_tts/train_aligntts.py +++ b/recipes/ljspeech/align_tts/train_aligntts.py @@ -49,7 +49,7 @@ tokenizer, config = TTSTokenizer.init_from_config(config) # You can define your custom sample loader returning the list of samples. # Or define your custom formatter and pass it to the `load_tts_samples`. # Check `TTS.tts.datasets.load_tts_samples` for more details. 
-train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True) +train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True, eval_split_max_size=config.eval_split_max_size, eval_split_size=config.eval_split_size) # init model model = AlignTTS(config, ap, tokenizer) diff --git a/recipes/ljspeech/fast_pitch/train_fast_pitch.py b/recipes/ljspeech/fast_pitch/train_fast_pitch.py index a3fc35c9..1f10ef07 100644 --- a/recipes/ljspeech/fast_pitch/train_fast_pitch.py +++ b/recipes/ljspeech/fast_pitch/train_fast_pitch.py @@ -84,7 +84,7 @@ tokenizer, config = TTSTokenizer.init_from_config(config) # You can define your custom sample loader returning the list of samples. # Or define your custom formatter and pass it to the `load_tts_samples`. # Check `TTS.tts.datasets.load_tts_samples` for more details. -train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True) +train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True, eval_split_max_size=config.eval_split_max_size, eval_split_size=config.eval_split_size) # init the model model = ForwardTTS(config, ap, tokenizer, speaker_manager=None) diff --git a/recipes/ljspeech/fast_speech/train_fast_speech.py b/recipes/ljspeech/fast_speech/train_fast_speech.py index 560d3de2..e5a601a7 100644 --- a/recipes/ljspeech/fast_speech/train_fast_speech.py +++ b/recipes/ljspeech/fast_speech/train_fast_speech.py @@ -83,7 +83,7 @@ tokenizer, config = TTSTokenizer.init_from_config(config) # You can define your custom sample loader returning the list of samples. # Or define your custom formatter and pass it to the `load_tts_samples`. # Check `TTS.tts.datasets.load_tts_samples` for more details. -train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True) +train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True, eval_split_max_size=config.eval_split_max_size, eval_split_size=config.eval_split_size) # init the model model = ForwardTTS(config, ap, tokenizer) diff --git a/recipes/ljspeech/glow_tts/train_glowtts.py b/recipes/ljspeech/glow_tts/train_glowtts.py index c47cd00a..47d03fe3 100644 --- a/recipes/ljspeech/glow_tts/train_glowtts.py +++ b/recipes/ljspeech/glow_tts/train_glowtts.py @@ -60,7 +60,7 @@ tokenizer, config = TTSTokenizer.init_from_config(config) # You can define your custom sample loader returning the list of samples. # Or define your custom formatter and pass it to the `load_tts_samples`. # Check `TTS.tts.datasets.load_tts_samples` for more details. -train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True) +train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True, eval_split_max_size=config.eval_split_max_size, eval_split_size=config.eval_split_size) # INITIALIZE THE MODEL # Models take a config object and a speaker manager as input diff --git a/recipes/ljspeech/speedy_speech/train_speedy_speech.py b/recipes/ljspeech/speedy_speech/train_speedy_speech.py index 7ad132b2..a19e9053 100644 --- a/recipes/ljspeech/speedy_speech/train_speedy_speech.py +++ b/recipes/ljspeech/speedy_speech/train_speedy_speech.py @@ -67,7 +67,7 @@ tokenizer, config = TTSTokenizer.init_from_config(config) # You can define your custom sample loader returning the list of samples. # Or define your custom formatter and pass it to the `load_tts_samples`. # Check `TTS.tts.datasets.load_tts_samples` for more details. 
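The same one-line change recurs in every recipe hunk of this patch: `load_tts_samples()` now receives the evaluation-split limits from the recipe's config instead of relying on library defaults. For readability, the updated call looks roughly like the sketch below, where `dataset_config` and `config` are assumed to be the objects each recipe already defines (illustrative only, not part of the diff):

```python
# Sketch of the updated call used across these recipes (not part of the diff).
# `dataset_config` and `config` are assumed to be the dataset/model configs each recipe builds.
from TTS.tts.datasets import load_tts_samples

train_samples, eval_samples = load_tts_samples(
    dataset_config,
    eval_split=True,
    eval_split_max_size=config.eval_split_max_size,  # upper bound on the number of eval samples
    eval_split_size=config.eval_split_size,          # fraction (or absolute count) held out for eval
)
```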
-train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True) +train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True, eval_split_max_size=config.eval_split_max_size, eval_split_size=config.eval_split_size) # init model model = ForwardTTS(config, ap, tokenizer) diff --git a/recipes/ljspeech/tacotron2-DCA/train_tacotron_dca.py b/recipes/ljspeech/tacotron2-DCA/train_tacotron_dca.py index ea1b0874..19a9f315 100644 --- a/recipes/ljspeech/tacotron2-DCA/train_tacotron_dca.py +++ b/recipes/ljspeech/tacotron2-DCA/train_tacotron_dca.py @@ -77,7 +77,7 @@ tokenizer, config = TTSTokenizer.init_from_config(config) # You can define your custom sample loader returning the list of samples. # Or define your custom formatter and pass it to the `load_tts_samples`. # Check `TTS.tts.datasets.load_tts_samples` for more details. -train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True) +train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True, eval_split_max_size=config.eval_split_max_size, eval_split_size=config.eval_split_size) # INITIALIZE THE MODEL # Models take a config object and a speaker manager as input diff --git a/recipes/ljspeech/tacotron2-DDC/train_tacotron_ddc.py b/recipes/ljspeech/tacotron2-DDC/train_tacotron_ddc.py index 04e6150e..029698d8 100644 --- a/recipes/ljspeech/tacotron2-DDC/train_tacotron_ddc.py +++ b/recipes/ljspeech/tacotron2-DDC/train_tacotron_ddc.py @@ -74,7 +74,7 @@ tokenizer, config = TTSTokenizer.init_from_config(config) # You can define your custom sample loader returning the list of samples. # Or define your custom formatter and pass it to the `load_tts_samples`. # Check `TTS.tts.datasets.load_tts_samples` for more details. -train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True) +train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True, eval_split_max_size=config.eval_split_max_size, eval_split_size=config.eval_split_size) # INITIALIZE THE MODEL # Models take a config object and a speaker manager as input diff --git a/recipes/ljspeech/vits_tts/train_vits.py b/recipes/ljspeech/vits_tts/train_vits.py index cfb3351d..e38dc200 100644 --- a/recipes/ljspeech/vits_tts/train_vits.py +++ b/recipes/ljspeech/vits_tts/train_vits.py @@ -69,7 +69,7 @@ tokenizer, config = TTSTokenizer.init_from_config(config) # You can define your custom sample loader returning the list of samples. # Or define your custom formatter and pass it to the `load_tts_samples`. # Check `TTS.tts.datasets.load_tts_samples` for more details. 
-train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True) +train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True, eval_split_max_size=config.eval_split_max_size, eval_split_size=config.eval_split_size) # init model model = Vits(config, ap, tokenizer, speaker_manager=None) diff --git a/recipes/multilingual/vits_tts/train_vits_tts.py b/recipes/multilingual/vits_tts/train_vits_tts.py index 26eb46be..9e0cb4c8 100644 --- a/recipes/multilingual/vits_tts/train_vits_tts.py +++ b/recipes/multilingual/vits_tts/train_vits_tts.py @@ -109,7 +109,7 @@ config.from_dict(config.to_dict()) ap = AudioProcessor(**config.audio.to_dict()) # load training samples -train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True) +train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True, eval_split_max_size=config.eval_split_max_size, eval_split_size=config.eval_split_size) # init speaker manager for multi-speaker training # it maps speaker-id to speaker-name in the model and data-loader diff --git a/recipes/vctk/fast_pitch/train_fast_pitch.py b/recipes/vctk/fast_pitch/train_fast_pitch.py index 986202c5..d066a539 100644 --- a/recipes/vctk/fast_pitch/train_fast_pitch.py +++ b/recipes/vctk/fast_pitch/train_fast_pitch.py @@ -71,7 +71,7 @@ tokenizer, config = TTSTokenizer.init_from_config(config) # You can define your custom sample loader returning the list of samples. # Or define your custom formatter and pass it to the `load_tts_samples`. # Check `TTS.tts.datasets.load_tts_samples` for more details. -train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True) +train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True, eval_split_max_size=config.eval_split_max_size, eval_split_size=config.eval_split_size) # init speaker manager for multi-speaker training # it maps speaker-id to speaker-name in the model and data-loader diff --git a/recipes/vctk/fast_speech/train_fast_speech.py b/recipes/vctk/fast_speech/train_fast_speech.py index fe785a41..dbe23351 100644 --- a/recipes/vctk/fast_speech/train_fast_speech.py +++ b/recipes/vctk/fast_speech/train_fast_speech.py @@ -69,7 +69,7 @@ tokenizer, config = TTSTokenizer.init_from_config(config) # You can define your custom sample loader returning the list of samples. # Or define your custom formatter and pass it to the `load_tts_samples`. # Check `TTS.tts.datasets.load_tts_samples` for more details. -train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True) +train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True, eval_split_max_size=config.eval_split_max_size, eval_split_size=config.eval_split_size) # init speaker manager for multi-speaker training # it maps speaker-id to speaker-name in the model and data-loader diff --git a/recipes/vctk/glow_tts/train_glow_tts.py b/recipes/vctk/glow_tts/train_glow_tts.py index ebdbfb37..8a891e5d 100644 --- a/recipes/vctk/glow_tts/train_glow_tts.py +++ b/recipes/vctk/glow_tts/train_glow_tts.py @@ -69,7 +69,7 @@ tokenizer, config = TTSTokenizer.init_from_config(config) # You can define your custom sample loader returning the list of samples. # Or define your custom formatter and pass it to the `load_tts_samples`. # Check `TTS.tts.datasets.load_tts_samples` for more details. 
-train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True) +train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True, eval_split_max_size=config.eval_split_max_size, eval_split_size=config.eval_split_size) # init speaker manager for multi-speaker training # it maps speaker-id to speaker-name in the model and data-loader diff --git a/recipes/vctk/speedy_speech/train_speedy_speech.py b/recipes/vctk/speedy_speech/train_speedy_speech.py index 80d21ca2..d9353af2 100644 --- a/recipes/vctk/speedy_speech/train_speedy_speech.py +++ b/recipes/vctk/speedy_speech/train_speedy_speech.py @@ -69,7 +69,7 @@ tokenizer, config = TTSTokenizer.init_from_config(config) # You can define your custom sample loader returning the list of samples. # Or define your custom formatter and pass it to the `load_tts_samples`. # Check `TTS.tts.datasets.load_tts_samples` for more details. -train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True) +train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True, eval_split_max_size=config.eval_split_max_size, eval_split_size=config.eval_split_size) # init speaker manager for multi-speaker training # it maps speaker-id to speaker-name in the model and data-loader diff --git a/recipes/vctk/tacotron-DDC/train_tacotron-DDC.py b/recipes/vctk/tacotron-DDC/train_tacotron-DDC.py index bed21ad9..14007239 100644 --- a/recipes/vctk/tacotron-DDC/train_tacotron-DDC.py +++ b/recipes/vctk/tacotron-DDC/train_tacotron-DDC.py @@ -72,7 +72,7 @@ tokenizer, config = TTSTokenizer.init_from_config(config) # You can define your custom sample loader returning the list of samples. # Or define your custom formatter and pass it to the `load_tts_samples`. # Check `TTS.tts.datasets.load_tts_samples` for more details. -train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True) +train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True, eval_split_max_size=config.eval_split_max_size, eval_split_size=config.eval_split_size) # init speaker manager for multi-speaker training # it mainly handles speaker-id to speaker-name for the model and the data-loader diff --git a/recipes/vctk/tacotron2-DDC/train_tacotron2-ddc.py b/recipes/vctk/tacotron2-DDC/train_tacotron2-ddc.py index caa745b3..ab2e1bc9 100644 --- a/recipes/vctk/tacotron2-DDC/train_tacotron2-ddc.py +++ b/recipes/vctk/tacotron2-DDC/train_tacotron2-ddc.py @@ -78,7 +78,7 @@ tokenizer, config = TTSTokenizer.init_from_config(config) # You can define your custom sample loader returning the list of samples. # Or define your custom formatter and pass it to the `load_tts_samples`. # Check `TTS.tts.datasets.load_tts_samples` for more details. -train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True) +train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True, eval_split_max_size=config.eval_split_max_size, eval_split_size=config.eval_split_size) # init speaker manager for multi-speaker training # it mainly handles speaker-id to speaker-name for the model and the data-loader diff --git a/recipes/vctk/tacotron2/train_tacotron2.py b/recipes/vctk/tacotron2/train_tacotron2.py index 43f5d4e6..48934e2a 100644 --- a/recipes/vctk/tacotron2/train_tacotron2.py +++ b/recipes/vctk/tacotron2/train_tacotron2.py @@ -78,7 +78,7 @@ tokenizer, config = TTSTokenizer.init_from_config(config) # You can define your custom sample loader returning the list of samples. 
# Or define your custom formatter and pass it to the `load_tts_samples`. # Check `TTS.tts.datasets.load_tts_samples` for more details. -train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True) +train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True, eval_split_max_size=config.eval_split_max_size, eval_split_size=config.eval_split_size) # init speaker manager for multi-speaker training # it mainly handles speaker-id to speaker-name for the model and the data-loader diff --git a/recipes/vctk/vits/train_vits.py b/recipes/vctk/vits/train_vits.py index 84e8a058..443dbbd1 100644 --- a/recipes/vctk/vits/train_vits.py +++ b/recipes/vctk/vits/train_vits.py @@ -79,7 +79,7 @@ tokenizer, config = TTSTokenizer.init_from_config(config) # You can define your custom sample loader returning the list of samples. # Or define your custom formatter and pass it to the `load_tts_samples`. # Check `TTS.tts.datasets.load_tts_samples` for more details. -train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True) +train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True, eval_split_max_size=config.eval_split_max_size, eval_split_size=config.eval_split_size) # init speaker manager for multi-speaker training # it maps speaker-id to speaker-name in the model and data-loader From 72d85e53c98b908345bbff70f7cfba2174e883ce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Tue, 22 Mar 2022 17:55:00 +0100 Subject: [PATCH 20/38] Update model file extension (#1422) * Update model file ext to ```.pth``` * Update docs * Rename more * Find model files --- .gitignore | 1 + README.md | 8 +-- TTS/bin/compute_attention_masks.py | 2 +- TTS/bin/compute_embeddings.py | 2 +- TTS/bin/distribute.py | 55 ------------------- TTS/bin/eval_encoder.py | 2 +- TTS/bin/synthesize.py | 8 +-- TTS/encoder/README.md | 2 +- TTS/encoder/utils/generic_utils.py | 4 +- TTS/encoder/utils/io.py | 4 +- TTS/server/README.md | 2 +- TTS/server/conf.json | 2 +- TTS/utils/audio.py | 2 +- TTS/utils/generic_utils.py | 2 +- TTS/utils/io.py | 8 +-- TTS/utils/manage.py | 31 ++++++++++- TTS/vocoder/README.md | 2 +- docs/source/finetuning.md | 6 +- docs/source/inference.md | 6 +- docs/source/training_a_model.md | 4 +- docs/source/tutorial_for_nervous_beginners.md | 4 +- notebooks/ExtractTTSpectrogram.ipynb | 2 +- notebooks/PlotUmapLibriTTS.ipynb | 2 +- notebooks/TestAttention.ipynb | 2 +- .../dataset_analysis/AnalyzeDataset.ipynb | 2 +- .../test_extract_tts_spectrograms.py | 6 +- tests/aux_tests/test_speaker_manager.py | 2 +- tests/inference_tests/test_synthesizer.py | 2 +- tests/inputs/server_config.json | 2 +- 29 files changed, 74 insertions(+), 103 deletions(-) delete mode 100644 TTS/bin/distribute.py diff --git a/.gitignore b/.gitignore index f8d6e644..2a3cbad4 100644 --- a/.gitignore +++ b/.gitignore @@ -115,6 +115,7 @@ venv.bak/ *.swo # pytorch models +*.pth *.pth.tar result/ diff --git a/README.md b/README.md index 80fa5dea..97a7cc66 100644 --- a/README.md +++ b/README.md @@ -159,13 +159,13 @@ If you are on Windows, 👑@GuyPaddock wrote installation instructions [here](ht - Run your own TTS model (Using Griffin-Lim Vocoder): ``` - $ tts --text "Text for TTS" --model_path path/to/model.pth.tar --config_path path/to/config.json --out_path output/path/speech.wav + $ tts --text "Text for TTS" --model_path path/to/model.pth --config_path path/to/config.json --out_path output/path/speech.wav ``` - Run your own TTS and Vocoder models: ``` - $ tts --text "Text for TTS" 
--model_path path/to/config.json --config_path path/to/model.pth.tar --out_path output/path/speech.wav - --vocoder_path path/to/vocoder.pth.tar --vocoder_config_path path/to/vocoder_config.json + $ tts --text "Text for TTS" --model_path path/to/config.json --config_path path/to/model.pth --out_path output/path/speech.wav + --vocoder_path path/to/vocoder.pth --vocoder_config_path path/to/vocoder_config.json ``` ### Multi-speaker Models @@ -185,7 +185,7 @@ If you are on Windows, 👑@GuyPaddock wrote installation instructions [here](ht - Run your own multi-speaker TTS model: ``` - $ tts --text "Text for TTS" --out_path output/path/speech.wav --model_path path/to/config.json --config_path path/to/model.pth.tar --speakers_file_path path/to/speaker.json --speaker_idx + $ tts --text "Text for TTS" --out_path output/path/speech.wav --model_path path/to/config.json --config_path path/to/model.pth --speakers_file_path path/to/speaker.json --speaker_idx ``` ## Directory Structure diff --git a/TTS/bin/compute_attention_masks.py b/TTS/bin/compute_attention_masks.py index e58259a6..9ab520be 100644 --- a/TTS/bin/compute_attention_masks.py +++ b/TTS/bin/compute_attention_masks.py @@ -25,7 +25,7 @@ These masks can be used for different purposes including training a TTS model wi """ Example run: CUDA_VISIBLE_DEVICE="0" python TTS/bin/compute_attention_masks.py - --model_path /data/rw/home/Models/ljspeech-dcattn-December-14-2020_11+10AM-9d0e8c7/checkpoint_200000.pth.tar + --model_path /data/rw/home/Models/ljspeech-dcattn-December-14-2020_11+10AM-9d0e8c7/checkpoint_200000.pth --config_path /data/rw/home/Models/ljspeech-dcattn-December-14-2020_11+10AM-9d0e8c7/config.json --dataset_metafile metadata.csv --data_path /root/LJSpeech-1.1/ diff --git a/TTS/bin/compute_embeddings.py b/TTS/bin/compute_embeddings.py index 68571fb4..d7a2c5f6 100644 --- a/TTS/bin/compute_embeddings.py +++ b/TTS/bin/compute_embeddings.py @@ -12,7 +12,7 @@ parser = argparse.ArgumentParser( description="""Compute embedding vectors for each wav file in a dataset.\n\n""" """ Example runs: - python TTS/bin/compute_embeddings.py speaker_encoder_model.pth.tar speaker_encoder_config.json dataset_config.json embeddings_output_path/ + python TTS/bin/compute_embeddings.py speaker_encoder_model.pth speaker_encoder_config.json dataset_config.json embeddings_output_path/ """, formatter_class=RawTextHelpFormatter, ) diff --git a/TTS/bin/distribute.py b/TTS/bin/distribute.py deleted file mode 100644 index b5552e32..00000000 --- a/TTS/bin/distribute.py +++ /dev/null @@ -1,55 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- - -import os -import pathlib -import subprocess -import time - -import torch -from trainer import TrainerArgs - - -def main(): - """ - Call train.py as a new process and pass command arguments - """ - parser = TrainerArgs().init_argparse(arg_prefix="") - parser.add_argument("--script", type=str, help="Target training script to distibute.") - args, unargs = parser.parse_known_args() - - num_gpus = torch.cuda.device_count() - group_id = time.strftime("%Y_%m_%d-%H%M%S") - - # set arguments for train.py - folder_path = pathlib.Path(__file__).parent.absolute() - if os.path.exists(os.path.join(folder_path, args.script)): - command = [os.path.join(folder_path, args.script)] - else: - command = [args.script] - command.append("--continue_path={}".format(args.continue_path)) - command.append("--restore_path={}".format(args.restore_path)) - command.append("--config_path={}".format(args.config_path)) - 
command.append("--group_id=group_{}".format(group_id)) - command.append("--use_ddp=true") - command += unargs - command.append("") - - # run a processes per GPU - processes = [] - for i in range(num_gpus): - my_env = os.environ.copy() - my_env["PYTHON_EGG_CACHE"] = "/tmp/tmp{}".format(i) - command[-1] = "--rank={}".format(i) - # prevent stdout for processes with rank != 0 - stdout = None - p = subprocess.Popen(["python3"] + command, stdout=stdout, env=my_env) # pylint: disable=consider-using-with - processes.append(p) - print(command) - - for p in processes: - p.wait() - - -if __name__ == "__main__": - main() diff --git a/TTS/bin/eval_encoder.py b/TTS/bin/eval_encoder.py index de9e5865..089f3645 100644 --- a/TTS/bin/eval_encoder.py +++ b/TTS/bin/eval_encoder.py @@ -56,7 +56,7 @@ if __name__ == "__main__": description="""Compute the accuracy of the encoder.\n\n""" """ Example runs: - python TTS/bin/eval_encoder.py emotion_encoder_model.pth.tar emotion_encoder_config.json dataset_config.json + python TTS/bin/eval_encoder.py emotion_encoder_model.pth emotion_encoder_config.json dataset_config.json """, formatter_class=RawTextHelpFormatter, ) diff --git a/TTS/bin/synthesize.py b/TTS/bin/synthesize.py index 8b3f53db..eb166bc8 100755 --- a/TTS/bin/synthesize.py +++ b/TTS/bin/synthesize.py @@ -60,13 +60,13 @@ If you don't specify any models, then it uses LJSpeech based English model. - Run your own TTS model (Using Griffin-Lim Vocoder): ``` - $ tts --text "Text for TTS" --model_path path/to/model.pth.tar --config_path path/to/config.json --out_path output/path/speech.wav + $ tts --text "Text for TTS" --model_path path/to/model.pth --config_path path/to/config.json --out_path output/path/speech.wav ``` - Run your own TTS and Vocoder models: ``` - $ tts --text "Text for TTS" --model_path path/to/config.json --config_path path/to/model.pth.tar --out_path output/path/speech.wav - --vocoder_path path/to/vocoder.pth.tar --vocoder_config_path path/to/vocoder_config.json + $ tts --text "Text for TTS" --model_path path/to/config.json --config_path path/to/model.pth --out_path output/path/speech.wav + --vocoder_path path/to/vocoder.pth --vocoder_config_path path/to/vocoder_config.json ``` ### Multi-speaker Models @@ -86,7 +86,7 @@ If you don't specify any models, then it uses LJSpeech based English model. - Run your own multi-speaker TTS model: ``` - $ tts --text "Text for TTS" --out_path output/path/speech.wav --model_path path/to/config.json --config_path path/to/model.pth.tar --speakers_file_path path/to/speaker.json --speaker_idx + $ tts --text "Text for TTS" --out_path output/path/speech.wav --model_path path/to/config.json --config_path path/to/model.pth --speakers_file_path path/to/speaker.json --speaker_idx ``` """ # We remove Markdown code formatting programmatically here to allow us to copy-and-paste from main README to keep diff --git a/TTS/encoder/README.md b/TTS/encoder/README.md index b6f541f8..b38b2005 100644 --- a/TTS/encoder/README.md +++ b/TTS/encoder/README.md @@ -14,5 +14,5 @@ To run the code, you need to follow the same flow as in TTS. - Define 'config.json' for your needs. Note that, audio parameters should match your TTS model. - Example training call ```python speaker_encoder/train.py --config_path speaker_encoder/config.json --data_path ~/Data/Libri-TTS/train-clean-360``` -- Generate embedding vectors ```python speaker_encoder/compute_embeddings.py --use_cuda true /model/path/best_model.pth.tar model/config/path/config.json dataset/path/ output_path``` . 
This code parses all .wav files at the given dataset path and generates the same folder structure under the output path with the generated embedding files. +- Generate embedding vectors ```python speaker_encoder/compute_embeddings.py --use_cuda true /model/path/best_model.pth model/config/path/config.json dataset/path/ output_path``` . This code parses all .wav files at the given dataset path and generates the same folder structure under the output path with the generated embedding files. - Watch training on Tensorboard as in TTS diff --git a/TTS/encoder/utils/generic_utils.py b/TTS/encoder/utils/generic_utils.py index 17f1c3d9..19c00582 100644 --- a/TTS/encoder/utils/generic_utils.py +++ b/TTS/encoder/utils/generic_utils.py @@ -147,7 +147,7 @@ def setup_speaker_encoder_model(config: "Coqpit"): def save_checkpoint(model, optimizer, criterion, model_loss, out_path, current_step, epoch): - checkpoint_path = "checkpoint_{}.pth.tar".format(current_step) + checkpoint_path = "checkpoint_{}.pth".format(current_step) checkpoint_path = os.path.join(out_path, checkpoint_path) print(" | | > Checkpoint saving : {}".format(checkpoint_path)) @@ -177,7 +177,7 @@ def save_best_model(model, optimizer, criterion, model_loss, best_loss, out_path "date": datetime.date.today().strftime("%B %d, %Y"), } best_loss = model_loss - bestmodel_path = "best_model.pth.tar" + bestmodel_path = "best_model.pth" bestmodel_path = os.path.join(out_path, bestmodel_path) print("\n > BEST MODEL ({0:.5f}) : {1:}".format(model_loss, bestmodel_path)) save_fsspec(state, bestmodel_path) diff --git a/TTS/encoder/utils/io.py b/TTS/encoder/utils/io.py index 7a3aadc9..d1dad3e2 100644 --- a/TTS/encoder/utils/io.py +++ b/TTS/encoder/utils/io.py @@ -5,7 +5,7 @@ from TTS.utils.io import save_fsspec def save_checkpoint(model, optimizer, model_loss, out_path, current_step): - checkpoint_path = "checkpoint_{}.pth.tar".format(current_step) + checkpoint_path = "checkpoint_{}.pth".format(current_step) checkpoint_path = os.path.join(out_path, checkpoint_path) print(" | | > Checkpoint saving : {}".format(checkpoint_path)) @@ -31,7 +31,7 @@ def save_best_model(model, optimizer, model_loss, best_loss, out_path, current_s "date": datetime.date.today().strftime("%B %d, %Y"), } best_loss = model_loss - bestmodel_path = "best_model.pth.tar" + bestmodel_path = "best_model.pth" bestmodel_path = os.path.join(out_path, bestmodel_path) print("\n > BEST MODEL ({0:.5f}) : {1:}".format(model_loss, bestmodel_path)) save_fsspec(state, bestmodel_path) diff --git a/TTS/server/README.md b/TTS/server/README.md index 89ee21eb..5458e398 100644 --- a/TTS/server/README.md +++ b/TTS/server/README.md @@ -21,4 +21,4 @@ Run the server with the official models on a GPU. ```CUDA_VISIBLE_DEVICES="0" python TTS/server/server.py --model_name tts_models/en/ljspeech/tacotron2-DCA --vocoder_name vocoder_models/en/ljspeech/multiband-melgan --use_cuda True``` Run the server with a custom models. 
-```python TTS/server/server.py --tts_checkpoint /path/to/tts/model.pth.tar --tts_config /path/to/tts/config.json --vocoder_checkpoint /path/to/vocoder/model.pth.tar --vocoder_config /path/to/vocoder/config.json``` +```python TTS/server/server.py --tts_checkpoint /path/to/tts/model.pth --tts_config /path/to/tts/config.json --vocoder_checkpoint /path/to/vocoder/model.pth --vocoder_config /path/to/vocoder/config.json``` diff --git a/TTS/server/conf.json b/TTS/server/conf.json index 32e475cf..49b6c09c 100644 --- a/TTS/server/conf.json +++ b/TTS/server/conf.json @@ -1,6 +1,6 @@ { "tts_path":"/media/erogol/data_ssd/Models/libri_tts/5049/", // tts model root folder - "tts_file":"best_model.pth.tar", // tts checkpoint file + "tts_file":"best_model.pth", // tts checkpoint file "tts_config":"config.json", // tts config.json file "tts_speakers": null, // json file listing speaker ids. null if no speaker embedding. "vocoder_config":null, diff --git a/TTS/utils/audio.py b/TTS/utils/audio.py index d0777c11..3ed0a76a 100644 --- a/TTS/utils/audio.py +++ b/TTS/utils/audio.py @@ -371,7 +371,7 @@ class AudioProcessor(object): self.hop_length = hop_length self.win_length = win_length assert min_level_db != 0.0, " [!] min_level_db is 0" - assert self.win_length <= self.fft_size, " [!] win_length cannot be larger than fft_size" + assert self.win_length <= self.fft_size, f" [!] win_length cannot be larger than fft_size - {self.win_length} vs {self.fft_size}" members = vars(self) if verbose: print(" > Setting up Audio Processor...") diff --git a/TTS/utils/generic_utils.py b/TTS/utils/generic_utils.py index 69609bcb..b685210c 100644 --- a/TTS/utils/generic_utils.py +++ b/TTS/utils/generic_utils.py @@ -67,7 +67,7 @@ def get_experiment_folder_path(root_path, model_name): def remove_experiment_folder(experiment_path): """Check folder if there is a checkpoint, otherwise remove the folder""" fs = fsspec.get_mapper(experiment_path).fs - checkpoint_files = fs.glob(experiment_path + "/*.pth.tar") + checkpoint_files = fs.glob(experiment_path + "/*.pth") if not checkpoint_files: if fs.exists(experiment_path): fs.rm(experiment_path, recursive=True) diff --git a/TTS/utils/io.py b/TTS/utils/io.py index 54818ce9..304df5ed 100644 --- a/TTS/utils/io.py +++ b/TTS/utils/io.py @@ -140,7 +140,7 @@ def save_checkpoint( output_folder, **kwargs, ): - file_name = "checkpoint_{}.pth.tar".format(current_step) + file_name = "checkpoint_{}.pth".format(current_step) checkpoint_path = os.path.join(output_folder, file_name) print("\n > CHECKPOINT : {}".format(checkpoint_path)) save_model( @@ -170,7 +170,7 @@ def save_best_model( **kwargs, ): if current_loss < best_loss: - best_model_name = f"best_model_{current_step}.pth.tar" + best_model_name = f"best_model_{current_step}.pth" checkpoint_path = os.path.join(out_path, best_model_name) print(" > BEST MODEL : {}".format(checkpoint_path)) save_model( @@ -187,12 +187,12 @@ def save_best_model( fs = fsspec.get_mapper(out_path).fs # only delete previous if current is saved successfully if not keep_all_best or (current_step < keep_after): - model_names = fs.glob(os.path.join(out_path, "best_model*.pth.tar")) + model_names = fs.glob(os.path.join(out_path, "best_model*.pth")) for model_name in model_names: if os.path.basename(model_name) != best_model_name: fs.rm(model_name) # create a shortcut which always points to the currently best model - shortcut_name = "best_model.pth.tar" + shortcut_name = "best_model.pth" shortcut_path = os.path.join(out_path, shortcut_name) fs.copy(checkpoint_path, 
shortcut_path) best_loss = current_loss diff --git a/TTS/utils/manage.py b/TTS/utils/manage.py index 01d54ad6..dd397687 100644 --- a/TTS/utils/manage.py +++ b/TTS/utils/manage.py @@ -3,6 +3,7 @@ import json import os import zipfile from pathlib import Path +from typing import Tuple from shutil import copyfile, rmtree import requests @@ -114,7 +115,7 @@ class ModelManager(object): e.g. 'tts_model/en/ljspeech/tacotron' Every model must have the following files: - - *.pth.tar : pytorch model checkpoint file. + - *.pth : pytorch model checkpoint file. - config.json : model config file. - scale_stats.npy (if exist): scale values for preprocessing. @@ -127,7 +128,7 @@ class ModelManager(object): model_item = self.models_dict[model_type][lang][dataset][model] # set the model specific output path output_path = os.path.join(self.output_prefix, model_full_name) - output_model_path = os.path.join(output_path, "model_file.pth.tar") + output_model_path = os.path.join(output_path, "model_file.pth") output_config_path = os.path.join(output_path, "config.json") if os.path.exists(output_path): @@ -139,8 +140,32 @@ class ModelManager(object): self._download_zip_file(model_item["github_rls_url"], output_path) # update paths in the config.json self._update_paths(output_path, output_config_path) + # find downloaded files + output_model_path, output_config_path = self._find_files(output_path) return output_model_path, output_config_path, model_item + def _find_files(self, output_path:str) -> Tuple[str, str]: + """Find the model and config files in the output path + + Args: + output_path (str): path to the model files + + Returns: + Tuple[str, str]: path to the model file and config file + """ + model_file = None + config_file = None + for file_name in os.listdir(output_path): + if file_name in ["model_file.pth", "model_file.pth.tar", "model.pth"]: + model_file = os.path.join(output_path, file_name) + elif file_name == "config.json": + config_file = os.path.join(output_path, file_name) + if model_file is None: + raise ValueError(" [!] Model file not found in the output path") + if config_file is None: + raise ValueError(" [!] Config file not found in the output path") + return model_file, config_file + def _update_paths(self, output_path: str, config_path: str) -> None: """Update paths for certain files in config.json after download. @@ -152,7 +177,7 @@ class ModelManager(object): output_d_vector_file_path = os.path.join(output_path, "speakers.json") output_speaker_ids_file_path = os.path.join(output_path, "speaker_ids.json") speaker_encoder_config_path = os.path.join(output_path, "config_se.json") - speaker_encoder_model_path = os.path.join(output_path, "model_se.pth.tar") + speaker_encoder_model_path = os.path.join(output_path, "model_se.pth") # update the scale_path.npy file path in the model config.json self._update_path("audio.stats_path", output_stats_path, config_path) diff --git a/TTS/vocoder/README.md b/TTS/vocoder/README.md index e0ae8f21..b9fb17c8 100644 --- a/TTS/vocoder/README.md +++ b/TTS/vocoder/README.md @@ -29,7 +29,7 @@ You can continue a previous training run by the following command. You can fine-tune a pre-trained model by the following command. -```CUDA_VISIBLE_DEVICES='0' python tts/bin/train_vocoder.py --restore_path path/to/your/model.pth.tar``` +```CUDA_VISIBLE_DEVICES='0' python tts/bin/train_vocoder.py --restore_path path/to/your/model.pth``` Restoring a model starts a new training in a different folder. It only restores model weights with the given checkpoint file. 
However, continuing a training starts from the same directory where the previous training run left off. diff --git a/docs/source/finetuning.md b/docs/source/finetuning.md index 7d7ef1cb..fd97daa5 100644 --- a/docs/source/finetuning.md +++ b/docs/source/finetuning.md @@ -93,13 +93,13 @@ them and fine-tune it for your own dataset. This will help you in two main ways: ```bash CUDA_VISIBLE_DEVICES="0" python recipes/ljspeech/glow_tts/train_glowtts.py \ - --restore_path /home/ubuntu/.local/share/tts/tts_models--en--ljspeech--glow-tts/model_file.pth.tar + --restore_path /home/ubuntu/.local/share/tts/tts_models--en--ljspeech--glow-tts/model_file.pth ``` ```bash CUDA_VISIBLE_DEVICES="0" python TTS/bin/train_tts.py \ --config_path /home/ubuntu/.local/share/tts/tts_models--en--ljspeech--glow-tts/config.json \ - --restore_path /home/ubuntu/.local/share/tts/tts_models--en--ljspeech--glow-tts/model_file.pth.tar + --restore_path /home/ubuntu/.local/share/tts/tts_models--en--ljspeech--glow-tts/model_file.pth ``` As stated above, you can also use command-line arguments to change the model configuration. @@ -107,7 +107,7 @@ them and fine-tune it for your own dataset. This will help you in two main ways: ```bash CUDA_VISIBLE_DEVICES="0" python recipes/ljspeech/glow_tts/train_glowtts.py \ - --restore_path /home/ubuntu/.local/share/tts/tts_models--en--ljspeech--glow-tts/model_file.pth.tar + --restore_path /home/ubuntu/.local/share/tts/tts_models--en--ljspeech--glow-tts/model_file.pth --coqpit.run_name "glow-tts-finetune" \ --coqpit.lr 0.00001 ``` diff --git a/docs/source/inference.md b/docs/source/inference.md index 544473bf..1057d04d 100644 --- a/docs/source/inference.md +++ b/docs/source/inference.md @@ -44,7 +44,7 @@ Run your own TTS model (Using Griffin-Lim Vocoder) ```bash tts --text "Text for TTS" \ - --model_path path/to/model.pth.tar \ + --model_path path/to/model.pth \ --config_path path/to/config.json \ --out_path folder/to/save/output.wav ``` @@ -54,9 +54,9 @@ Run your own TTS and Vocoder models ```bash tts --text "Text for TTS" \ --config_path path/to/config.json \ - --model_path path/to/model.pth.tar \ + --model_path path/to/model.pth \ --out_path folder/to/save/output.wav \ - --vocoder_path path/to/vocoder.pth.tar \ + --vocoder_path path/to/vocoder.pth \ --vocoder_config_path path/to/vocoder_config.json ``` diff --git a/docs/source/training_a_model.md b/docs/source/training_a_model.md index a28710d0..22090f6e 100644 --- a/docs/source/training_a_model.md +++ b/docs/source/training_a_model.md @@ -33,7 +33,7 @@ If you like to run a multi-gpu training using DDP back-end, ```bash - $ CUDA_VISIBLE_DEVICES="0, 1, 2" python TTS/bin/distribute.py --script /train_glowtts.py + $ CUDA_VISIBLE_DEVICES="0, 1, 2" python -m trainer.distribute --script /train_glowtts.py ``` The example above runs a multi-gpu training using GPUs `0, 1, 2`. @@ -122,7 +122,7 @@ ```bash $ tts --text "Text for TTS" \ - --model_path path/to/checkpoint_x.pth.tar \ + --model_path path/to/checkpoint_x.pth \ --config_path path/to/config.json \ --out_path folder/to/save/output.wav ``` diff --git a/docs/source/tutorial_for_nervous_beginners.md b/docs/source/tutorial_for_nervous_beginners.md index fa09cb7d..d2d3c4bb 100644 --- a/docs/source/tutorial_for_nervous_beginners.md +++ b/docs/source/tutorial_for_nervous_beginners.md @@ -50,13 +50,13 @@ A breakdown of a simple script that trains a GlowTTS model on the LJspeech datas - Fine-tune a model. 
```bash - CUDA_VISIBLE_DEVICES=0 python train.py --restore_path path/to/model/checkpoint.pth.tar + CUDA_VISIBLE_DEVICES=0 python train.py --restore_path path/to/model/checkpoint.pth ``` - Run multi-gpu training. ```bash - CUDA_VISIBLE_DEVICES=0,1,2 python TTS/bin/distribute.py --script train.py + CUDA_VISIBLE_DEVICES=0,1,2 python -m trainer.distribute --script train.py ``` ### CLI Way diff --git a/notebooks/ExtractTTSpectrogram.ipynb b/notebooks/ExtractTTSpectrogram.ipynb index 50b60ff0..a257b6bf 100644 --- a/notebooks/ExtractTTSpectrogram.ipynb +++ b/notebooks/ExtractTTSpectrogram.ipynb @@ -66,7 +66,7 @@ "DATASET = \"ljspeech\"\n", "METADATA_FILE = \"metadata.csv\"\n", "CONFIG_PATH = \"/home/ubuntu/.local/share/tts/tts_models--en--ljspeech--tacotron2-DDC_ph/config.json\"\n", - "MODEL_FILE = \"/home/ubuntu/.local/share/tts/tts_models--en--ljspeech--tacotron2-DDC_ph/model_file.pth.tar\"\n", + "MODEL_FILE = \"/home/ubuntu/.local/share/tts/tts_models--en--ljspeech--tacotron2-DDC_ph/model_file.pth\"\n", "BATCH_SIZE = 32\n", "\n", "QUANTIZED_WAV = False\n", diff --git a/notebooks/PlotUmapLibriTTS.ipynb b/notebooks/PlotUmapLibriTTS.ipynb index c809a5c4..1e29790b 100644 --- a/notebooks/PlotUmapLibriTTS.ipynb +++ b/notebooks/PlotUmapLibriTTS.ipynb @@ -66,7 +66,7 @@ "outputs": [], "source": [ "MODEL_RUN_PATH = \"/media/erogol/data_ssd/Models/libri_tts/speaker_encoder/libritts_360-half-October-31-2019_04+54PM-19d2f5f/\"\n", - "MODEL_PATH = MODEL_RUN_PATH + \"best_model.pth.tar\"\n", + "MODEL_PATH = MODEL_RUN_PATH + \"best_model.pth\"\n", "CONFIG_PATH = MODEL_RUN_PATH + \"config.json\"\n", "\n", "# My single speaker locations\n", diff --git a/notebooks/TestAttention.ipynb b/notebooks/TestAttention.ipynb index 5d8eed85..b257ff70 100644 --- a/notebooks/TestAttention.ipynb +++ b/notebooks/TestAttention.ipynb @@ -73,7 +73,7 @@ "\n", "# Set constants\n", "ROOT_PATH = '/home/erogol/Models/LJSpeech/ljspeech-May-20-2020_12+29PM-1835628/'\n", - "MODEL_PATH = ROOT_PATH + '/best_model.pth.tar'\n", + "MODEL_PATH = ROOT_PATH + '/best_model.pth'\n", "CONFIG_PATH = ROOT_PATH + '/config.json'\n", "OUT_FOLDER = './hard_sentences/'\n", "CONFIG = load_config(CONFIG_PATH)\n", diff --git a/notebooks/dataset_analysis/AnalyzeDataset.ipynb b/notebooks/dataset_analysis/AnalyzeDataset.ipynb index e08f3ab3..51963847 100644 --- a/notebooks/dataset_analysis/AnalyzeDataset.ipynb +++ b/notebooks/dataset_analysis/AnalyzeDataset.ipynb @@ -416,7 +416,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.1" + "version": "3.9.5" } }, "nbformat": 4, diff --git a/tests/aux_tests/test_extract_tts_spectrograms.py b/tests/aux_tests/test_extract_tts_spectrograms.py index 8c795d58..ef751846 100644 --- a/tests/aux_tests/test_extract_tts_spectrograms.py +++ b/tests/aux_tests/test_extract_tts_spectrograms.py @@ -15,7 +15,7 @@ class TestExtractTTSSpectrograms(unittest.TestCase): def test_GlowTTS(): # set paths config_path = os.path.join(get_tests_input_path(), "test_glow_tts.json") - checkpoint_path = os.path.join(get_tests_output_path(), "checkpoint_test.pth.tar") + checkpoint_path = os.path.join(get_tests_output_path(), "checkpoint_test.pth") output_path = os.path.join(get_tests_output_path(), "output_extract_tts_spectrograms/") # load config c = load_config(config_path) @@ -33,7 +33,7 @@ class TestExtractTTSSpectrograms(unittest.TestCase): def test_Tacotron2(): # set paths config_path = os.path.join(get_tests_input_path(), "test_tacotron2_config.json") - checkpoint_path = 
os.path.join(get_tests_output_path(), "checkpoint_test.pth.tar") + checkpoint_path = os.path.join(get_tests_output_path(), "checkpoint_test.pth") output_path = os.path.join(get_tests_output_path(), "output_extract_tts_spectrograms/") # load config c = load_config(config_path) @@ -51,7 +51,7 @@ class TestExtractTTSSpectrograms(unittest.TestCase): def test_Tacotron(): # set paths config_path = os.path.join(get_tests_input_path(), "test_tacotron_config.json") - checkpoint_path = os.path.join(get_tests_output_path(), "checkpoint_test.pth.tar") + checkpoint_path = os.path.join(get_tests_output_path(), "checkpoint_test.pth") output_path = os.path.join(get_tests_output_path(), "output_extract_tts_spectrograms/") # load config c = load_config(config_path) diff --git a/tests/aux_tests/test_speaker_manager.py b/tests/aux_tests/test_speaker_manager.py index 5fafb56a..57ff6c50 100644 --- a/tests/aux_tests/test_speaker_manager.py +++ b/tests/aux_tests/test_speaker_manager.py @@ -12,7 +12,7 @@ from TTS.tts.utils.speakers import SpeakerManager from TTS.utils.audio import AudioProcessor encoder_config_path = os.path.join(get_tests_input_path(), "test_speaker_encoder_config.json") -encoder_model_path = os.path.join(get_tests_input_path(), "checkpoint_0.pth.tar") +encoder_model_path = os.path.join(get_tests_input_path(), "checkpoint_0.pth") sample_wav_path = os.path.join(get_tests_input_path(), "../data/ljspeech/wavs/LJ001-0001.wav") sample_wav_path2 = os.path.join(get_tests_input_path(), "../data/ljspeech/wavs/LJ001-0002.wav") d_vectors_file_path = os.path.join(get_tests_input_path(), "../data/dummy_speakers.json") diff --git a/tests/inference_tests/test_synthesizer.py b/tests/inference_tests/test_synthesizer.py index d643cb81..b5350b0f 100644 --- a/tests/inference_tests/test_synthesizer.py +++ b/tests/inference_tests/test_synthesizer.py @@ -20,7 +20,7 @@ class SynthesizerTest(unittest.TestCase): def test_in_out(self): self._create_random_model() tts_root_path = get_tests_output_path() - tts_checkpoint = os.path.join(tts_root_path, "checkpoint_10.pth.tar") + tts_checkpoint = os.path.join(tts_root_path, "checkpoint_10.pth") tts_config = os.path.join(tts_root_path, "dummy_model_config.json") synthesizer = Synthesizer(tts_checkpoint, tts_config, None, None) synthesizer.tts("Better this test works!!") diff --git a/tests/inputs/server_config.json b/tests/inputs/server_config.json index 0cb9b948..f0a92283 100644 --- a/tests/inputs/server_config.json +++ b/tests/inputs/server_config.json @@ -1,5 +1,5 @@ { - "tts_checkpoint":"checkpoint_10.pth.tar", // tts checkpoint file + "tts_checkpoint":"checkpoint_10.pth", // tts checkpoint file "tts_config":"dummy_model_config.json", // tts config.json file "tts_speakers": null, // json file listing speaker ids. null if no speaker embedding. "wavernn_lib_path": null, // Rootpath to wavernn project folder to be imported. If this is null, model uses GL for speech synthesis. 
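
The patch above retires the legacy `.pth.tar` checkpoint suffix in favour of plain `.pth` across the code, docs, notebooks, server config, and tests. As a rough, hypothetical illustration only (not part of any patch in this series), run folders produced before this change could be migrated with a small rename pass; the folder path below is a placeholder:

```python
# Hypothetical migration helper, not part of the patch series: rename legacy checkpoints in place.
import glob
import os

run_dir = "path/to/old_run_folder"  # placeholder
for old_path in glob.glob(os.path.join(run_dir, "**", "*.pth.tar"), recursive=True):
    new_path = old_path[: -len(".tar")]  # e.g. checkpoint_1000.pth.tar -> checkpoint_1000.pth
    if not os.path.exists(new_path):
        os.rename(old_path, new_path)
        print(f"renamed {os.path.basename(old_path)} -> {os.path.basename(new_path)}")
```

Note that the `_find_files` helper added above still accepts both `model_file.pth` and `model_file.pth.tar`, so model folders downloaded by `ModelManager` do not strictly need renaming.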
From 1c3623af337a61467d3a139a500db247cc8dc755 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Wed, 23 Mar 2022 12:57:14 +0100 Subject: [PATCH 21/38] Fix model manager (#1436) * Fix manager * Make style --- TTS/tts/utils/text/phonemizers/__init__.py | 1 + TTS/utils/audio.py | 4 ++- TTS/utils/manage.py | 32 +++++++++++++------ recipes/ljspeech/align_tts/train_aligntts.py | 7 +++- .../ljspeech/fast_pitch/train_fast_pitch.py | 7 +++- .../ljspeech/fast_speech/train_fast_speech.py | 7 +++- recipes/ljspeech/glow_tts/train_glowtts.py | 7 +++- .../speedy_speech/train_speedy_speech.py | 7 +++- .../tacotron2-DCA/train_tacotron_dca.py | 7 +++- .../tacotron2-DDC/train_tacotron_ddc.py | 7 +++- recipes/ljspeech/vits_tts/train_vits.py | 7 +++- .../multilingual/vits_tts/train_vits_tts.py | 7 +++- recipes/vctk/fast_pitch/train_fast_pitch.py | 7 +++- recipes/vctk/fast_speech/train_fast_speech.py | 7 +++- recipes/vctk/glow_tts/train_glow_tts.py | 7 +++- .../vctk/speedy_speech/train_speedy_speech.py | 7 +++- .../vctk/tacotron-DDC/train_tacotron-DDC.py | 7 +++- .../vctk/tacotron2-DDC/train_tacotron2-ddc.py | 7 +++- recipes/vctk/tacotron2/train_tacotron2.py | 7 +++- recipes/vctk/vits/train_vits.py | 7 +++- 20 files changed, 129 insertions(+), 27 deletions(-) diff --git a/TTS/tts/utils/text/phonemizers/__init__.py b/TTS/tts/utils/text/phonemizers/__init__.py index 90a526a7..374d0c8a 100644 --- a/TTS/tts/utils/text/phonemizers/__init__.py +++ b/TTS/tts/utils/text/phonemizers/__init__.py @@ -27,6 +27,7 @@ DEF_LANG_TO_PHONEMIZER["en"] = DEF_LANG_TO_PHONEMIZER["en-us"] DEF_LANG_TO_PHONEMIZER["ja-jp"] = JA_JP_Phonemizer.name() DEF_LANG_TO_PHONEMIZER["zh-cn"] = ZH_CN_Phonemizer.name() + def get_phonemizer_by_name(name: str, **kwargs) -> BasePhonemizer: """Initiate a phonemizer by name diff --git a/TTS/utils/audio.py b/TTS/utils/audio.py index 3ed0a76a..4d435162 100644 --- a/TTS/utils/audio.py +++ b/TTS/utils/audio.py @@ -371,7 +371,9 @@ class AudioProcessor(object): self.hop_length = hop_length self.win_length = win_length assert min_level_db != 0.0, " [!] min_level_db is 0" - assert self.win_length <= self.fft_size, f" [!] win_length cannot be larger than fft_size - {self.win_length} vs {self.fft_size}" + assert ( + self.win_length <= self.fft_size + ), f" [!] 
win_length cannot be larger than fft_size - {self.win_length} vs {self.fft_size}" members = vars(self) if verbose: print(" > Setting up Audio Processor...") diff --git a/TTS/utils/manage.py b/TTS/utils/manage.py index dd397687..674d5a47 100644 --- a/TTS/utils/manage.py +++ b/TTS/utils/manage.py @@ -3,8 +3,8 @@ import json import os import zipfile from pathlib import Path -from typing import Tuple from shutil import copyfile, rmtree +from typing import Tuple import requests @@ -128,9 +128,6 @@ class ModelManager(object): model_item = self.models_dict[model_type][lang][dataset][model] # set the model specific output path output_path = os.path.join(self.output_prefix, model_full_name) - output_model_path = os.path.join(output_path, "model_file.pth") - output_config_path = os.path.join(output_path, "config.json") - if os.path.exists(output_path): print(f" > {model_name} is already downloaded.") else: @@ -138,13 +135,14 @@ class ModelManager(object): print(f" > Downloading model to {output_path}") # download from github release self._download_zip_file(model_item["github_rls_url"], output_path) - # update paths in the config.json - self._update_paths(output_path, output_config_path) # find downloaded files output_model_path, output_config_path = self._find_files(output_path) + # update paths in the config.json + self._update_paths(output_path, output_config_path) return output_model_path, output_config_path, model_item - def _find_files(self, output_path:str) -> Tuple[str, str]: + @staticmethod + def _find_files(output_path: str) -> Tuple[str, str]: """Find the model and config files in the output path Args: @@ -166,6 +164,22 @@ class ModelManager(object): raise ValueError(" [!] Config file not found in the output path") return model_file, config_file + @staticmethod + def _find_speaker_encoder(output_path: str) -> str: + """Find the speaker encoder file in the output path + + Args: + output_path (str): path to the model files + + Returns: + str: path to the speaker encoder file + """ + speaker_encoder_file = None + for file_name in os.listdir(output_path): + if file_name in ["model_se.pth", "model_se.pth.tar"]: + speaker_encoder_file = os.path.join(output_path, file_name) + return speaker_encoder_file + def _update_paths(self, output_path: str, config_path: str) -> None: """Update paths for certain files in config.json after download. 
@@ -177,7 +191,7 @@ class ModelManager(object): output_d_vector_file_path = os.path.join(output_path, "speakers.json") output_speaker_ids_file_path = os.path.join(output_path, "speaker_ids.json") speaker_encoder_config_path = os.path.join(output_path, "config_se.json") - speaker_encoder_model_path = os.path.join(output_path, "model_se.pth") + speaker_encoder_model_path = self._find_speaker_encoder(output_path) # update the scale_path.npy file path in the model config.json self._update_path("audio.stats_path", output_stats_path, config_path) @@ -199,7 +213,7 @@ class ModelManager(object): @staticmethod def _update_path(field_name, new_path, config_path): """Update the path in the model config.json for the current environment after download""" - if os.path.exists(new_path): + if new_path and os.path.exists(new_path): config = load_config(config_path) field_names = field_name.split(".") if len(field_names) > 1: diff --git a/recipes/ljspeech/align_tts/train_aligntts.py b/recipes/ljspeech/align_tts/train_aligntts.py index d27d0fa1..591b1509 100644 --- a/recipes/ljspeech/align_tts/train_aligntts.py +++ b/recipes/ljspeech/align_tts/train_aligntts.py @@ -49,7 +49,12 @@ tokenizer, config = TTSTokenizer.init_from_config(config) # You can define your custom sample loader returning the list of samples. # Or define your custom formatter and pass it to the `load_tts_samples`. # Check `TTS.tts.datasets.load_tts_samples` for more details. -train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True, eval_split_max_size=config.eval_split_max_size, eval_split_size=config.eval_split_size) +train_samples, eval_samples = load_tts_samples( + dataset_config, + eval_split=True, + eval_split_max_size=config.eval_split_max_size, + eval_split_size=config.eval_split_size, +) # init model model = AlignTTS(config, ap, tokenizer) diff --git a/recipes/ljspeech/fast_pitch/train_fast_pitch.py b/recipes/ljspeech/fast_pitch/train_fast_pitch.py index 1f10ef07..a84658f3 100644 --- a/recipes/ljspeech/fast_pitch/train_fast_pitch.py +++ b/recipes/ljspeech/fast_pitch/train_fast_pitch.py @@ -84,7 +84,12 @@ tokenizer, config = TTSTokenizer.init_from_config(config) # You can define your custom sample loader returning the list of samples. # Or define your custom formatter and pass it to the `load_tts_samples`. # Check `TTS.tts.datasets.load_tts_samples` for more details. -train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True, eval_split_max_size=config.eval_split_max_size, eval_split_size=config.eval_split_size) +train_samples, eval_samples = load_tts_samples( + dataset_config, + eval_split=True, + eval_split_max_size=config.eval_split_max_size, + eval_split_size=config.eval_split_size, +) # init the model model = ForwardTTS(config, ap, tokenizer, speaker_manager=None) diff --git a/recipes/ljspeech/fast_speech/train_fast_speech.py b/recipes/ljspeech/fast_speech/train_fast_speech.py index e5a601a7..0245dd93 100644 --- a/recipes/ljspeech/fast_speech/train_fast_speech.py +++ b/recipes/ljspeech/fast_speech/train_fast_speech.py @@ -83,7 +83,12 @@ tokenizer, config = TTSTokenizer.init_from_config(config) # You can define your custom sample loader returning the list of samples. # Or define your custom formatter and pass it to the `load_tts_samples`. # Check `TTS.tts.datasets.load_tts_samples` for more details. 
-train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True, eval_split_max_size=config.eval_split_max_size, eval_split_size=config.eval_split_size) +train_samples, eval_samples = load_tts_samples( + dataset_config, + eval_split=True, + eval_split_max_size=config.eval_split_max_size, + eval_split_size=config.eval_split_size, +) # init the model model = ForwardTTS(config, ap, tokenizer) diff --git a/recipes/ljspeech/glow_tts/train_glowtts.py b/recipes/ljspeech/glow_tts/train_glowtts.py index 47d03fe3..a0b4ac48 100644 --- a/recipes/ljspeech/glow_tts/train_glowtts.py +++ b/recipes/ljspeech/glow_tts/train_glowtts.py @@ -60,7 +60,12 @@ tokenizer, config = TTSTokenizer.init_from_config(config) # You can define your custom sample loader returning the list of samples. # Or define your custom formatter and pass it to the `load_tts_samples`. # Check `TTS.tts.datasets.load_tts_samples` for more details. -train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True, eval_split_max_size=config.eval_split_max_size, eval_split_size=config.eval_split_size) +train_samples, eval_samples = load_tts_samples( + dataset_config, + eval_split=True, + eval_split_max_size=config.eval_split_max_size, + eval_split_size=config.eval_split_size, +) # INITIALIZE THE MODEL # Models take a config object and a speaker manager as input diff --git a/recipes/ljspeech/speedy_speech/train_speedy_speech.py b/recipes/ljspeech/speedy_speech/train_speedy_speech.py index a19e9053..1ab3db1c 100644 --- a/recipes/ljspeech/speedy_speech/train_speedy_speech.py +++ b/recipes/ljspeech/speedy_speech/train_speedy_speech.py @@ -67,7 +67,12 @@ tokenizer, config = TTSTokenizer.init_from_config(config) # You can define your custom sample loader returning the list of samples. # Or define your custom formatter and pass it to the `load_tts_samples`. # Check `TTS.tts.datasets.load_tts_samples` for more details. -train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True, eval_split_max_size=config.eval_split_max_size, eval_split_size=config.eval_split_size) +train_samples, eval_samples = load_tts_samples( + dataset_config, + eval_split=True, + eval_split_max_size=config.eval_split_max_size, + eval_split_size=config.eval_split_size, +) # init model model = ForwardTTS(config, ap, tokenizer) diff --git a/recipes/ljspeech/tacotron2-DCA/train_tacotron_dca.py b/recipes/ljspeech/tacotron2-DCA/train_tacotron_dca.py index 19a9f315..a9f253ea 100644 --- a/recipes/ljspeech/tacotron2-DCA/train_tacotron_dca.py +++ b/recipes/ljspeech/tacotron2-DCA/train_tacotron_dca.py @@ -77,7 +77,12 @@ tokenizer, config = TTSTokenizer.init_from_config(config) # You can define your custom sample loader returning the list of samples. # Or define your custom formatter and pass it to the `load_tts_samples`. # Check `TTS.tts.datasets.load_tts_samples` for more details. 
-train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True, eval_split_max_size=config.eval_split_max_size, eval_split_size=config.eval_split_size) +train_samples, eval_samples = load_tts_samples( + dataset_config, + eval_split=True, + eval_split_max_size=config.eval_split_max_size, + eval_split_size=config.eval_split_size, +) # INITIALIZE THE MODEL # Models take a config object and a speaker manager as input diff --git a/recipes/ljspeech/tacotron2-DDC/train_tacotron_ddc.py b/recipes/ljspeech/tacotron2-DDC/train_tacotron_ddc.py index 029698d8..99089db8 100644 --- a/recipes/ljspeech/tacotron2-DDC/train_tacotron_ddc.py +++ b/recipes/ljspeech/tacotron2-DDC/train_tacotron_ddc.py @@ -74,7 +74,12 @@ tokenizer, config = TTSTokenizer.init_from_config(config) # You can define your custom sample loader returning the list of samples. # Or define your custom formatter and pass it to the `load_tts_samples`. # Check `TTS.tts.datasets.load_tts_samples` for more details. -train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True, eval_split_max_size=config.eval_split_max_size, eval_split_size=config.eval_split_size) +train_samples, eval_samples = load_tts_samples( + dataset_config, + eval_split=True, + eval_split_max_size=config.eval_split_max_size, + eval_split_size=config.eval_split_size, +) # INITIALIZE THE MODEL # Models take a config object and a speaker manager as input diff --git a/recipes/ljspeech/vits_tts/train_vits.py b/recipes/ljspeech/vits_tts/train_vits.py index e38dc200..c070b3f1 100644 --- a/recipes/ljspeech/vits_tts/train_vits.py +++ b/recipes/ljspeech/vits_tts/train_vits.py @@ -69,7 +69,12 @@ tokenizer, config = TTSTokenizer.init_from_config(config) # You can define your custom sample loader returning the list of samples. # Or define your custom formatter and pass it to the `load_tts_samples`. # Check `TTS.tts.datasets.load_tts_samples` for more details. 
-train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True, eval_split_max_size=config.eval_split_max_size, eval_split_size=config.eval_split_size) +train_samples, eval_samples = load_tts_samples( + dataset_config, + eval_split=True, + eval_split_max_size=config.eval_split_max_size, + eval_split_size=config.eval_split_size, +) # init model model = Vits(config, ap, tokenizer, speaker_manager=None) diff --git a/recipes/multilingual/vits_tts/train_vits_tts.py b/recipes/multilingual/vits_tts/train_vits_tts.py index 9e0cb4c8..94692f00 100644 --- a/recipes/multilingual/vits_tts/train_vits_tts.py +++ b/recipes/multilingual/vits_tts/train_vits_tts.py @@ -109,7 +109,12 @@ config.from_dict(config.to_dict()) ap = AudioProcessor(**config.audio.to_dict()) # load training samples -train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True, eval_split_max_size=config.eval_split_max_size, eval_split_size=config.eval_split_size) +train_samples, eval_samples = load_tts_samples( + dataset_config, + eval_split=True, + eval_split_max_size=config.eval_split_max_size, + eval_split_size=config.eval_split_size, +) # init speaker manager for multi-speaker training # it maps speaker-id to speaker-name in the model and data-loader diff --git a/recipes/vctk/fast_pitch/train_fast_pitch.py b/recipes/vctk/fast_pitch/train_fast_pitch.py index d066a539..05cdc72a 100644 --- a/recipes/vctk/fast_pitch/train_fast_pitch.py +++ b/recipes/vctk/fast_pitch/train_fast_pitch.py @@ -71,7 +71,12 @@ tokenizer, config = TTSTokenizer.init_from_config(config) # You can define your custom sample loader returning the list of samples. # Or define your custom formatter and pass it to the `load_tts_samples`. # Check `TTS.tts.datasets.load_tts_samples` for more details. -train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True, eval_split_max_size=config.eval_split_max_size, eval_split_size=config.eval_split_size) +train_samples, eval_samples = load_tts_samples( + dataset_config, + eval_split=True, + eval_split_max_size=config.eval_split_max_size, + eval_split_size=config.eval_split_size, +) # init speaker manager for multi-speaker training # it maps speaker-id to speaker-name in the model and data-loader diff --git a/recipes/vctk/fast_speech/train_fast_speech.py b/recipes/vctk/fast_speech/train_fast_speech.py index dbe23351..a294272a 100644 --- a/recipes/vctk/fast_speech/train_fast_speech.py +++ b/recipes/vctk/fast_speech/train_fast_speech.py @@ -69,7 +69,12 @@ tokenizer, config = TTSTokenizer.init_from_config(config) # You can define your custom sample loader returning the list of samples. # Or define your custom formatter and pass it to the `load_tts_samples`. # Check `TTS.tts.datasets.load_tts_samples` for more details. 
-train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True, eval_split_max_size=config.eval_split_max_size, eval_split_size=config.eval_split_size) +train_samples, eval_samples = load_tts_samples( + dataset_config, + eval_split=True, + eval_split_max_size=config.eval_split_max_size, + eval_split_size=config.eval_split_size, +) # init speaker manager for multi-speaker training # it maps speaker-id to speaker-name in the model and data-loader diff --git a/recipes/vctk/glow_tts/train_glow_tts.py b/recipes/vctk/glow_tts/train_glow_tts.py index 8a891e5d..0bf686b1 100644 --- a/recipes/vctk/glow_tts/train_glow_tts.py +++ b/recipes/vctk/glow_tts/train_glow_tts.py @@ -69,7 +69,12 @@ tokenizer, config = TTSTokenizer.init_from_config(config) # You can define your custom sample loader returning the list of samples. # Or define your custom formatter and pass it to the `load_tts_samples`. # Check `TTS.tts.datasets.load_tts_samples` for more details. -train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True, eval_split_max_size=config.eval_split_max_size, eval_split_size=config.eval_split_size) +train_samples, eval_samples = load_tts_samples( + dataset_config, + eval_split=True, + eval_split_max_size=config.eval_split_max_size, + eval_split_size=config.eval_split_size, +) # init speaker manager for multi-speaker training # it maps speaker-id to speaker-name in the model and data-loader diff --git a/recipes/vctk/speedy_speech/train_speedy_speech.py b/recipes/vctk/speedy_speech/train_speedy_speech.py index d9353af2..4208a9b6 100644 --- a/recipes/vctk/speedy_speech/train_speedy_speech.py +++ b/recipes/vctk/speedy_speech/train_speedy_speech.py @@ -69,7 +69,12 @@ tokenizer, config = TTSTokenizer.init_from_config(config) # You can define your custom sample loader returning the list of samples. # Or define your custom formatter and pass it to the `load_tts_samples`. # Check `TTS.tts.datasets.load_tts_samples` for more details. -train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True, eval_split_max_size=config.eval_split_max_size, eval_split_size=config.eval_split_size) +train_samples, eval_samples = load_tts_samples( + dataset_config, + eval_split=True, + eval_split_max_size=config.eval_split_max_size, + eval_split_size=config.eval_split_size, +) # init speaker manager for multi-speaker training # it maps speaker-id to speaker-name in the model and data-loader diff --git a/recipes/vctk/tacotron-DDC/train_tacotron-DDC.py b/recipes/vctk/tacotron-DDC/train_tacotron-DDC.py index 14007239..d67038a4 100644 --- a/recipes/vctk/tacotron-DDC/train_tacotron-DDC.py +++ b/recipes/vctk/tacotron-DDC/train_tacotron-DDC.py @@ -72,7 +72,12 @@ tokenizer, config = TTSTokenizer.init_from_config(config) # You can define your custom sample loader returning the list of samples. # Or define your custom formatter and pass it to the `load_tts_samples`. # Check `TTS.tts.datasets.load_tts_samples` for more details. 
-train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True, eval_split_max_size=config.eval_split_max_size, eval_split_size=config.eval_split_size) +train_samples, eval_samples = load_tts_samples( + dataset_config, + eval_split=True, + eval_split_max_size=config.eval_split_max_size, + eval_split_size=config.eval_split_size, +) # init speaker manager for multi-speaker training # it mainly handles speaker-id to speaker-name for the model and the data-loader diff --git a/recipes/vctk/tacotron2-DDC/train_tacotron2-ddc.py b/recipes/vctk/tacotron2-DDC/train_tacotron2-ddc.py index ab2e1bc9..b860df85 100644 --- a/recipes/vctk/tacotron2-DDC/train_tacotron2-ddc.py +++ b/recipes/vctk/tacotron2-DDC/train_tacotron2-ddc.py @@ -78,7 +78,12 @@ tokenizer, config = TTSTokenizer.init_from_config(config) # You can define your custom sample loader returning the list of samples. # Or define your custom formatter and pass it to the `load_tts_samples`. # Check `TTS.tts.datasets.load_tts_samples` for more details. -train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True, eval_split_max_size=config.eval_split_max_size, eval_split_size=config.eval_split_size) +train_samples, eval_samples = load_tts_samples( + dataset_config, + eval_split=True, + eval_split_max_size=config.eval_split_max_size, + eval_split_size=config.eval_split_size, +) # init speaker manager for multi-speaker training # it mainly handles speaker-id to speaker-name for the model and the data-loader diff --git a/recipes/vctk/tacotron2/train_tacotron2.py b/recipes/vctk/tacotron2/train_tacotron2.py index 48934e2a..d27dd78c 100644 --- a/recipes/vctk/tacotron2/train_tacotron2.py +++ b/recipes/vctk/tacotron2/train_tacotron2.py @@ -78,7 +78,12 @@ tokenizer, config = TTSTokenizer.init_from_config(config) # You can define your custom sample loader returning the list of samples. # Or define your custom formatter and pass it to the `load_tts_samples`. # Check `TTS.tts.datasets.load_tts_samples` for more details. -train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True, eval_split_max_size=config.eval_split_max_size, eval_split_size=config.eval_split_size) +train_samples, eval_samples = load_tts_samples( + dataset_config, + eval_split=True, + eval_split_max_size=config.eval_split_max_size, + eval_split_size=config.eval_split_size, +) # init speaker manager for multi-speaker training # it mainly handles speaker-id to speaker-name for the model and the data-loader diff --git a/recipes/vctk/vits/train_vits.py b/recipes/vctk/vits/train_vits.py index 443dbbd1..61d60ca1 100644 --- a/recipes/vctk/vits/train_vits.py +++ b/recipes/vctk/vits/train_vits.py @@ -79,7 +79,12 @@ tokenizer, config = TTSTokenizer.init_from_config(config) # You can define your custom sample loader returning the list of samples. # Or define your custom formatter and pass it to the `load_tts_samples`. # Check `TTS.tts.datasets.load_tts_samples` for more details. 
-train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True, eval_split_max_size=config.eval_split_max_size, eval_split_size=config.eval_split_size) +train_samples, eval_samples = load_tts_samples( + dataset_config, + eval_split=True, + eval_split_max_size=config.eval_split_max_size, + eval_split_size=config.eval_split_size, +) # init speaker manager for multi-speaker training # it maps speaker-id to speaker-name in the model and data-loader From 3c7c14607b0678dc45871d2ec6e5442595983429 Mon Sep 17 00:00:00 2001 From: WeberJulian Date: Wed, 23 Mar 2022 17:23:36 +0100 Subject: [PATCH 22/38] Add formatting tests (#1437) * Add style checks to `make lint` * Bump target-version in black config --- Makefile | 2 ++ pyproject.toml | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index d04cd976..69f34c79 100644 --- a/Makefile +++ b/Makefile @@ -44,6 +44,8 @@ style: ## update code style. lint: ## run pylint linter. pylint ${target_dirs} + black ${target_dirs} --check + isort ${target_dirs} --check-only system-deps: ## install linux system deps sudo apt-get install -y libsndfile1-dev diff --git a/pyproject.toml b/pyproject.toml index 0941a906..b775f12a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,7 +6,7 @@ max-line-length=120 [tool.black] line-length = 120 -target-version = ['py38'] +target-version = ['py39'] exclude = ''' ( From 3af01cfe3b5b59281790f158494f3c11f9e7255c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Wed, 23 Mar 2022 17:24:20 +0100 Subject: [PATCH 23/38] =?UTF-8?q?Update=20base=20model=20wrt=20?= =?UTF-8?q?=F0=9F=91=9F=20(#1406)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- TTS/model.py | 142 +++++---------------------------------------------- 1 file changed, 14 insertions(+), 128 deletions(-) diff --git a/TTS/model.py b/TTS/model.py index 39cbeabc..a53b916a 100644 --- a/TTS/model.py +++ b/TTS/model.py @@ -1,46 +1,34 @@ -from abc import ABC, abstractmethod -from typing import Dict, List, Tuple +from abc import abstractmethod +from typing import Dict import torch from coqpit import Coqpit -from torch import nn +from trainer import TrainerModel # pylint: skip-file -class BaseTrainerModel(ABC, nn.Module): - """Abstract 🐸TTS class. Every new 🐸TTS model must inherit this.""" +class BaseTrainerModel(TrainerModel): + """BaseTrainerModel model expanding TrainerModel with required functions by 🐸TTS. + + Every new 🐸TTS model must inherit it. + """ @staticmethod @abstractmethod def init_from_config(config: Coqpit): - """Init the model from given config. + """Init the model and all its attributes from the given config. Override this depending on your model. """ ... - @abstractmethod - def forward(self, input: torch.Tensor, *args, aux_input={}, **kwargs) -> Dict: - """Forward ... for the model mainly used in training. - - You can be flexible here and use different number of arguments and argument names since it is intended to be - used by `train_step()` without exposing it out of the model. - - Args: - input (torch.Tensor): Input tensor. - aux_input (Dict): Auxiliary model inputs like embeddings, durations or any other sorts of inputs. - - Returns: - Dict: Model outputs. Main model output must be named as "model_outputs". - """ - outputs_dict = {"model_outputs": None} - ... - return outputs_dict - @abstractmethod def inference(self, input: torch.Tensor, aux_input={}) -> Dict: - """Forward ... for inference. + """Forward pass for inference. 
+ + It must return a dictionary with the main model output and all the auxiliary outputs. The key ```model_outputs``` + is considered to be the main output and you can add any other auxiliary outputs as you want. We don't use `*kwargs` since it is problematic with the TorchScript API. @@ -55,78 +43,9 @@ class BaseTrainerModel(ABC, nn.Module): ... return outputs_dict - def format_batch(self, batch: Dict) -> Dict: - """Format batch returned by the data loader before sending it to the model. - - If not implemented, model uses the batch as is. - Can be used for data augmentation, feature ectraction, etc. - """ - return batch - - def format_batch_on_device(self, batch: Dict) -> Dict: - """Format batch on device before sending it to the model. - - If not implemented, model uses the batch as is. - Can be used for data augmentation, feature ectraction, etc. - """ - return batch - - @abstractmethod - def train_step(self, batch: Dict, criterion: nn.Module) -> Tuple[Dict, Dict]: - """Perform a single training step. Run the model forward ... and compute losses. - - Args: - batch (Dict): Input tensors. - criterion (nn.Module): Loss layer designed for the model. - - Returns: - Tuple[Dict, Dict]: Model ouputs and computed losses. - """ - outputs_dict = {} - loss_dict = {} # this returns from the criterion - ... - return outputs_dict, loss_dict - - def train_log(self, batch: Dict, outputs: Dict, logger: "Logger", assets: Dict, steps: int) -> None: - """Create visualizations and waveform examples for training. - - For example, here you can plot spectrograms and generate sample sample waveforms from these spectrograms to - be projected onto Tensorboard. - - Args: - ap (AudioProcessor): audio processor used at training. - batch (Dict): Model inputs used at the previous training step. - outputs (Dict): Model outputs generated at the previoud training step. - - Returns: - Tuple[Dict, np.ndarray]: training plots and output waveform. - """ - ... - - @abstractmethod - def eval_step(self, batch: Dict, criterion: nn.Module) -> Tuple[Dict, Dict]: - """Perform a single evaluation step. Run the model forward ... and compute losses. In most cases, you can - call `train_step()` with no changes. - - Args: - batch (Dict): Input tensors. - criterion (nn.Module): Loss layer designed for the model. - - Returns: - Tuple[Dict, Dict]: Model ouputs and computed losses. - """ - outputs_dict = {} - loss_dict = {} # this returns from the criterion - ... - return outputs_dict, loss_dict - - def eval_log(self, batch: Dict, outputs: Dict, logger: "Logger", assets: Dict, steps: int) -> None: - """The same as `train_log()`""" - ... - @abstractmethod def load_checkpoint(self, config: Coqpit, checkpoint_path: str, eval: bool = False, strict: bool = True) -> None: - """Load a checkpoint and get ready for training or inference. + """Load a model checkpoint gile and get ready for training or inference. Args: config (Coqpit): Model configuration. @@ -135,36 +54,3 @@ class BaseTrainerModel(ABC, nn.Module): strcit (bool, optional): Match all checkpoint keys to model's keys. Defaults to True. """ ... - - @staticmethod - @abstractmethod - def init_from_config(config: Coqpit, samples: List[Dict] = None, verbose=False) -> "BaseTrainerModel": - """Init the model from given config. - - Override this depending on your model. - """ - ... - - @abstractmethod - def get_data_loader( - self, config: Coqpit, assets: Dict, is_eval: True, data_items: List, verbose: bool, num_gpus: int - ): - ... 
- - # def get_optimizer(self) -> Union["Optimizer", List["Optimizer"]]: - # """Setup an return optimizer or optimizers.""" - # ... - - # def get_lr(self) -> Union[float, List[float]]: - # """Return learning rate(s). - - # Returns: - # Union[float, List[float]]: Model's initial learning rates. - # """ - # ... - - # def get_scheduler(self, optimizer: torch.optim.Optimizer): - # ... - - # def get_criterion(self): - # ... From ea53d6feb3169962bccbbc01b867f8a3bf645e9b Mon Sep 17 00:00:00 2001 From: Edresson Casanova Date: Tue, 22 Mar 2022 13:53:40 -0300 Subject: [PATCH 24/38] Replace webrtcvad by silero-vad --- TTS/bin/remove_silence_using_vad.py | 75 +++++------- TTS/utils/vad.py | 181 +++++++++------------------- requirements.txt | 2 - 3 files changed, 86 insertions(+), 172 deletions(-) diff --git a/TTS/bin/remove_silence_using_vad.py b/TTS/bin/remove_silence_using_vad.py index 9070f2da..a8a60bf8 100755 --- a/TTS/bin/remove_silence_using_vad.py +++ b/TTS/bin/remove_silence_using_vad.py @@ -1,51 +1,24 @@ import argparse import glob -import multiprocessing import os import pathlib -from tqdm.contrib.concurrent import process_map - -from TTS.utils.vad import get_vad_speech_segments, read_wave, write_wave +from tqdm import tqdm +from TTS.utils.vad import get_vad_model_and_utils, remove_silence -def remove_silence(filepath): - output_path = filepath.replace(os.path.join(args.input_dir, ""), os.path.join(args.output_dir, "")) +def adjust_path_and_remove_silence(audio_path): + output_path = audio_path.replace(os.path.join(args.input_dir, ""), os.path.join(args.output_dir, "")) # ignore if the file exists if os.path.exists(output_path) and not args.force: - return + return output_path # create all directory structure pathlib.Path(output_path).parent.mkdir(parents=True, exist_ok=True) - # load wave - audio, sample_rate = read_wave(filepath) + # remove the silence and save the audio + output_path = remove_silence(model_and_utils, audio_path, output_path, trim_just_beginning_and_end=args.trim_just_beginning_and_end, use_cuda=args.use_cuda) - # get speech segments - segments = get_vad_speech_segments(audio, sample_rate, aggressiveness=args.aggressiveness) - - segments = list(segments) - num_segments = len(segments) - flag = False - # create the output wave - if num_segments != 0: - for i, segment in reversed(list(enumerate(segments))): - if i >= 1: - if not flag: - concat_segment = segment - flag = True - else: - concat_segment = segment + concat_segment - else: - if flag: - segment = segment + concat_segment - # print("Saving: ", output_path) - write_wave(output_path, segment, sample_rate) - return - else: - print("> Just Copying the file to:", output_path) - # if fail to remove silence just write the file - write_wave(output_path, audio, sample_rate) - return + return output_path def preprocess_audios(): @@ -54,17 +27,24 @@ def preprocess_audios(): if not args.force: print("> Ignoring files that already exist in the output directory.") + if args.trim_just_beginning_and_end: + print("> Trimming just the beginning and the end with nonspeech parts.") + else: + print("> Trimming all nonspeech parts.") + if files: # create threads - num_threads = multiprocessing.cpu_count() - process_map(remove_silence, files, max_workers=num_threads, chunksize=15) + # num_threads = multiprocessing.cpu_count() + # process_map(adjust_path_and_remove_silence, files, max_workers=num_threads, chunksize=15) + for f in tqdm(files): + adjust_path_and_remove_silence(f) else: print("> No files Found !") if __name__ == "__main__": 
parser = argparse.ArgumentParser( - description="python remove_silence.py -i=VCTK-Corpus-bk/ -o=../VCTK-Corpus-removed-silence -g=wav48/*/*.wav -a=2" + description="python TTS/bin/remove_silence_using_vad.py -i=VCTK-Corpus/ -o=VCTK-Corpus-removed-silence/ -g=wav48_silence_trimmed/*/*_mic1.flac --trim_just_beginning_and_end True" ) parser.add_argument("-i", "--input_dir", type=str, default="../VCTK-Corpus", help="Dataset root dir") parser.add_argument( @@ -79,11 +59,20 @@ if __name__ == "__main__": help="path in glob format for acess wavs from input_dir. ex: wav48/*/*.wav", ) parser.add_argument( - "-a", - "--aggressiveness", - type=int, - default=2, - help="set its aggressiveness mode, which is an integer between 0 and 3. 0 is the least aggressive about filtering out non-speech, 3 is the most aggressive.", + "-t", + "--trim_just_beginning_and_end", + type=bool, + default=True, + help="If True this script will trim just the beginning and end nonspeech parts. If False all nonspeech parts will be trim. Default True", + ) + parser.add_argument( + "-c", + "--use_cuda", + type=bool, + default=False, + help="If True use cuda", ) args = parser.parse_args() + # load the model and utils + model_and_utils = get_vad_model_and_utils(use_cuda=args.use_cuda) preprocess_audios() diff --git a/TTS/utils/vad.py b/TTS/utils/vad.py index 923544d0..88790202 100644 --- a/TTS/utils/vad.py +++ b/TTS/utils/vad.py @@ -1,144 +1,71 @@ -# This code is adpated from: https://github.com/wiseman/py-webrtcvad/blob/master/example.py -import collections -import contextlib -import wave +import torch +import torchaudio -import webrtcvad +def read_audio(path): + wav, sr = torchaudio.load(path) + if wav.size(0) > 1: + wav = wav.mean(dim=0, keepdim=True) -def read_wave(path): - """Reads a .wav file. + return wav.squeeze(0), sr - Takes the path, and returns (PCM audio data, sample rate). - """ - with contextlib.closing(wave.open(path, "rb")) as wf: - num_channels = wf.getnchannels() - assert num_channels == 1 - sample_width = wf.getsampwidth() - assert sample_width == 2 - sample_rate = wf.getframerate() - assert sample_rate in (8000, 16000, 32000, 48000) - pcm_data = wf.readframes(wf.getnframes()) - return pcm_data, sample_rate +def resample_wav(wav, sr, new_sr): + wav = wav.unsqueeze(0) + transform = torchaudio.transforms.Resample(orig_freq=sr, new_freq=new_sr) + wav = transform(wav) + return wav.squeeze(0) +def map_timestamps_to_new_sr(vad_sr, new_sr, timestamps, just_begging_end=False): + factor = new_sr / vad_sr + new_timestamps = [] + if just_begging_end: + # get just the start and end timestamps + new_dict = {'start': int(timestamps[0]['start']*factor), 'end': int(timestamps[-1]['end']*factor)} + new_timestamps.append(new_dict) + else: + for ts in timestamps: + # map to the new SR + new_dict = {'start': int(ts['start']*factor), 'end': int(ts['end']*factor)} + new_timestamps.append(new_dict) -def write_wave(path, audio, sample_rate): - """Writes a .wav file. + return new_timestamps - Takes path, PCM audio data, and sample rate. 
- """ - with contextlib.closing(wave.open(path, "wb")) as wf: - wf.setnchannels(1) - wf.setsampwidth(2) - wf.setframerate(sample_rate) - wf.writeframes(audio) +def get_vad_model_and_utils(use_cuda=False): + model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad', + model='silero_vad', + force_reload=True, + onnx=False) + if use_cuda: + model = model.cuda() + get_speech_timestamps, save_audio, _, _, collect_chunks = utils + return model, get_speech_timestamps, save_audio, collect_chunks -class Frame(object): - """Represents a "frame" of audio data.""" +def remove_silence(model_and_utils, audio_path, out_path, vad_sample_rate=8000, trim_just_beginning_and_end=True, use_cuda=False): - def __init__(self, _bytes, timestamp, duration): - self.bytes = _bytes - self.timestamp = timestamp - self.duration = duration + # get the VAD model and utils functions + model, get_speech_timestamps, save_audio, collect_chunks = model_and_utils + # read ground truth wav and resample the audio for the VAD + wav, gt_sample_rate = read_audio(audio_path) -def frame_generator(frame_duration_ms, audio, sample_rate): - """Generates audio frames from PCM audio data. + # if needed, resample the audio for the VAD model + if gt_sample_rate != vad_sample_rate: + wav_vad = resample_wav(wav, gt_sample_rate, vad_sample_rate) + else: + wav_vad = wav - Takes the desired frame duration in milliseconds, the PCM data, and - the sample rate. + if use_cuda: + wav_vad = wav_vad.cuda() - Yields Frames of the requested duration. - """ - n = int(sample_rate * (frame_duration_ms / 1000.0) * 2) - offset = 0 - timestamp = 0.0 - duration = (float(n) / sample_rate) / 2.0 - while offset + n < len(audio): - yield Frame(audio[offset : offset + n], timestamp, duration) - timestamp += duration - offset += n + # get speech timestamps from full audio file + speech_timestamps = get_speech_timestamps(wav_vad, model, sampling_rate=vad_sample_rate, window_size_samples=768) + # map the current speech_timestamps to the sample rate of the ground truth audio + new_speech_timestamps = map_timestamps_to_new_sr(vad_sample_rate, gt_sample_rate, speech_timestamps, trim_just_beginning_and_end) -def vad_collector(sample_rate, frame_duration_ms, padding_duration_ms, vad, frames): - """Filters out non-voiced audio frames. + # save audio + save_audio(out_path, + collect_chunks(new_speech_timestamps, wav), sampling_rate=gt_sample_rate) - Given a webrtcvad.Vad and a source of audio frames, yields only - the voiced audio. - - Uses a padded, sliding window algorithm over the audio frames. - When more than 90% of the frames in the window are voiced (as - reported by the VAD), the collector triggers and begins yielding - audio frames. Then the collector waits until 90% of the frames in - the window are unvoiced to detrigger. - - The window is padded at the front and back to provide a small - amount of silence or the beginnings/endings of speech around the - voiced frames. - - Arguments: - - sample_rate - The audio sample rate, in Hz. - frame_duration_ms - The frame duration in milliseconds. - padding_duration_ms - The amount to pad the window, in milliseconds. - vad - An instance of webrtcvad.Vad. - frames - a source of audio frames (sequence or generator). - - Returns: A generator that yields PCM audio data. - """ - num_padding_frames = int(padding_duration_ms / frame_duration_ms) - # We use a deque for our sliding window/ring buffer. - ring_buffer = collections.deque(maxlen=num_padding_frames) - # We have two states: TRIGGERED and NOTTRIGGERED. 
We start in the - # NOTTRIGGERED state. - triggered = False - - voiced_frames = [] - for frame in frames: - is_speech = vad.is_speech(frame.bytes, sample_rate) - - # sys.stdout.write('1' if is_speech else '0') - if not triggered: - ring_buffer.append((frame, is_speech)) - num_voiced = len([f for f, speech in ring_buffer if speech]) - # If we're NOTTRIGGERED and more than 90% of the frames in - # the ring buffer are voiced frames, then enter the - # TRIGGERED state. - if num_voiced > 0.9 * ring_buffer.maxlen: - triggered = True - # sys.stdout.write('+(%s)' % (ring_buffer[0][0].timestamp,)) - # We want to yield all the audio we see from now until - # we are NOTTRIGGERED, but we have to start with the - # audio that's already in the ring buffer. - for f, _ in ring_buffer: - voiced_frames.append(f) - ring_buffer.clear() - else: - # We're in the TRIGGERED state, so collect the audio data - # and add it to the ring buffer. - voiced_frames.append(frame) - ring_buffer.append((frame, is_speech)) - num_unvoiced = len([f for f, speech in ring_buffer if not speech]) - # If more than 90% of the frames in the ring buffer are - # unvoiced, then enter NOTTRIGGERED and yield whatever - # audio we've collected. - if num_unvoiced > 0.9 * ring_buffer.maxlen: - # sys.stdout.write('-(%s)' % (frame.timestamp + frame.duration)) - triggered = False - yield b"".join([f.bytes for f in voiced_frames]) - ring_buffer.clear() - voiced_frames = [] - # If we have any leftover voiced audio when we run out of input, - # yield it. - if voiced_frames: - yield b"".join([f.bytes for f in voiced_frames]) - - -def get_vad_speech_segments(audio, sample_rate, aggressiveness=2, padding_duration_ms=300): - - vad = webrtcvad.Vad(int(aggressiveness)) - frames = list(frame_generator(30, audio, sample_rate)) - segments = vad_collector(sample_rate, 30, padding_duration_ms, vad, frames) - - return segments + return out_path diff --git a/requirements.txt b/requirements.txt index c3599220..f735c57a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -34,5 +34,3 @@ mecab-python3==1.0.3 unidic-lite==1.0.8 # gruut+supported langs gruut[cs,de,es,fr,it,nl,pt,ru,sv]==2.2.3 -# others -webrtcvad # for VAD From 0ae1e0248c74f3dc820798619c2b6f6537bfb339 Mon Sep 17 00:00:00 2001 From: Edresson Casanova Date: Tue, 22 Mar 2022 14:53:33 -0300 Subject: [PATCH 25/38] Fix the bug for emptly audio files --- TTS/utils/vad.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/TTS/utils/vad.py b/TTS/utils/vad.py index 88790202..7384934a 100644 --- a/TTS/utils/vad.py +++ b/TTS/utils/vad.py @@ -18,7 +18,7 @@ def resample_wav(wav, sr, new_sr): def map_timestamps_to_new_sr(vad_sr, new_sr, timestamps, just_begging_end=False): factor = new_sr / vad_sr new_timestamps = [] - if just_begging_end: + if just_begging_end and timestamps: # get just the start and end timestamps new_dict = {'start': int(timestamps[0]['start']*factor), 'end': int(timestamps[-1]['end']*factor)} new_timestamps.append(new_dict) @@ -64,8 +64,12 @@ def remove_silence(model_and_utils, audio_path, out_path, vad_sample_rate=8000, # map the current speech_timestamps to the sample rate of the ground truth audio new_speech_timestamps = map_timestamps_to_new_sr(vad_sample_rate, gt_sample_rate, speech_timestamps, trim_just_beginning_and_end) - # save audio - save_audio(out_path, - collect_chunks(new_speech_timestamps, wav), sampling_rate=gt_sample_rate) + # if have speech timestamps else save the wav + if new_speech_timestamps: + wav = collect_chunks(new_speech_timestamps, 
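
The patch above swaps the webrtcvad frame collector for silero-vad loaded through `torch.hub`. A minimal sketch of that flow, assuming a local `sample.wav` (placeholder path) and the same 8 kHz VAD rate and 768-sample window used in `TTS/utils/vad.py`:

```python
# Minimal sketch of the silero-vad flow introduced above; "sample.wav" is a placeholder.
import torch
import torchaudio

model, utils = torch.hub.load(repo_or_dir="snakers4/silero-vad", model="silero_vad", onnx=False)
get_speech_timestamps, save_audio, _, _, collect_chunks = utils

wav, sr = torchaudio.load("sample.wav")  # placeholder input
wav = wav.mean(dim=0) if wav.size(0) > 1 else wav.squeeze(0)  # downmix to mono
wav_vad = torchaudio.transforms.Resample(orig_freq=sr, new_freq=8000)(wav.unsqueeze(0)).squeeze(0)

# speech timestamps are in samples at the VAD sample rate (8 kHz here)
timestamps = get_speech_timestamps(wav_vad, model, sampling_rate=8000, window_size_samples=768)

if timestamps:
    # keep only the outermost start/end, mapped back to the original sample rate
    factor = sr / 8000
    trimmed = collect_chunks(
        [{"start": int(timestamps[0]["start"] * factor), "end": int(timestamps[-1]["end"] * factor)}], wav
    )
    save_audio("sample_trimmed.wav", trimmed, sampling_rate=sr)
else:
    print("no speech detected, keeping the file as-is")
```

Only the outermost start and end timestamps are kept here, mirroring the `trim_just_beginning_and_end=True` default of the new script.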
wav) + else: + print(f"> The file {audio_path} probably does not have speech please check it !!") + # save audio + save_audio(out_path, wav, sampling_rate=gt_sample_rate) return out_path From 3435bc8fcad433438751e14b75dd8f5e0c36ae41 Mon Sep 17 00:00:00 2001 From: Edresson Casanova Date: Wed, 23 Mar 2022 15:05:32 -0300 Subject: [PATCH 26/38] Fix style tests --- TTS/bin/remove_silence_using_vad.py | 9 ++++++++- TTS/utils/vad.py | 22 ++++++++++++++-------- 2 files changed, 22 insertions(+), 9 deletions(-) diff --git a/TTS/bin/remove_silence_using_vad.py b/TTS/bin/remove_silence_using_vad.py index a8a60bf8..7d88ae91 100755 --- a/TTS/bin/remove_silence_using_vad.py +++ b/TTS/bin/remove_silence_using_vad.py @@ -4,6 +4,7 @@ import os import pathlib from tqdm import tqdm + from TTS.utils.vad import get_vad_model_and_utils, remove_silence @@ -16,7 +17,13 @@ def adjust_path_and_remove_silence(audio_path): # create all directory structure pathlib.Path(output_path).parent.mkdir(parents=True, exist_ok=True) # remove the silence and save the audio - output_path = remove_silence(model_and_utils, audio_path, output_path, trim_just_beginning_and_end=args.trim_just_beginning_and_end, use_cuda=args.use_cuda) + output_path = remove_silence( + model_and_utils, + audio_path, + output_path, + trim_just_beginning_and_end=args.trim_just_beginning_and_end, + use_cuda=args.use_cuda, + ) return output_path diff --git a/TTS/utils/vad.py b/TTS/utils/vad.py index 7384934a..033b911a 100644 --- a/TTS/utils/vad.py +++ b/TTS/utils/vad.py @@ -1,6 +1,7 @@ import torch import torchaudio + def read_audio(path): wav, sr = torchaudio.load(path) @@ -9,39 +10,42 @@ def read_audio(path): return wav.squeeze(0), sr + def resample_wav(wav, sr, new_sr): wav = wav.unsqueeze(0) transform = torchaudio.transforms.Resample(orig_freq=sr, new_freq=new_sr) wav = transform(wav) return wav.squeeze(0) + def map_timestamps_to_new_sr(vad_sr, new_sr, timestamps, just_begging_end=False): factor = new_sr / vad_sr new_timestamps = [] if just_begging_end and timestamps: # get just the start and end timestamps - new_dict = {'start': int(timestamps[0]['start']*factor), 'end': int(timestamps[-1]['end']*factor)} + new_dict = {"start": int(timestamps[0]["start"] * factor), "end": int(timestamps[-1]["end"] * factor)} new_timestamps.append(new_dict) else: for ts in timestamps: # map to the new SR - new_dict = {'start': int(ts['start']*factor), 'end': int(ts['end']*factor)} + new_dict = {"start": int(ts["start"] * factor), "end": int(ts["end"] * factor)} new_timestamps.append(new_dict) return new_timestamps + def get_vad_model_and_utils(use_cuda=False): - model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad', - model='silero_vad', - force_reload=True, - onnx=False) + model, utils = torch.hub.load(repo_or_dir="snakers4/silero-vad", model="silero_vad", force_reload=True, onnx=False) if use_cuda: model = model.cuda() get_speech_timestamps, save_audio, _, _, collect_chunks = utils return model, get_speech_timestamps, save_audio, collect_chunks -def remove_silence(model_and_utils, audio_path, out_path, vad_sample_rate=8000, trim_just_beginning_and_end=True, use_cuda=False): + +def remove_silence( + model_and_utils, audio_path, out_path, vad_sample_rate=8000, trim_just_beginning_and_end=True, use_cuda=False +): # get the VAD model and utils functions model, get_speech_timestamps, save_audio, collect_chunks = model_and_utils @@ -62,7 +66,9 @@ def remove_silence(model_and_utils, audio_path, out_path, vad_sample_rate=8000, speech_timestamps = 
get_speech_timestamps(wav_vad, model, sampling_rate=vad_sample_rate, window_size_samples=768) # map the current speech_timestamps to the sample rate of the ground truth audio - new_speech_timestamps = map_timestamps_to_new_sr(vad_sample_rate, gt_sample_rate, speech_timestamps, trim_just_beginning_and_end) + new_speech_timestamps = map_timestamps_to_new_sr( + vad_sample_rate, gt_sample_rate, speech_timestamps, trim_just_beginning_and_end + ) # if have speech timestamps else save the wav if new_speech_timestamps: From 37896e17430a5627b4b3224603b9101f3259a446 Mon Sep 17 00:00:00 2001 From: Edresson Casanova Date: Thu, 24 Mar 2022 14:16:04 -0300 Subject: [PATCH 27/38] Bug fix in freeze encoder (#1391) * Fix the bug in freeze encoder * Remove emb_l definition for non-multilingual training * Fix unit tests --- TTS/tts/models/vits.py | 1 - tests/tts_tests/test_vits.py | 8 ++++---- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py index afadbadd..87d559fc 100644 --- a/TTS/tts/models/vits.py +++ b/TTS/tts/models/vits.py @@ -706,7 +706,6 @@ class Vits(BaseTTS): torch.nn.init.xavier_uniform_(self.emb_l.weight) else: self.embedded_language_dim = 0 - self.emb_l = None def get_aux_input(self, aux_input: Dict): sid, g, lid = self._set_cond_input(aux_input) diff --git a/tests/tts_tests/test_vits.py b/tests/tts_tests/test_vits.py index 81d2ebbd..05adb9ed 100644 --- a/tests/tts_tests/test_vits.py +++ b/tests/tts_tests/test_vits.py @@ -79,25 +79,25 @@ class TestVits(unittest.TestCase): model = Vits(args) self.assertEqual(model.language_manager, None) self.assertEqual(model.embedded_language_dim, 0) - self.assertEqual(model.emb_l, None) + assertHasNotAttr(self, model, "emb_l") args = VitsArgs(language_ids_file=LANG_FILE) model = Vits(args) self.assertNotEqual(model.language_manager, None) self.assertEqual(model.embedded_language_dim, 0) - self.assertEqual(model.emb_l, None) + assertHasNotAttr(self, model, "emb_l") args = VitsArgs(language_ids_file=LANG_FILE, use_language_embedding=True) model = Vits(args) self.assertNotEqual(model.language_manager, None) self.assertEqual(model.embedded_language_dim, args.embedded_language_dim) - self.assertNotEqual(model.emb_l, None) + assertHasAttr(self, model, "emb_l") args = VitsArgs(language_ids_file=LANG_FILE, use_language_embedding=True, embedded_language_dim=102) model = Vits(args) self.assertNotEqual(model.language_manager, None) self.assertEqual(model.embedded_language_dim, args.embedded_language_dim) - self.assertNotEqual(model.emb_l, None) + assertHasAttr(self, model, "emb_l") def test_get_aux_input(self): aux_input = {"speaker_ids": None, "style_wav": None, "d_vectors": None, "language_ids": None} From c66a6241fd761ea07379849474d576f75b9c4e84 Mon Sep 17 00:00:00 2001 From: WeberJulian Date: Fri, 25 Mar 2022 23:15:33 +0100 Subject: [PATCH 28/38] Enforce phonemizer definition for synthesis (#1441) * Enforce phonemizer definition for synthesis * Fix train_tts, tokenizer init can now edit config * Add small change to trigger CI pipeline * fix wrong output path for one tts_test * Fix style * Test config overides by args and tokenizer * Fix style --- TTS/bin/train_tts.py | 2 +- TTS/tts/utils/text/tokenizer.py | 1 + TTS/utils/synthesizer.py | 3 + requirements.txt | 2 +- tests/tts_tests/test_align_tts_train.py | 11 +++- .../test_fast_pitch_speaker_emb_train.py | 9 +++ tests/tts_tests/test_fast_pitch_train.py | 9 +++ .../test_glow_tts_d-vectors_train.py | 9 +++ .../test_glow_tts_speaker_emb_train.py | 9 +++ 
tests/tts_tests/test_glow_tts_train.py | 9 +++ tests/tts_tests/test_speedy_speech_train.py | 9 +++ .../test_tacotron2_d-vectors_train.py | 9 +++ .../test_tacotron2_speaker_emb_train.py | 9 +++ tests/tts_tests/test_tacotron2_train.py | 9 +++ .../test_tacotron2_train_fsspec_path.py | 55 ------------------- ...est_vits_multilingual_speaker_emb_train.py | 9 +++ .../test_vits_multilingual_train-d_vectors.py | 9 +++ .../tts_tests/test_vits_speaker_emb_train.py | 9 +++ tests/tts_tests/test_vits_train.py | 9 +++ 19 files changed, 133 insertions(+), 58 deletions(-) delete mode 100644 tests/tts_tests/test_tacotron2_train_fsspec_path.py diff --git a/TTS/bin/train_tts.py b/TTS/bin/train_tts.py index 976b74af..bdb4f6f6 100644 --- a/TTS/bin/train_tts.py +++ b/TTS/bin/train_tts.py @@ -57,7 +57,7 @@ def main(): # init the trainer and 🚀 trainer = Trainer( train_args, - config, + model.config, config.output_path, model=model, train_samples=train_samples, diff --git a/TTS/tts/utils/text/tokenizer.py b/TTS/tts/utils/text/tokenizer.py index f0d85a44..1569c634 100644 --- a/TTS/tts/utils/text/tokenizer.py +++ b/TTS/tts/utils/text/tokenizer.py @@ -191,6 +191,7 @@ class TTSTokenizer: phonemizer = get_phonemizer_by_name( DEF_LANG_TO_PHONEMIZER[config.phoneme_language], **phonemizer_kwargs ) + new_config.phonemizer = phonemizer.name() except KeyError as e: raise ValueError( f"""No phonemizer found for language {config.phoneme_language}. diff --git a/TTS/utils/synthesizer.py b/TTS/utils/synthesizer.py index 2ea23adb..3dd8be44 100644 --- a/TTS/utils/synthesizer.py +++ b/TTS/utils/synthesizer.py @@ -112,6 +112,9 @@ class Synthesizer(object): self.use_phonemes = self.tts_config.use_phonemes self.tts_model = setup_tts_model(config=self.tts_config) + if self.use_phonemes and self.tts_config["phonemizer"] is None: + raise ValueError("Phonemizer is not defined in the TTS config.") + if not self.encoder_checkpoint: self._set_speaker_encoder_paths_from_tts_config() diff --git a/requirements.txt b/requirements.txt index f735c57a..db47c2cc 100644 --- a/requirements.txt +++ b/requirements.txt @@ -25,7 +25,7 @@ tensorboardX pyworld # coqui stack coqui-trainer -coqpit # config managemenr +coqpit # config management # chinese g2p deps jieba pypinyin diff --git a/tests/tts_tests/test_align_tts_train.py b/tests/tts_tests/test_align_tts_train.py index 85dfbbcb..75c5643c 100644 --- a/tests/tts_tests/test_align_tts_train.py +++ b/tests/tts_tests/test_align_tts_train.py @@ -1,4 +1,5 @@ import glob +import json import os import shutil @@ -42,7 +43,7 @@ command_train = ( "--coqpit.datasets.0.meta_file_train metadata.csv " "--coqpit.datasets.0.meta_file_val metadata.csv " "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.test_delay_epochs -1" + "--coqpit.test_delay_epochs 0 " ) run_cli(command_train) @@ -54,6 +55,14 @@ continue_config_path = os.path.join(continue_path, "config.json") continue_restore_path, _ = get_last_checkpoint(continue_path) out_wav_path = os.path.join(get_tests_output_path(), "output.wav") +# Check integrity of the config +with open(continue_config_path, "r", encoding="utf-8") as f: + config_loaded = json.load(f) +assert config_loaded["characters"] is not None +assert config_loaded["output_path"] in continue_path +assert config_loaded["test_delay_epochs"] == 0 + +# Load the model and run inference inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' 
--config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" run_cli(inference_command) diff --git a/tests/tts_tests/test_fast_pitch_speaker_emb_train.py b/tests/tts_tests/test_fast_pitch_speaker_emb_train.py index 37faf449..9553d745 100644 --- a/tests/tts_tests/test_fast_pitch_speaker_emb_train.py +++ b/tests/tts_tests/test_fast_pitch_speaker_emb_train.py @@ -1,4 +1,5 @@ import glob +import json import os import shutil @@ -74,6 +75,14 @@ out_wav_path = os.path.join(get_tests_output_path(), "output.wav") speaker_id = "ljspeech-1" continue_speakers_path = os.path.join(continue_path, "speakers.json") +# Check integrity of the config +with open(continue_config_path, "r", encoding="utf-8") as f: + config_loaded = json.load(f) +assert config_loaded["characters"] is not None +assert config_loaded["output_path"] in continue_path +assert config_loaded["test_delay_epochs"] == 0 + +# Load the model and run inference inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --speaker_idx {speaker_id} --speakers_file_path {continue_speakers_path} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" run_cli(inference_command) diff --git a/tests/tts_tests/test_fast_pitch_train.py b/tests/tts_tests/test_fast_pitch_train.py index d2d78af4..134cd4ba 100644 --- a/tests/tts_tests/test_fast_pitch_train.py +++ b/tests/tts_tests/test_fast_pitch_train.py @@ -1,4 +1,5 @@ import glob +import json import os import shutil @@ -73,6 +74,14 @@ continue_config_path = os.path.join(continue_path, "config.json") continue_restore_path, _ = get_last_checkpoint(continue_path) out_wav_path = os.path.join(get_tests_output_path(), "output.wav") +# Check integrity of the config +with open(continue_config_path, "r", encoding="utf-8") as f: + config_loaded = json.load(f) +assert config_loaded["characters"] is not None +assert config_loaded["output_path"] in continue_path +assert config_loaded["test_delay_epochs"] == 0 + +# Load the model and run inference inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" run_cli(inference_command) diff --git a/tests/tts_tests/test_glow_tts_d-vectors_train.py b/tests/tts_tests/test_glow_tts_d-vectors_train.py index 14f9e4d2..3a9c8fcc 100644 --- a/tests/tts_tests/test_glow_tts_d-vectors_train.py +++ b/tests/tts_tests/test_glow_tts_d-vectors_train.py @@ -1,4 +1,5 @@ import glob +import json import os import shutil @@ -61,6 +62,14 @@ out_wav_path = os.path.join(get_tests_output_path(), "output.wav") speaker_id = "ljspeech-1" continue_speakers_path = config.d_vector_file +# Check integrity of the config +with open(continue_config_path, "r", encoding="utf-8") as f: + config_loaded = json.load(f) +assert config_loaded["characters"] is not None +assert config_loaded["output_path"] in continue_path +assert config_loaded["test_delay_epochs"] == 0 + +# Load the model and run inference inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' 
--speaker_idx {speaker_id} --speakers_file_path {continue_speakers_path} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" run_cli(inference_command) diff --git a/tests/tts_tests/test_glow_tts_speaker_emb_train.py b/tests/tts_tests/test_glow_tts_speaker_emb_train.py index c327332e..322b506e 100644 --- a/tests/tts_tests/test_glow_tts_speaker_emb_train.py +++ b/tests/tts_tests/test_glow_tts_speaker_emb_train.py @@ -1,4 +1,5 @@ import glob +import json import os import shutil @@ -58,6 +59,14 @@ out_wav_path = os.path.join(get_tests_output_path(), "output.wav") speaker_id = "ljspeech-1" continue_speakers_path = os.path.join(continue_path, "speakers.json") +# Check integrity of the config +with open(continue_config_path, "r", encoding="utf-8") as f: + config_loaded = json.load(f) +assert config_loaded["characters"] is not None +assert config_loaded["output_path"] in continue_path +assert config_loaded["test_delay_epochs"] == 0 + +# Load the model and run inference inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --speaker_idx {speaker_id} --speakers_file_path {continue_speakers_path} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" run_cli(inference_command) diff --git a/tests/tts_tests/test_glow_tts_train.py b/tests/tts_tests/test_glow_tts_train.py index b0acf004..cf9a04f4 100644 --- a/tests/tts_tests/test_glow_tts_train.py +++ b/tests/tts_tests/test_glow_tts_train.py @@ -1,4 +1,5 @@ import glob +import json import os import shutil @@ -55,6 +56,14 @@ continue_config_path = os.path.join(continue_path, "config.json") continue_restore_path, _ = get_last_checkpoint(continue_path) out_wav_path = os.path.join(get_tests_output_path(), "output.wav") +# Check integrity of the config +with open(continue_config_path, "r", encoding="utf-8") as f: + config_loaded = json.load(f) +assert config_loaded["characters"] is not None +assert config_loaded["output_path"] in continue_path +assert config_loaded["test_delay_epochs"] == 0 + +# Load the model and run inference inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" run_cli(inference_command) diff --git a/tests/tts_tests/test_speedy_speech_train.py b/tests/tts_tests/test_speedy_speech_train.py index 9a26d253..c4adcee3 100644 --- a/tests/tts_tests/test_speedy_speech_train.py +++ b/tests/tts_tests/test_speedy_speech_train.py @@ -1,4 +1,5 @@ import glob +import json import os import shutil @@ -54,6 +55,14 @@ continue_config_path = os.path.join(continue_path, "config.json") continue_restore_path, _ = get_last_checkpoint(continue_path) out_wav_path = os.path.join(get_tests_output_path(), "output.wav") +# Check integrity of the config +with open(continue_config_path, "r", encoding="utf-8") as f: + config_loaded = json.load(f) +assert config_loaded["characters"] is not None +assert config_loaded["output_path"] in continue_path +assert config_loaded["test_delay_epochs"] == 0 + +# Load the model and run inference inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example for it.' 
--config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" run_cli(inference_command) diff --git a/tests/tts_tests/test_tacotron2_d-vectors_train.py b/tests/tts_tests/test_tacotron2_d-vectors_train.py index 6b003f2c..0d02fa98 100644 --- a/tests/tts_tests/test_tacotron2_d-vectors_train.py +++ b/tests/tts_tests/test_tacotron2_d-vectors_train.py @@ -1,4 +1,5 @@ import glob +import json import os import shutil @@ -61,6 +62,14 @@ out_wav_path = os.path.join(get_tests_output_path(), "output.wav") speaker_id = "ljspeech-1" continue_speakers_path = config.d_vector_file +# Check integrity of the config +with open(continue_config_path, "r", encoding="utf-8") as f: + config_loaded = json.load(f) +assert config_loaded["characters"] is not None +assert config_loaded["output_path"] in continue_path +assert config_loaded["test_delay_epochs"] == 0 + +# Load the model and run inference inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --speaker_idx {speaker_id} --speakers_file_path {continue_speakers_path} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" run_cli(inference_command) diff --git a/tests/tts_tests/test_tacotron2_speaker_emb_train.py b/tests/tts_tests/test_tacotron2_speaker_emb_train.py index b9f4de0b..2e812d90 100644 --- a/tests/tts_tests/test_tacotron2_speaker_emb_train.py +++ b/tests/tts_tests/test_tacotron2_speaker_emb_train.py @@ -1,4 +1,5 @@ import glob +import json import os import shutil @@ -59,6 +60,14 @@ out_wav_path = os.path.join(get_tests_output_path(), "output.wav") speaker_id = "ljspeech-1" continue_speakers_path = os.path.join(continue_path, "speakers.json") +# Check integrity of the config +with open(continue_config_path, "r", encoding="utf-8") as f: + config_loaded = json.load(f) +assert config_loaded["characters"] is not None +assert config_loaded["output_path"] in continue_path +assert config_loaded["test_delay_epochs"] == 0 + +# Load the model and run inference inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --speaker_idx {speaker_id} --speakers_file_path {continue_speakers_path} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" run_cli(inference_command) diff --git a/tests/tts_tests/test_tacotron2_train.py b/tests/tts_tests/test_tacotron2_train.py index 8c30d9f9..d1941022 100644 --- a/tests/tts_tests/test_tacotron2_train.py +++ b/tests/tts_tests/test_tacotron2_train.py @@ -1,4 +1,5 @@ import glob +import json import os import shutil @@ -54,6 +55,14 @@ continue_config_path = os.path.join(continue_path, "config.json") continue_restore_path, _ = get_last_checkpoint(continue_path) out_wav_path = os.path.join(get_tests_output_path(), "output.wav") +# Check integrity of the config +with open(continue_config_path, "r", encoding="utf-8") as f: + config_loaded = json.load(f) +assert config_loaded["characters"] is not None +assert config_loaded["output_path"] in continue_path +assert config_loaded["test_delay_epochs"] == 0 + +# Load the model and run inference inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' 
--config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" run_cli(inference_command) diff --git a/tests/tts_tests/test_tacotron2_train_fsspec_path.py b/tests/tts_tests/test_tacotron2_train_fsspec_path.py deleted file mode 100644 index 5d14a983..00000000 --- a/tests/tts_tests/test_tacotron2_train_fsspec_path.py +++ /dev/null @@ -1,55 +0,0 @@ -import glob -import os -import shutil - -from tests import get_device_id, get_tests_output_path, run_cli -from TTS.tts.configs.tacotron2_config import Tacotron2Config - -config_path = os.path.join(get_tests_output_path(), "test_model_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") - -config = Tacotron2Config( - r=5, - batch_size=8, - eval_batch_size=8, - num_loader_workers=0, - num_eval_loader_workers=0, - text_cleaner="english_cleaners", - use_phonemes=False, - phoneme_language="en-us", - phoneme_cache_path=os.path.join(get_tests_output_path(), "train_outputs/phoneme_cache/"), - run_eval=True, - test_delay_epochs=-1, - epochs=1, - print_step=1, - test_sentences=[ - "Be a voice, not an echo.", - ], - print_eval=True, - max_decoder_steps=50, -) -config.audio.do_trim_silence = True -config.audio.trim_db = 60 -config.save_json(config_path) - -# train the model for one epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path file://{config_path} " - f"--coqpit.output_path file://{output_path} " - "--coqpit.datasets.0.name ljspeech " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.test_delay_epochs 0 " -) -run_cli(command_train) - -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) - -# restore the model and continue training for one more epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path file://{continue_path} " -) -run_cli(command_train) -shutil.rmtree(continue_path) diff --git a/tests/tts_tests/test_vits_multilingual_speaker_emb_train.py b/tests/tts_tests/test_vits_multilingual_speaker_emb_train.py index 0c7672d7..683bb0a7 100644 --- a/tests/tts_tests/test_vits_multilingual_speaker_emb_train.py +++ b/tests/tts_tests/test_vits_multilingual_speaker_emb_train.py @@ -1,4 +1,5 @@ import glob +import json import os import shutil @@ -92,6 +93,14 @@ languae_id = "en" continue_speakers_path = os.path.join(continue_path, "speakers.json") continue_languages_path = os.path.join(continue_path, "language_ids.json") +# Check integrity of the config +with open(continue_config_path, "r", encoding="utf-8") as f: + config_loaded = json.load(f) +assert config_loaded["characters"] is not None +assert config_loaded["output_path"] in continue_path +assert config_loaded["test_delay_epochs"] == 0 + +# Load the model and run inference inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' 
--speaker_idx {speaker_id} --speakers_file_path {continue_speakers_path} --language_ids_file_path {continue_languages_path} --language_idx {languae_id} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" run_cli(inference_command) diff --git a/tests/tts_tests/test_vits_multilingual_train-d_vectors.py b/tests/tts_tests/test_vits_multilingual_train-d_vectors.py index e12661a5..e4a82cdd 100644 --- a/tests/tts_tests/test_vits_multilingual_train-d_vectors.py +++ b/tests/tts_tests/test_vits_multilingual_train-d_vectors.py @@ -1,4 +1,5 @@ import glob +import json import os import shutil @@ -99,6 +100,14 @@ languae_id = "en" continue_speakers_path = config.d_vector_file continue_languages_path = os.path.join(continue_path, "language_ids.json") +# Check integrity of the config +with open(continue_config_path, "r", encoding="utf-8") as f: + config_loaded = json.load(f) +assert config_loaded["characters"] is not None +assert config_loaded["output_path"] in continue_path +assert config_loaded["test_delay_epochs"] == 0 + +# Load the model and run inference inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --speaker_idx {speaker_id} --speakers_file_path {continue_speakers_path} --language_ids_file_path {continue_languages_path} --language_idx {languae_id} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" run_cli(inference_command) diff --git a/tests/tts_tests/test_vits_speaker_emb_train.py b/tests/tts_tests/test_vits_speaker_emb_train.py index c928cee4..48597241 100644 --- a/tests/tts_tests/test_vits_speaker_emb_train.py +++ b/tests/tts_tests/test_vits_speaker_emb_train.py @@ -1,4 +1,5 @@ import glob +import json import os import shutil @@ -65,6 +66,14 @@ out_wav_path = os.path.join(get_tests_output_path(), "output.wav") speaker_id = "ljspeech-1" continue_speakers_path = os.path.join(continue_path, "speakers.json") +# Check integrity of the config +with open(continue_config_path, "r", encoding="utf-8") as f: + config_loaded = json.load(f) +assert config_loaded["characters"] is not None +assert config_loaded["output_path"] in continue_path +assert config_loaded["test_delay_epochs"] == 0 + +# Load the model and run inference inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --speaker_idx {speaker_id} --speakers_file_path {continue_speakers_path} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" run_cli(inference_command) diff --git a/tests/tts_tests/test_vits_train.py b/tests/tts_tests/test_vits_train.py index 003f99a8..64ff63f3 100644 --- a/tests/tts_tests/test_vits_train.py +++ b/tests/tts_tests/test_vits_train.py @@ -1,4 +1,5 @@ import glob +import json import os import shutil @@ -54,6 +55,14 @@ continue_config_path = os.path.join(continue_path, "config.json") continue_restore_path, _ = get_last_checkpoint(continue_path) out_wav_path = os.path.join(get_tests_output_path(), "output.wav") +# Check integrity of the config +with open(continue_config_path, "r", encoding="utf-8") as f: + config_loaded = json.load(f) +assert config_loaded["characters"] is not None +assert config_loaded["output_path"] in continue_path +assert config_loaded["test_delay_epochs"] == 0 + +# Load the model and run inference inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' 
--config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" run_cli(inference_command) From 1b22f03e986134bcbcd2aba72fe8e226e07f5b9f Mon Sep 17 00:00:00 2001 From: WeberJulian Date: Wed, 30 Mar 2022 12:47:11 +0200 Subject: [PATCH 29/38] Fix G2P backend of the released models (#1461) * Fix enforce phonemizer * Add new models * Fix .model.json --- TTS/.models.json | 76 ++++++++++++++++++++-------------------- TTS/utils/synthesizer.py | 7 ++-- 2 files changed, 41 insertions(+), 42 deletions(-) diff --git a/TTS/.models.json b/TTS/.models.json index 801b8468..24838a5d 100644 --- a/TTS/.models.json +++ b/TTS/.models.json @@ -4,7 +4,7 @@ "multi-dataset":{ "your_tts":{ "description": "Your TTS model accompanying the paper https://arxiv.org/abs/2112.02418", - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.0_models/tts_models--multilingual--multi-dataset--your_tts.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--multilingual--multi-dataset--your_tts.zip", "default_vocoder": null, "commit": "e9a1953e", "license": "CC BY-NC-ND 4.0", @@ -16,7 +16,7 @@ "ek1": { "tacotron2": { "description": "EK1 en-rp tacotron2 by NMStoker", - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.1.0/tts_models--en--ek1--tacotron2.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ek1--tacotron2.zip", "default_vocoder": "vocoder_models/en/ek1/wavegrad", "commit": "c802255" } @@ -24,7 +24,7 @@ "ljspeech": { "tacotron2-DDC": { "description": "Tacotron2 with Double Decoder Consistency.", - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.0.12/tts_models--en--ljspeech--tacotron2-DDC.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--tacotron2-DDC.zip", "default_vocoder": "vocoder_models/en/ljspeech/hifigan_v2", "commit": "bae2ad0f", "author": "Eren Gölge @erogol", @@ -33,7 +33,7 @@ }, "tacotron2-DDC_ph": { "description": "Tacotron2 with Double Decoder Consistency with phonemes.", - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.0_models/tts_models--en--ljspeech--tacotron2-DDC_ph.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--tacotron2-DDC_ph.zip", "default_vocoder": "vocoder_models/en/ljspeech/univnet", "commit": "3900448", "author": "Eren Gölge @erogol", @@ -42,7 +42,7 @@ }, "glow-tts": { "description": "", - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.0.9/tts_models--en--ljspeech--glow-tts.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--glow-tts.zip", "stats_file": null, "default_vocoder": "vocoder_models/en/ljspeech/multiband-melgan", "commit": "", @@ -52,7 +52,7 @@ }, "speedy-speech": { "description": "Speedy Speech model trained on LJSpeech dataset using the Alignment Network for learning the durations.", - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.3.0/tts_models--en--ljspeech--speedy_speech.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--speedy-speech.zip", "stats_file": null, "default_vocoder": "vocoder_models/en/ljspeech/hifigan_v2", "commit": "4581e3d", @@ -62,7 +62,7 @@ }, "tacotron2-DCA": { "description": "", - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.0.9/tts_models--en--ljspeech--tacotron2-DCA.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--tacotron2-DCA.zip", "default_vocoder": 
"vocoder_models/en/ljspeech/multiband-melgan", "commit": "", "author": "Eren Gölge @erogol", @@ -71,7 +71,7 @@ }, "vits": { "description": "VITS is an End2End TTS model trained on LJSpeech dataset with phonemes.", - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.0_models/tts_models--en--ljspeech--vits.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--vits.zip", "default_vocoder": null, "commit": "3900448", "author": "Eren Gölge @erogol", @@ -80,7 +80,7 @@ }, "fast_pitch": { "description": "FastPitch model trained on LJSpeech using the Aligner Network", - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.2.2/tts_models--en--ljspeech--fast_pitch.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--fast_pitch.zip", "default_vocoder": "vocoder_models/en/ljspeech/hifigan_v2", "commit": "b27b3ba", "author": "Eren Gölge @erogol", @@ -91,7 +91,7 @@ "vctk": { "vits": { "description": "VITS End2End TTS model trained on VCTK dataset with 109 different speakers with EN accent.", - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.0_models/tts_models--en--vctk--vits.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--vctk--vits.zip", "default_vocoder": null, "commit": "3900448", "author": "Eren @erogol", @@ -100,7 +100,7 @@ }, "fast_pitch":{ "description": "FastPitch model trained on VCTK dataseset.", - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.0_models/tts_models--en--vctk--fast_pitch.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--vctk--fast_pitch.zip", "default_vocoder": null, "commit": "bdab788d", "author": "Eren @erogol", @@ -111,7 +111,7 @@ "sam": { "tacotron-DDC": { "description": "Tacotron2 with Double Decoder Consistency trained with Aceenture's Sam dataset.", - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.0.13/tts_models--en--sam--tacotron_DDC.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--sam--tacotron-DDC.zip", "default_vocoder": "vocoder_models/en/sam/hifigan_v2", "commit": "bae2ad0f", "author": "Eren Gölge @erogol", @@ -123,7 +123,7 @@ "es": { "mai": { "tacotron2-DDC": { - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.0.9/tts_models--es--mai--tacotron2-DDC.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--es--mai--tacotron2-DDC.zip", "default_vocoder": "vocoder_models/universal/libri-tts/fullband-melgan", "commit": "", "author": "Eren Gölge @erogol", @@ -135,7 +135,7 @@ "fr": { "mai": { "tacotron2-DDC": { - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.0.9/tts_models--fr--mai--tacotron2-DDC.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--fr--mai--tacotron2-DDC.zip", "default_vocoder": "vocoder_models/universal/libri-tts/fullband-melgan", "commit": "", "author": "Eren Gölge @erogol", @@ -147,7 +147,7 @@ "uk":{ "mai": { "glow-tts": { - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.0_models/tts_models--uk--mai--glow-tts.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--uk--mai--glow-tts.zip", "author":"@robinhad", "commit": "bdab788d", "license": "MIT", @@ -159,7 +159,7 @@ "zh-CN": { "baker": { "tacotron2-DDC-GST": { - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.0_models/tts_models--zh-CN--baker--tacotron2-DDC-GST.zip", + "github_rls_url": 
"https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--zh-CN--baker--tacotron2-DDC-GST.zip", "commit": "unknown", "author": "@kirianguiller", "default_vocoder": null @@ -169,7 +169,7 @@ "nl": { "mai": { "tacotron2-DDC": { - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.0.10/tts_models--nl--mai--tacotron2-DDC.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--nl--mai--tacotron2-DDC.zip", "author": "@r-dh", "default_vocoder": "vocoder_models/nl/mai/parallel-wavegan", "stats_file": null, @@ -180,7 +180,7 @@ "de": { "thorsten": { "tacotron2-DCA": { - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.0.11/tts_models--de--thorsten--tacotron2-DCA.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--de--thorsten--tacotron2-DCA.zip", "default_vocoder": "vocoder_models/de/thorsten/fullband-melgan", "author": "@thorstenMueller", "commit": "unknown" @@ -190,7 +190,7 @@ "ja": { "kokoro": { "tacotron2-DDC": { - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.0.15/tts_models--jp--kokoro--tacotron2-DDC.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--ja--kokoro--tacotron2-DDC.zip", "default_vocoder": "vocoder_models/ja/kokoro/hifigan_v1", "description": "Tacotron2 with Double Decoder Consistency trained with Kokoro Speech Dataset.", "author": "@kaiidams", @@ -201,7 +201,7 @@ "tr":{ "common-voice": { "glow-tts":{ - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.0_models/tts_models--tr--common-voice--glow-tts.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--tr--common-voice--glow-tts.zip", "default_vocoder": "vocoder_models/tr/common-voice/hifigan", "license": "MIT", "description": "Turkish GlowTTS model using an unknown speaker from the Common-Voice dataset.", @@ -213,14 +213,14 @@ "it": { "mai_female": { "glow-tts":{ - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.0_models/tts_models--it--mai_female--glow-tts.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--it--mai_female--glow-tts.zip", "default_vocoder": null, "description": "GlowTTS model as explained on https://github.com/coqui-ai/TTS/issues/1148.", "author": "@nicolalandro", "commit": null }, "vits":{ - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.0_models/tts_models--it--mai_female--vits.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--it--mai_female--vits.zip", "default_vocoder": null, "description": "GlowTTS model as explained on https://github.com/coqui-ai/TTS/issues/1148.", "author": "@nicolalandro", @@ -229,14 +229,14 @@ }, "mai_male": { "glow-tts":{ - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.0_models/tts_models--it--mai_male--glow-tts.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--it--mai_male--glow-tts.zip", "default_vocoder": null, "description": "GlowTTS model as explained on https://github.com/coqui-ai/TTS/issues/1148.", "author": "@nicolalandro", "commit": null }, "vits":{ - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.0_models/tts_models--it--mai_male--vits.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--it--mai_male--vits.zip", "default_vocoder": null, "description": "GlowTTS model as explained on https://github.com/coqui-ai/TTS/issues/1148.", "author": "@nicolalandro", @@ -249,14 +249,14 @@ "universal": { "libri-tts": { "wavegrad": { - "github_rls_url": 
"https://coqui.gateway.scarf.sh/v0.0.9/vocoder_models--universal--libri-tts--wavegrad.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--universal--libri-tts--wavegrad.zip", "commit": "ea976b0", "author": "Eren Gölge @erogol", "license": "MPL", "contact": "egolge@coqui.com" }, "fullband-melgan": { - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.0.9/vocoder_models--universal--libri-tts--fullband-melgan.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--universal--libri-tts--fullband-melgan.zip", "commit": "4132240", "author": "Eren Gölge @erogol", "license": "MPL", @@ -268,13 +268,13 @@ "ek1": { "wavegrad": { "description": "EK1 en-rp wavegrad by NMStoker", - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.0.10/vocoder_models--en--ek1--wavegrad.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--en--ek1--wavegrad.zip", "commit": "c802255" } }, "ljspeech": { "multiband-melgan": { - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.0.9/vocoder_models--en--ljspeech--mulitband-melgan.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--en--ljspeech--multiband-melgan.zip", "commit": "ea976b0", "author": "Eren Gölge @erogol", "license": "MPL", @@ -282,7 +282,7 @@ }, "hifigan_v2": { "description": "HiFiGAN_v2 LJSpeech vocoder from https://arxiv.org/abs/2010.05646.", - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.0.12/vocoder_model--en--ljspeech-hifigan_v2.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--en--ljspeech--hifigan_v2.zip", "commit": "bae2ad0f", "author": "@erogol", "license": "", @@ -290,7 +290,7 @@ }, "univnet": { "description": "UnivNet model finetuned on TacotronDDC_ph spectrograms for better compatibility.", - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.3.0/vocoder_models--en--ljspeech--univnet_v2.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--en--ljspeech--univnet_v2.zip", "commit": "4581e3d", "author": "Eren @erogol", "license": "TBD", @@ -300,7 +300,7 @@ "vctk": { "hifigan_v2": { "description": "Finetuned and intended to be used with tts_models/en/vctk/sc-glow-tts", - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.0.12/vocoder_model--en--vctk--hifigan_v2.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--en--vctk--hifigan_v2.zip", "commit": "2f07160", "author": "Edresson Casanova", "license": "", @@ -310,7 +310,7 @@ "sam": { "hifigan_v2": { "description": "Finetuned and intended to be used with tts_models/en/sam/tacotron_DDC", - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.0.13/vocoder_models--en--sam--hifigan_v2.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--en--sam--hifigan_v2.zip", "commit": "2f07160", "author": "Eren Gölge @erogol", "license": "", @@ -321,7 +321,7 @@ "nl": { "mai": { "parallel-wavegan": { - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.0.10/vocoder_models--nl--mai--parallel-wavegan.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--nl--mai--parallel-wavegan.zip", "author": "@r-dh", "commit": "unknown" } @@ -330,12 +330,12 @@ "de": { "thorsten": { "wavegrad": { - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.0.11/vocoder_models--de--thorsten--wavegrad.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--de--thorsten--wavegrad.zip", 
"author": "@thorstenMueller", "commit": "unknown" }, "fullband-melgan": { - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.1.3/vocoder_models--de--thorsten--fullband-melgan.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--de--thorsten--fullband-melgan.zip", "author": "@thorstenMueller", "commit": "unknown" } @@ -344,7 +344,7 @@ "ja": { "kokoro": { "hifigan_v1": { - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.2.0/vocoder_models--ja--kokoro--hifigan_v1.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--ja--kokoro--hifigan_v1.zip", "description": "HifiGAN model trained for kokoro dataset by @kaiidams", "author": "@kaiidams", "commit": "3900448" @@ -354,7 +354,7 @@ "uk": { "mai": { "multiband-melgan": { - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.5.0_models/vocoder_models--uk--mai--multiband-melgan.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--uk--mai--multiband-melgan.zip", "author":"@robinhad", "commit": "bdab788d", "license": "MIT", @@ -365,7 +365,7 @@ "tr":{ "common-voice": { "hifigan":{ - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.0_models/vocoder_models--tr--common-voice--hifigan.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--tr--common-voice--hifigan.zip", "description": "HifiGAN model using an unknown speaker from the Common-Voice dataset.", "author": "Fatih Akademi", "license": "MIT", diff --git a/TTS/utils/synthesizer.py b/TTS/utils/synthesizer.py index 3dd8be44..eef4086c 100644 --- a/TTS/utils/synthesizer.py +++ b/TTS/utils/synthesizer.py @@ -109,12 +109,11 @@ class Synthesizer(object): """ # pylint: disable=global-statement self.tts_config = load_config(tts_config_path) - self.use_phonemes = self.tts_config.use_phonemes - self.tts_model = setup_tts_model(config=self.tts_config) - - if self.use_phonemes and self.tts_config["phonemizer"] is None: + if self.tts_config["use_phonemes"] and self.tts_config["phonemizer"] is None: raise ValueError("Phonemizer is not defined in the TTS config.") + self.tts_model = setup_tts_model(config=self.tts_config) + if not self.encoder_checkpoint: self._set_speaker_encoder_paths_from_tts_config() From 060e0f9368eb6237cf330502b9869b4e87de6c12 Mon Sep 17 00:00:00 2001 From: Edresson Casanova Date: Thu, 31 Mar 2022 08:41:16 -0300 Subject: [PATCH 30/38] Add EmbeddingManager and BaseIDManager (#1374) --- TTS/bin/compute_embeddings.py | 6 +- TTS/bin/eval_encoder.py | 10 +- TTS/bin/extract_tts_spectrograms.py | 4 +- TTS/bin/synthesize.py | 4 +- TTS/bin/train_encoder.py | 4 +- TTS/encoder/utils/generic_utils.py | 2 +- TTS/server/server.py | 2 +- TTS/tts/models/base_tts.py | 32 +- TTS/tts/models/glow_tts.py | 2 +- TTS/tts/models/vits.py | 44 ++- TTS/tts/utils/languages.py | 45 +-- TTS/tts/utils/managers.py | 285 ++++++++++++++++ TTS/tts/utils/speakers.py | 308 ++---------------- TTS/utils/synthesizer.py | 18 +- .../multilingual/vits_tts/train_vits_tts.py | 2 +- recipes/vctk/fast_pitch/train_fast_pitch.py | 2 +- recipes/vctk/fast_speech/train_fast_speech.py | 2 +- recipes/vctk/glow_tts/train_glow_tts.py | 2 +- .../vctk/speedy_speech/train_speedy_speech.py | 2 +- .../vctk/tacotron-DDC/train_tacotron-DDC.py | 2 +- .../vctk/tacotron2-DDC/train_tacotron2-ddc.py | 2 +- recipes/vctk/tacotron2/train_tacotron2.py | 2 +- recipes/vctk/vits/train_vits.py | 2 +- tests/aux_tests/test_speaker_manager.py | 22 +- tests/tts_tests/test_glow_tts.py | 2 +- tests/tts_tests/test_vits.py | 
6 +- tests/zoo_tests/test_models.py | 2 +- 27 files changed, 412 insertions(+), 404 deletions(-) create mode 100644 TTS/tts/utils/managers.py diff --git a/TTS/bin/compute_embeddings.py b/TTS/bin/compute_embeddings.py index d7a2c5f6..b62d603a 100644 --- a/TTS/bin/compute_embeddings.py +++ b/TTS/bin/compute_embeddings.py @@ -49,7 +49,7 @@ encoder_manager = SpeakerManager( use_cuda=args.use_cuda, ) -class_name_key = encoder_manager.speaker_encoder_config.class_name_key +class_name_key = encoder_manager.encoder_config.class_name_key # compute speaker embeddings speaker_mapping = {} @@ -63,10 +63,10 @@ for idx, wav_file in enumerate(tqdm(wav_files)): wav_file_name = os.path.basename(wav_file) if args.old_file is not None and wav_file_name in encoder_manager.clip_ids: # get the embedding from the old file - embedd = encoder_manager.get_d_vector_by_clip(wav_file_name) + embedd = encoder_manager.get_embedding_by_clip(wav_file_name) else: # extract the embedding - embedd = encoder_manager.compute_d_vector_from_clip(wav_file) + embedd = encoder_manager.compute_embedding_from_clip(wav_file) # create speaker_mapping if target dataset is defined speaker_mapping[wav_file_name] = {} diff --git a/TTS/bin/eval_encoder.py b/TTS/bin/eval_encoder.py index 089f3645..7f9fdf93 100644 --- a/TTS/bin/eval_encoder.py +++ b/TTS/bin/eval_encoder.py @@ -11,8 +11,8 @@ from TTS.tts.utils.speakers import SpeakerManager def compute_encoder_accuracy(dataset_items, encoder_manager): - class_name_key = encoder_manager.speaker_encoder_config.class_name_key - map_classid_to_classname = getattr(encoder_manager.speaker_encoder_config, "map_classid_to_classname", None) + class_name_key = encoder_manager.encoder_config.class_name_key + map_classid_to_classname = getattr(encoder_manager.encoder_config, "map_classid_to_classname", None) class_acc_dict = {} @@ -22,13 +22,13 @@ def compute_encoder_accuracy(dataset_items, encoder_manager): wav_file = item["audio_file"] # extract the embedding - embedd = encoder_manager.compute_d_vector_from_clip(wav_file) - if encoder_manager.speaker_encoder_criterion is not None and map_classid_to_classname is not None: + embedd = encoder_manager.compute_embedding_from_clip(wav_file) + if encoder_manager.encoder_criterion is not None and map_classid_to_classname is not None: embedding = torch.FloatTensor(embedd).unsqueeze(0) if encoder_manager.use_cuda: embedding = embedding.cuda() - class_id = encoder_manager.speaker_encoder_criterion.softmax.inference(embedding).item() + class_id = encoder_manager.encoder_criterion.softmax.inference(embedding).item() predicted_label = map_classid_to_classname[str(class_id)] else: predicted_label = None diff --git a/TTS/bin/extract_tts_spectrograms.py b/TTS/bin/extract_tts_spectrograms.py index fa63c46a..a0dd0549 100755 --- a/TTS/bin/extract_tts_spectrograms.py +++ b/TTS/bin/extract_tts_spectrograms.py @@ -37,8 +37,8 @@ def setup_loader(ap, r, verbose=False): precompute_num_workers=0, use_noise_augment=False, verbose=verbose, - speaker_id_mapping=speaker_manager.speaker_ids if c.use_speaker_embedding else None, - d_vector_mapping=speaker_manager.d_vectors if c.use_d_vector_file else None, + speaker_id_mapping=speaker_manager.ids if c.use_speaker_embedding else None, + d_vector_mapping=speaker_manager.embeddings if c.use_d_vector_file else None, ) if c.use_phonemes and c.compute_input_seq_cache: diff --git a/TTS/bin/synthesize.py b/TTS/bin/synthesize.py index eb166bc8..6247b2a4 100755 --- a/TTS/bin/synthesize.py +++ b/TTS/bin/synthesize.py @@ -278,7 +278,7 @@ If you 
don't specify any models, then it uses LJSpeech based English model. print( " > Available speaker ids: (Set --speaker_idx flag to one of these values to use the multi-speaker model." ) - print(synthesizer.tts_model.speaker_manager.speaker_ids) + print(synthesizer.tts_model.speaker_manager.ids) return # query langauge ids of a multi-lingual model. @@ -286,7 +286,7 @@ If you don't specify any models, then it uses LJSpeech based English model. print( " > Available language ids: (Set --language_idx flag to one of these values to use the multi-lingual model." ) - print(synthesizer.tts_model.language_manager.language_id_mapping) + print(synthesizer.tts_model.language_manager.ids) return # check the arguments against a multi-speaker model. diff --git a/TTS/bin/train_encoder.py b/TTS/bin/train_encoder.py index b8d38bac..d28f188e 100644 --- a/TTS/bin/train_encoder.py +++ b/TTS/bin/train_encoder.py @@ -12,7 +12,7 @@ from trainer.torch import NoamLR from trainer.trainer_utils import get_optimizer from TTS.encoder.dataset import EncoderDataset -from TTS.encoder.utils.generic_utils import save_best_model, save_checkpoint, setup_speaker_encoder_model +from TTS.encoder.utils.generic_utils import save_best_model, save_checkpoint, setup_encoder_model from TTS.encoder.utils.samplers import PerfectBatchSampler from TTS.encoder.utils.training import init_training from TTS.encoder.utils.visual import plot_embeddings @@ -258,7 +258,7 @@ def main(args): # pylint: disable=redefined-outer-name global train_classes ap = AudioProcessor(**c.audio) - model = setup_speaker_encoder_model(c) + model = setup_encoder_model(c) optimizer = get_optimizer(c.optimizer, c.optimizer_params, c.lr, model) diff --git a/TTS/encoder/utils/generic_utils.py b/TTS/encoder/utils/generic_utils.py index 19c00582..91a896f6 100644 --- a/TTS/encoder/utils/generic_utils.py +++ b/TTS/encoder/utils/generic_utils.py @@ -125,7 +125,7 @@ def to_camel(text): return re.sub(r"(?!^)_([a-zA-Z])", lambda m: m.group(1).upper(), text) -def setup_speaker_encoder_model(config: "Coqpit"): +def setup_encoder_model(config: "Coqpit"): if config.model_params["model_name"].lower() == "lstm": model = LSTMSpeakerEncoder( config.model_params["input_dim"], diff --git a/TTS/server/server.py b/TTS/server/server.py index aef507fd..fd53e76d 100644 --- a/TTS/server/server.py +++ b/TTS/server/server.py @@ -143,7 +143,7 @@ def index(): "index.html", show_details=args.show_details, use_multi_speaker=use_multi_speaker, - speaker_ids=speaker_manager.speaker_ids if speaker_manager is not None else None, + speaker_ids=speaker_manager.ids if speaker_manager is not None else None, use_gst=use_gst, ) diff --git a/TTS/tts/models/base_tts.py b/TTS/tts/models/base_tts.py index 945c031f..652b77dd 100644 --- a/TTS/tts/models/base_tts.py +++ b/TTS/tts/models/base_tts.py @@ -136,18 +136,18 @@ class BaseTTS(BaseTrainerModel): if hasattr(self, "speaker_manager"): if config.use_d_vector_file: if speaker_name is None: - d_vector = self.speaker_manager.get_random_d_vector() + d_vector = self.speaker_manager.get_random_embeddings() else: - d_vector = self.speaker_manager.get_d_vector_by_speaker(speaker_name) + d_vector = self.speaker_manager.get_d_vector_by_name(speaker_name) elif config.use_speaker_embedding: if speaker_name is None: - speaker_id = self.speaker_manager.get_random_speaker_id() + speaker_id = self.speaker_manager.get_random_id() else: - speaker_id = self.speaker_manager.speaker_ids[speaker_name] + speaker_id = self.speaker_manager.ids[speaker_name] # get language id if 
hasattr(self, "language_manager") and config.use_language_embedding and language_name is not None: - language_id = self.language_manager.language_id_mapping[language_name] + language_id = self.language_manager.ids[language_name] return { "text": text, @@ -279,23 +279,19 @@ class BaseTTS(BaseTrainerModel): # setup multi-speaker attributes if hasattr(self, "speaker_manager") and self.speaker_manager is not None: if hasattr(config, "model_args"): - speaker_id_mapping = ( - self.speaker_manager.speaker_ids if config.model_args.use_speaker_embedding else None - ) - d_vector_mapping = self.speaker_manager.d_vectors if config.model_args.use_d_vector_file else None + speaker_id_mapping = self.speaker_manager.ids if config.model_args.use_speaker_embedding else None + d_vector_mapping = self.speaker_manager.embeddings if config.model_args.use_d_vector_file else None config.use_d_vector_file = config.model_args.use_d_vector_file else: - speaker_id_mapping = self.speaker_manager.speaker_ids if config.use_speaker_embedding else None - d_vector_mapping = self.speaker_manager.d_vectors if config.use_d_vector_file else None + speaker_id_mapping = self.speaker_manager.ids if config.use_speaker_embedding else None + d_vector_mapping = self.speaker_manager.embeddings if config.use_d_vector_file else None else: speaker_id_mapping = None d_vector_mapping = None # setup multi-lingual attributes if hasattr(self, "language_manager") and self.language_manager is not None: - language_id_mapping = ( - self.language_manager.language_id_mapping if self.args.use_language_embedding else None - ) + language_id_mapping = self.language_manager.ids if self.args.use_language_embedding else None else: language_id_mapping = None @@ -352,13 +348,13 @@ class BaseTTS(BaseTrainerModel): d_vector = None if self.config.use_d_vector_file: - d_vector = [self.speaker_manager.d_vectors[name]["embedding"] for name in self.speaker_manager.d_vectors] + d_vector = [self.speaker_manager.embeddings[name]["embedding"] for name in self.speaker_manager.embeddings] d_vector = (random.sample(sorted(d_vector), 1),) aux_inputs = { "speaker_id": None if not self.config.use_speaker_embedding - else random.sample(sorted(self.speaker_manager.speaker_ids.values()), 1), + else random.sample(sorted(self.speaker_manager.ids.values()), 1), "d_vector": d_vector, "style_wav": None, # TODO: handle GST style input } @@ -405,7 +401,7 @@ class BaseTTS(BaseTrainerModel): """Save the speaker.json and language_ids.json at the beginning of the training. 
Also update both paths.""" if self.speaker_manager is not None: output_path = os.path.join(trainer.output_path, "speakers.json") - self.speaker_manager.save_speaker_ids_to_file(output_path) + self.speaker_manager.save_ids_to_file(output_path) trainer.config.speakers_file = output_path # some models don't have `model_args` set if hasattr(trainer.config, "model_args"): @@ -416,7 +412,7 @@ class BaseTTS(BaseTrainerModel): if hasattr(self, "language_manager") and self.language_manager is not None: output_path = os.path.join(trainer.output_path, "language_ids.json") - self.language_manager.save_language_ids_to_file(output_path) + self.language_manager.save_ids_to_file(output_path) trainer.config.language_ids_file = output_path if hasattr(trainer.config, "model_args"): trainer.config.model_args.language_ids_file = output_path diff --git a/TTS/tts/models/glow_tts.py b/TTS/tts/models/glow_tts.py index fea570a6..7c0f95e1 100644 --- a/TTS/tts/models/glow_tts.py +++ b/TTS/tts/models/glow_tts.py @@ -124,7 +124,7 @@ class GlowTTS(BaseTTS): ) if self.speaker_manager is not None: assert ( - config.d_vector_dim == self.speaker_manager.d_vector_dim + config.d_vector_dim == self.speaker_manager.embedding_dim ), " [!] d-vector dimension mismatch b/w config and speaker manager." # init speaker embedding layer if config.use_speaker_embedding and not config.use_d_vector_file: diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py index 87d559fc..943b9eae 100644 --- a/TTS/tts/models/vits.py +++ b/TTS/tts/models/vits.py @@ -652,28 +652,28 @@ class Vits(BaseTTS): # TODO: make this a function if self.args.use_speaker_encoder_as_loss: - if self.speaker_manager.speaker_encoder is None and ( + if self.speaker_manager.encoder is None and ( not self.args.speaker_encoder_model_path or not self.args.speaker_encoder_config_path ): raise RuntimeError( " [!] To use the speaker consistency loss (SCL) you need to specify speaker_encoder_model_path and speaker_encoder_config_path !!" 
) - self.speaker_manager.speaker_encoder.eval() + self.speaker_manager.encoder.eval() print(" > External Speaker Encoder Loaded !!") if ( - hasattr(self.speaker_manager.speaker_encoder, "audio_config") - and self.config.audio["sample_rate"] != self.speaker_manager.speaker_encoder.audio_config["sample_rate"] + hasattr(self.speaker_manager.encoder, "audio_config") + and self.config.audio["sample_rate"] != self.speaker_manager.encoder.audio_config["sample_rate"] ): self.audio_transform = torchaudio.transforms.Resample( orig_freq=self.audio_config["sample_rate"], - new_freq=self.speaker_manager.speaker_encoder.audio_config["sample_rate"], + new_freq=self.speaker_manager.encoder.audio_config["sample_rate"], ) # pylint: disable=W0101,W0105 self.audio_transform = torchaudio.transforms.Resample( orig_freq=self.config.audio.sample_rate, - new_freq=self.speaker_manager.speaker_encoder.audio_config["sample_rate"], + new_freq=self.speaker_manager.encoder.audio_config["sample_rate"], ) def _init_speaker_embedding(self): @@ -887,7 +887,7 @@ class Vits(BaseTTS): pad_short=True, ) - if self.args.use_speaker_encoder_as_loss and self.speaker_manager.speaker_encoder is not None: + if self.args.use_speaker_encoder_as_loss and self.speaker_manager.encoder is not None: # concate generated and GT waveforms wavs_batch = torch.cat((wav_seg, o), dim=0) @@ -896,7 +896,7 @@ class Vits(BaseTTS): if self.audio_transform is not None: wavs_batch = self.audio_transform(wavs_batch) - pred_embs = self.speaker_manager.speaker_encoder.forward(wavs_batch, l2_norm=True) + pred_embs = self.speaker_manager.encoder.forward(wavs_batch, l2_norm=True) # split generated and GT speaker embeddings gt_spk_emb, syn_spk_emb = torch.chunk(pred_embs, 2, dim=0) @@ -1223,18 +1223,18 @@ class Vits(BaseTTS): if hasattr(self, "speaker_manager"): if config.use_d_vector_file: if speaker_name is None: - d_vector = self.speaker_manager.get_random_d_vector() + d_vector = self.speaker_manager.get_random_embeddings() else: - d_vector = self.speaker_manager.get_mean_d_vector(speaker_name, num_samples=None, randomize=False) + d_vector = self.speaker_manager.get_mean_embedding(speaker_name, num_samples=None, randomize=False) elif config.use_speaker_embedding: if speaker_name is None: - speaker_id = self.speaker_manager.get_random_speaker_id() + speaker_id = self.speaker_manager.get_random_id() else: - speaker_id = self.speaker_manager.speaker_ids[speaker_name] + speaker_id = self.speaker_manager.ids[speaker_name] # get language id if hasattr(self, "language_manager") and config.use_language_embedding and language_name is not None: - language_id = self.language_manager.language_id_mapping[language_name] + language_id = self.language_manager.ids[language_name] return { "text": text, @@ -1289,26 +1289,22 @@ class Vits(BaseTTS): d_vectors = None # get numerical speaker ids from speaker names - if self.speaker_manager is not None and self.speaker_manager.speaker_ids and self.args.use_speaker_embedding: - speaker_ids = [self.speaker_manager.speaker_ids[sn] for sn in batch["speaker_names"]] + if self.speaker_manager is not None and self.speaker_manager.ids and self.args.use_speaker_embedding: + speaker_ids = [self.speaker_manager.ids[sn] for sn in batch["speaker_names"]] if speaker_ids is not None: speaker_ids = torch.LongTensor(speaker_ids) batch["speaker_ids"] = speaker_ids # get d_vectors from audio file names - if self.speaker_manager is not None and self.speaker_manager.d_vectors and self.args.use_d_vector_file: - d_vector_mapping = 
self.speaker_manager.d_vectors + if self.speaker_manager is not None and self.speaker_manager.embeddings and self.args.use_d_vector_file: + d_vector_mapping = self.speaker_manager.embeddings d_vectors = [d_vector_mapping[w]["embedding"] for w in batch["audio_files"]] d_vectors = torch.FloatTensor(d_vectors) # get language ids from language names - if ( - self.language_manager is not None - and self.language_manager.language_id_mapping - and self.args.use_language_embedding - ): - language_ids = [self.language_manager.language_id_mapping[ln] for ln in batch["language_names"]] + if self.language_manager is not None and self.language_manager.ids and self.args.use_language_embedding: + language_ids = [self.language_manager.ids[ln] for ln in batch["language_names"]] if language_ids is not None: language_ids = torch.LongTensor(language_ids) @@ -1490,7 +1486,7 @@ class Vits(BaseTTS): language_manager = LanguageManager.init_from_config(config) if config.model_args.speaker_encoder_model_path: - speaker_manager.init_speaker_encoder( + speaker_manager.init_encoder( config.model_args.speaker_encoder_model_path, config.model_args.speaker_encoder_config_path ) return Vits(new_config, ap, tokenizer, speaker_manager, language_manager) diff --git a/TTS/tts/utils/languages.py b/TTS/tts/utils/languages.py index 7decabb0..9b5e2007 100644 --- a/TTS/tts/utils/languages.py +++ b/TTS/tts/utils/languages.py @@ -1,6 +1,5 @@ -import json import os -from typing import Dict, List +from typing import Any, Dict, List import fsspec import numpy as np @@ -8,9 +7,10 @@ import torch from coqpit import Coqpit from TTS.config import check_config_and_model_args +from TTS.tts.utils.managers import BaseIDManager -class LanguageManager: +class LanguageManager(BaseIDManager): """Manage the languages for multi-lingual 🐸TTS models. Load a datafile and parse the information in a way that can be queried by language. @@ -25,37 +25,23 @@ class LanguageManager: >>> language_id_mapper = manager.language_ids """ - language_id_mapping: Dict = {} - def __init__( self, language_ids_file_path: str = "", config: Coqpit = None, ): - self.language_id_mapping = {} - if language_ids_file_path: - self.set_language_ids_from_file(language_ids_file_path) + super().__init__(id_file_path=language_ids_file_path) if config: self.set_language_ids_from_config(config) - @staticmethod - def _load_json(json_file_path: str) -> Dict: - with fsspec.open(json_file_path, "r") as f: - return json.load(f) - - @staticmethod - def _save_json(json_file_path: str, data: dict) -> None: - with fsspec.open(json_file_path, "w") as f: - json.dump(data, f, indent=4) - @property def num_languages(self) -> int: - return len(list(self.language_id_mapping.keys())) + return len(list(self.ids.keys())) @property def language_names(self) -> List: - return list(self.language_id_mapping.keys()) + return list(self.ids.keys()) @staticmethod def parse_language_ids_from_config(c: Coqpit) -> Dict: @@ -79,25 +65,24 @@ class LanguageManager: """Set language IDs from config samples. Args: - items (List): Data sampled returned by `load_meta_data()`. + c (Coqpit): Config. """ - self.language_id_mapping = self.parse_language_ids_from_config(c) + self.ids = self.parse_language_ids_from_config(c) - def set_language_ids_from_file(self, file_path: str) -> None: - """Load language ids from a json file. + @staticmethod + def parse_ids_from_data(items: List, parse_key: str) -> Any: + raise NotImplementedError - Args: - file_path (str): Path to the target json file. 
- """ - self.language_id_mapping = self._load_json(file_path) + def set_ids_from_data(self, items: List, parse_key: str) -> Any: + raise NotImplementedError - def save_language_ids_to_file(self, file_path: str) -> None: + def save_ids_to_file(self, file_path: str) -> None: """Save language IDs to a json file. Args: file_path (str): Path to the output file. """ - self._save_json(file_path, self.language_id_mapping) + self._save_json(file_path, self.ids) @staticmethod def init_from_config(config: Coqpit) -> "LanguageManager": diff --git a/TTS/tts/utils/managers.py b/TTS/tts/utils/managers.py new file mode 100644 index 00000000..85ed53cc --- /dev/null +++ b/TTS/tts/utils/managers.py @@ -0,0 +1,285 @@ +import json +import random +from typing import Any, Dict, List, Tuple, Union + +import fsspec +import numpy as np +import torch + +from TTS.config import load_config +from TTS.encoder.utils.generic_utils import setup_encoder_model +from TTS.utils.audio import AudioProcessor + + +class BaseIDManager: + """Base `ID` Manager class. Every new `ID` manager must inherit this. + It defines common `ID` manager specific functions. + """ + + def __init__(self, id_file_path: str = ""): + self.ids = {} + + if id_file_path: + self.load_ids_from_file(id_file_path) + + @staticmethod + def _load_json(json_file_path: str) -> Dict: + with fsspec.open(json_file_path, "r") as f: + return json.load(f) + + @staticmethod + def _save_json(json_file_path: str, data: dict) -> None: + with fsspec.open(json_file_path, "w") as f: + json.dump(data, f, indent=4) + + def set_ids_from_data(self, items: List, parse_key: str) -> None: + """Set IDs from data samples. + + Args: + items (List): Data sampled returned by `load_tts_samples()`. + """ + self.ids = self.parse_ids_from_data(items, parse_key=parse_key) + + def load_ids_from_file(self, file_path: str) -> None: + """Set IDs from a file. + + Args: + file_path (str): Path to the file. + """ + self.ids = self._load_json(file_path) + + def save_ids_to_file(self, file_path: str) -> None: + """Save IDs to a json file. + + Args: + file_path (str): Path to the output file. + """ + self._save_json(file_path, self.ids) + + def get_random_id(self) -> Any: + """Get a random embedding. + + Args: + + Returns: + np.ndarray: embedding. + """ + if self.ids: + return self.ids[random.choices(list(self.ids.keys()))[0]] + + return None + + @staticmethod + def parse_ids_from_data(items: List, parse_key: str) -> Tuple[Dict]: + """Parse IDs from data samples retured by `load_tts_samples()`. + + Args: + items (list): Data sampled returned by `load_tts_samples()`. + parse_key (str): The key to being used to parse the data. + Returns: + Tuple[Dict]: speaker IDs. + """ + classes = sorted({item[parse_key] for item in items}) + ids = {name: i for i, name in enumerate(classes)} + return ids + + +class EmbeddingManager(BaseIDManager): + """Base `Embedding` Manager class. Every new `Embedding` manager must inherit this. + It defines common `Embedding` manager specific functions. 
+ """ + + def __init__( + self, + embedding_file_path: str = "", + id_file_path: str = "", + encoder_model_path: str = "", + encoder_config_path: str = "", + use_cuda: bool = False, + ): + super().__init__(id_file_path=id_file_path) + + self.embeddings = {} + self.embeddings_by_names = {} + self.clip_ids = [] + self.encoder = None + self.encoder_ap = None + self.use_cuda = use_cuda + + if embedding_file_path: + self.load_embeddings_from_file(embedding_file_path) + + if encoder_model_path and encoder_config_path: + self.init_encoder(encoder_model_path, encoder_config_path) + + @property + def embedding_dim(self): + """Dimensionality of embeddings. If embeddings are not loaded, returns zero.""" + if self.embeddings: + return len(self.embeddings[list(self.embeddings.keys())[0]]["embedding"]) + return 0 + + def save_embeddings_to_file(self, file_path: str) -> None: + """Save embeddings to a json file. + + Args: + file_path (str): Path to the output file. + """ + self._save_json(file_path, self.embeddings) + + def load_embeddings_from_file(self, file_path: str) -> None: + """Load embeddings from a json file. + + Args: + file_path (str): Path to the target json file. + """ + self.embeddings = self._load_json(file_path) + + speakers = sorted({x["name"] for x in self.embeddings.values()}) + self.ids = {name: i for i, name in enumerate(speakers)} + + self.clip_ids = list(set(sorted(clip_name for clip_name in self.embeddings.keys()))) + # cache embeddings_by_names for fast inference using a bigger speakers.json + self.embeddings_by_names = self.get_embeddings_by_names() + + def get_embedding_by_clip(self, clip_idx: str) -> List: + """Get embedding by clip ID. + + Args: + clip_idx (str): Target clip ID. + + Returns: + List: embedding as a list. + """ + return self.embeddings[clip_idx]["embedding"] + + def get_embeddings_by_name(self, idx: str) -> List[List]: + """Get all embeddings of a speaker. + + Args: + idx (str): Target name. + + Returns: + List[List]: all the embeddings of the given speaker. + """ + return self.embeddings_by_names[idx] + + def get_embeddings_by_names(self) -> Dict: + """Get all embeddings by names. + + Returns: + Dict: all the embeddings of each speaker. + """ + embeddings_by_names = {} + for x in self.embeddings.values(): + if x["name"] not in embeddings_by_names.keys(): + embeddings_by_names[x["name"]] = [x["embedding"]] + else: + embeddings_by_names[x["name"]].append(x["embedding"]) + return embeddings_by_names + + def get_mean_embedding(self, idx: str, num_samples: int = None, randomize: bool = False) -> np.ndarray: + """Get mean embedding of a idx. + + Args: + idx (str): Target name. + num_samples (int, optional): Number of samples to be averaged. Defaults to None. + randomize (bool, optional): Pick random `num_samples` of embeddings. Defaults to False. + + Returns: + np.ndarray: Mean embedding. + """ + embeddings = self.get_embeddings_by_name(idx) + if num_samples is None: + embeddings = np.stack(embeddings).mean(0) + else: + assert len(embeddings) >= num_samples, f" [!] {idx} has number of samples < {num_samples}" + if randomize: + embeddings = np.stack(random.choices(embeddings, k=num_samples)).mean(0) + else: + embeddings = np.stack(embeddings[:num_samples]).mean(0) + return embeddings + + def get_random_embedding(self) -> Any: + """Get a random embedding. + + Args: + + Returns: + np.ndarray: embedding. 
+ """ + if self.embeddings: + return self.embeddings[random.choices(list(self.embeddings.keys()))[0]]["embedding"] + + return None + + def get_clips(self) -> List: + return sorted(self.embeddings.keys()) + + def init_encoder(self, model_path: str, config_path: str) -> None: + """Initialize a speaker encoder model. + + Args: + model_path (str): Model file path. + config_path (str): Model config file path. + """ + self.encoder_config = load_config(config_path) + self.encoder = setup_encoder_model(self.encoder_config) + self.encoder_criterion = self.encoder.load_checkpoint( + self.encoder_config, model_path, eval=True, use_cuda=self.use_cuda + ) + self.encoder_ap = AudioProcessor(**self.encoder_config.audio) + + def compute_embedding_from_clip(self, wav_file: Union[str, List[str]]) -> list: + """Compute a embedding from a given audio file. + + Args: + wav_file (Union[str, List[str]]): Target file path. + + Returns: + list: Computed embedding. + """ + + def _compute(wav_file: str): + waveform = self.encoder_ap.load_wav(wav_file, sr=self.encoder_ap.sample_rate) + if not self.encoder_config.model_params.get("use_torch_spec", False): + m_input = self.encoder_ap.melspectrogram(waveform) + m_input = torch.from_numpy(m_input) + else: + m_input = torch.from_numpy(waveform) + + if self.use_cuda: + m_input = m_input.cuda() + m_input = m_input.unsqueeze(0) + embedding = self.encoder.compute_embedding(m_input) + return embedding + + if isinstance(wav_file, list): + # compute the mean embedding + embeddings = None + for wf in wav_file: + embedding = _compute(wf) + if embeddings is None: + embeddings = embedding + else: + embeddings += embedding + return (embeddings / len(wav_file))[0].tolist() + embedding = _compute(wav_file) + return embedding[0].tolist() + + def compute_embeddings(self, feats: Union[torch.Tensor, np.ndarray]) -> List: + """Compute embedding from features. + + Args: + feats (Union[torch.Tensor, np.ndarray]): Input features. + + Returns: + List: computed embedding. + """ + if isinstance(feats, np.ndarray): + feats = torch.from_numpy(feats) + if feats.ndim == 2: + feats = feats.unsqueeze(0) + if self.use_cuda: + feats = feats.cuda() + return self.encoder.compute_embedding(feats) diff --git a/TTS/tts/utils/speakers.py b/TTS/tts/utils/speakers.py index 0227412d..284d0179 100644 --- a/TTS/tts/utils/speakers.py +++ b/TTS/tts/utils/speakers.py @@ -1,19 +1,17 @@ import json import os -import random -from typing import Any, Dict, List, Tuple, Union +from typing import Any, Dict, List, Union import fsspec import numpy as np import torch from coqpit import Coqpit -from TTS.config import get_from_config_or_model_args_with_default, load_config -from TTS.encoder.utils.generic_utils import setup_speaker_encoder_model -from TTS.utils.audio import AudioProcessor +from TTS.config import get_from_config_or_model_args_with_default +from TTS.tts.utils.managers import EmbeddingManager -class SpeakerManager: +class SpeakerManager(EmbeddingManager): """Manage the speakers for multi-speaker 🐸TTS models. Load a datafile and parse the information in a way that can be queried by speaker or clip. 
@@ -50,7 +48,7 @@ class SpeakerManager: >>> # load a sample audio and compute embedding >>> waveform = ap.load_wav(sample_wav_path) >>> mel = ap.melspectrogram(waveform) - >>> d_vector = manager.compute_d_vector(mel.T) + >>> d_vector = manager.compute_embeddings(mel.T) """ def __init__( @@ -62,279 +60,27 @@ class SpeakerManager: encoder_config_path: str = "", use_cuda: bool = False, ): - - self.d_vectors = {} - self.speaker_ids = {} - self.d_vectors_by_speakers = {} - self.clip_ids = [] - self.speaker_encoder = None - self.speaker_encoder_ap = None - self.use_cuda = use_cuda + super().__init__( + embedding_file_path=d_vectors_file_path, + id_file_path=speaker_id_file_path, + encoder_model_path=encoder_model_path, + encoder_config_path=encoder_config_path, + use_cuda=use_cuda, + ) if data_items: - self.speaker_ids, _ = self.parse_speakers_from_data(data_items) - - if d_vectors_file_path: - self.set_d_vectors_from_file(d_vectors_file_path) - - if speaker_id_file_path: - self.set_speaker_ids_from_file(speaker_id_file_path) - - if encoder_model_path and encoder_config_path: - self.init_speaker_encoder(encoder_model_path, encoder_config_path) - - @staticmethod - def _load_json(json_file_path: str) -> Dict: - with fsspec.open(json_file_path, "r") as f: - return json.load(f) - - @staticmethod - def _save_json(json_file_path: str, data: dict) -> None: - with fsspec.open(json_file_path, "w") as f: - json.dump(data, f, indent=4) + self.set_ids_from_data(data_items, parse_key="speaker_name") @property def num_speakers(self): - return len(self.speaker_ids) + return len(self.ids) @property def speaker_names(self): - return list(self.speaker_ids.keys()) - - @property - def d_vector_dim(self): - """Dimensionality of d_vectors. If d_vectors are not loaded, returns zero.""" - if self.d_vectors: - return len(self.d_vectors[list(self.d_vectors.keys())[0]]["embedding"]) - return 0 - - @staticmethod - def parse_speakers_from_data(items: list) -> Tuple[Dict, int]: - """Parse speaker IDs from data samples retured by `load_tts_samples()`. - - Args: - items (list): Data sampled returned by `load_tts_samples()`. - - Returns: - Tuple[Dict, int]: speaker IDs and number of speakers. - """ - speakers = sorted({item["speaker_name"] for item in items}) - speaker_ids = {name: i for i, name in enumerate(speakers)} - num_speakers = len(speaker_ids) - return speaker_ids, num_speakers - - def set_speaker_ids_from_data(self, items: List) -> None: - """Set speaker IDs from data samples. - - Args: - items (List): Data sampled returned by `load_tts_samples()`. - """ - self.speaker_ids, _ = self.parse_speakers_from_data(items) - - def set_speaker_ids_from_file(self, file_path: str) -> None: - """Set speaker IDs from a file. - - Args: - file_path (str): Path to the file. - """ - self.speaker_ids = self._load_json(file_path) - - def save_speaker_ids_to_file(self, file_path: str) -> None: - """Save speaker IDs to a json file. - - Args: - file_path (str): Path to the output file. - """ - self._save_json(file_path, self.speaker_ids) - - def save_d_vectors_to_file(self, file_path: str) -> None: - """Save d_vectors to a json file. - - Args: - file_path (str): Path to the output file. - """ - self._save_json(file_path, self.d_vectors) - - def set_d_vectors_from_file(self, file_path: str) -> None: - """Load d_vectors from a json file. - - Args: - file_path (str): Path to the target json file. 
- """ - self.d_vectors = self._load_json(file_path) - - speakers = sorted({x["name"] for x in self.d_vectors.values()}) - self.speaker_ids = {name: i for i, name in enumerate(speakers)} - - self.clip_ids = list(set(sorted(clip_name for clip_name in self.d_vectors.keys()))) - # cache d_vectors_by_speakers for fast inference using a bigger speakers.json - self.d_vectors_by_speakers = self.get_d_vectors_by_speakers() - - def get_d_vector_by_clip(self, clip_idx: str) -> List: - """Get d_vector by clip ID. - - Args: - clip_idx (str): Target clip ID. - - Returns: - List: d_vector as a list. - """ - return self.d_vectors[clip_idx]["embedding"] - - def get_d_vectors_by_speaker(self, speaker_idx: str) -> List[List]: - """Get all d_vectors of a speaker. - - Args: - speaker_idx (str): Target speaker ID. - - Returns: - List[List]: all the d_vectors of the given speaker. - """ - return self.d_vectors_by_speakers[speaker_idx] - - def get_d_vectors_by_speakers(self) -> Dict: - """Get all d_vectors by speaker. - - Returns: - Dict: all the d_vectors of each speaker. - """ - d_vectors_by_speakers = {} - for x in self.d_vectors.values(): - if x["name"] not in d_vectors_by_speakers.keys(): - d_vectors_by_speakers[x["name"]] = [x["embedding"]] - else: - d_vectors_by_speakers[x["name"]].append(x["embedding"]) - return d_vectors_by_speakers - - def get_mean_d_vector(self, speaker_idx: str, num_samples: int = None, randomize: bool = False) -> np.ndarray: - """Get mean d_vector of a speaker ID. - - Args: - speaker_idx (str): Target speaker ID. - num_samples (int, optional): Number of samples to be averaged. Defaults to None. - randomize (bool, optional): Pick random `num_samples` of d_vectors. Defaults to False. - - Returns: - np.ndarray: Mean d_vector. - """ - d_vectors = self.get_d_vectors_by_speaker(speaker_idx) - if num_samples is None: - d_vectors = np.stack(d_vectors).mean(0) - else: - assert len(d_vectors) >= num_samples, f" [!] speaker {speaker_idx} has number of samples < {num_samples}" - if randomize: - d_vectors = np.stack(random.choices(d_vectors, k=num_samples)).mean(0) - else: - d_vectors = np.stack(d_vectors[:num_samples]).mean(0) - return d_vectors - - def get_random_speaker_id(self) -> Any: - """Get a random d_vector. - - Args: - - Returns: - np.ndarray: d_vector. - """ - if self.speaker_ids: - return self.speaker_ids[random.choices(list(self.speaker_ids.keys()))[0]] - - return None - - def get_random_d_vector(self) -> Any: - """Get a random D ID. - - Args: - - Returns: - np.ndarray: d_vector. - """ - if self.d_vectors: - return self.d_vectors[random.choices(list(self.d_vectors.keys()))[0]]["embedding"] - - return None + return list(self.ids.keys()) def get_speakers(self) -> List: - return self.speaker_ids - - def get_clips(self) -> List: - return sorted(self.d_vectors.keys()) - - def init_speaker_encoder(self, model_path: str, config_path: str) -> None: - """Initialize a speaker encoder model. - - Args: - model_path (str): Model file path. - config_path (str): Model config file path. - """ - self.speaker_encoder_config = load_config(config_path) - self.speaker_encoder = setup_speaker_encoder_model(self.speaker_encoder_config) - self.speaker_encoder_criterion = self.speaker_encoder.load_checkpoint( - self.speaker_encoder_config, model_path, eval=True, use_cuda=self.use_cuda - ) - self.speaker_encoder_ap = AudioProcessor(**self.speaker_encoder_config.audio) - - def compute_d_vector_from_clip(self, wav_file: Union[str, List[str]]) -> list: - """Compute a d_vector from a given audio file. 
- - Args: - wav_file (Union[str, List[str]]): Target file path. - - Returns: - list: Computed d_vector. - """ - - def _compute(wav_file: str): - waveform = self.speaker_encoder_ap.load_wav(wav_file, sr=self.speaker_encoder_ap.sample_rate) - if not self.speaker_encoder_config.model_params.get("use_torch_spec", False): - m_input = self.speaker_encoder_ap.melspectrogram(waveform) - m_input = torch.from_numpy(m_input) - else: - m_input = torch.from_numpy(waveform) - - if self.use_cuda: - m_input = m_input.cuda() - m_input = m_input.unsqueeze(0) - d_vector = self.speaker_encoder.compute_embedding(m_input) - return d_vector - - if isinstance(wav_file, list): - # compute the mean d_vector - d_vectors = None - for wf in wav_file: - d_vector = _compute(wf) - if d_vectors is None: - d_vectors = d_vector - else: - d_vectors += d_vector - return (d_vectors / len(wav_file))[0].tolist() - d_vector = _compute(wav_file) - return d_vector[0].tolist() - - def compute_d_vector(self, feats: Union[torch.Tensor, np.ndarray]) -> List: - """Compute d_vector from features. - - Args: - feats (Union[torch.Tensor, np.ndarray]): Input features. - - Returns: - List: computed d_vector. - """ - if isinstance(feats, np.ndarray): - feats = torch.from_numpy(feats) - if feats.ndim == 2: - feats = feats.unsqueeze(0) - if self.use_cuda: - feats = feats.cuda() - return self.speaker_encoder.compute_embedding(feats) - - def run_umap(self): - # TODO: implement speaker encoder - raise NotImplementedError - - def plot_embeddings(self): - # TODO: implement speaker encoder - raise NotImplementedError + return self.ids @staticmethod def init_from_config(config: "Coqpit", samples: Union[List[List], List[Dict]] = None) -> "SpeakerManager": @@ -420,7 +166,7 @@ def get_speaker_manager(c: Coqpit, data: List = None, restore_path: str = None, speaker_manager = SpeakerManager() if c.use_speaker_embedding: if data is not None: - speaker_manager.set_speaker_ids_from_data(data) + speaker_manager.set_ids_from_data(data, parse_key="speaker_name") if restore_path: speakers_file = _set_file_path(restore_path) # restoring speaker manager from a previous run. @@ -432,27 +178,27 @@ def get_speaker_manager(c: Coqpit, data: List = None, restore_path: str = None, raise RuntimeError( "You must copy the file speakers.json to restore_path, or set a valid file in CONFIG.d_vector_file" ) - speaker_manager.load_d_vectors_file(c.d_vector_file) - speaker_manager.set_d_vectors_from_file(speakers_file) + speaker_manager.load_embeddings_from_file(c.d_vector_file) + speaker_manager.load_embeddings_from_file(speakers_file) elif not c.use_d_vector_file: # restor speaker manager with speaker ID file. - speaker_ids_from_data = speaker_manager.speaker_ids - speaker_manager.set_speaker_ids_from_file(speakers_file) + speaker_ids_from_data = speaker_manager.ids + speaker_manager.load_ids_from_file(speakers_file) assert all( - speaker in speaker_manager.speaker_ids for speaker in speaker_ids_from_data + speaker in speaker_manager.ids for speaker in speaker_ids_from_data ), " [!] You cannot introduce new speakers to a pre-trained model." elif c.use_d_vector_file and c.d_vector_file: # new speaker manager with external speaker embeddings. - speaker_manager.set_d_vectors_from_file(c.d_vector_file) + speaker_manager.load_embeddings_from_file(c.d_vector_file) elif c.use_d_vector_file and not c.d_vector_file: raise "use_d_vector_file is True, so you need pass a external speaker embedding file." 
elif c.use_speaker_embedding and "speakers_file" in c and c.speakers_file: # new speaker manager with speaker IDs file. - speaker_manager.set_speaker_ids_from_file(c.speakers_file) + speaker_manager.load_ids_from_file(c.speakers_file) if speaker_manager.num_speakers > 0: print( " > Speaker manager is loaded with {} speakers: {}".format( - speaker_manager.num_speakers, ", ".join(speaker_manager.speaker_ids) + speaker_manager.num_speakers, ", ".join(speaker_manager.ids) ) ) @@ -461,9 +207,9 @@ def get_speaker_manager(c: Coqpit, data: List = None, restore_path: str = None, out_file_path = os.path.join(out_path, "speakers.json") print(f" > Saving `speakers.json` to {out_file_path}.") if c.use_d_vector_file and c.d_vector_file: - speaker_manager.save_d_vectors_to_file(out_file_path) + speaker_manager.save_embeddings_to_file(out_file_path) else: - speaker_manager.save_speaker_ids_to_file(out_file_path) + speaker_manager.save_ids_to_file(out_file_path) return speaker_manager diff --git a/TTS/utils/synthesizer.py b/TTS/utils/synthesizer.py index eef4086c..1a49f0b0 100644 --- a/TTS/utils/synthesizer.py +++ b/TTS/utils/synthesizer.py @@ -122,7 +122,7 @@ class Synthesizer(object): self.tts_model.cuda() if self.encoder_checkpoint and hasattr(self.tts_model, "speaker_manager"): - self.tts_model.speaker_manager.init_speaker_encoder(self.encoder_checkpoint, self.encoder_config) + self.tts_model.speaker_manager.init_encoder(self.encoder_checkpoint, self.encoder_config) def _set_speaker_encoder_paths_from_tts_config(self): """Set the encoder paths from the tts model config for models with speaker encoders.""" @@ -212,17 +212,17 @@ class Synthesizer(object): # handle multi-speaker speaker_embedding = None speaker_id = None - if self.tts_speakers_file or hasattr(self.tts_model.speaker_manager, "speaker_ids"): + if self.tts_speakers_file or hasattr(self.tts_model.speaker_manager, "ids"): if speaker_name and isinstance(speaker_name, str): if self.tts_config.use_d_vector_file: # get the average speaker embedding from the saved d_vectors. - speaker_embedding = self.tts_model.speaker_manager.get_mean_d_vector( + speaker_embedding = self.tts_model.speaker_manager.get_mean_embedding( speaker_name, num_samples=None, randomize=False ) speaker_embedding = np.array(speaker_embedding)[None, :] # [1 x embedding_dim] else: # get speaker idx from the speaker name - speaker_id = self.tts_model.speaker_manager.speaker_ids[speaker_name] + speaker_id = self.tts_model.speaker_manager.ids[speaker_name] elif not speaker_name and not speaker_wav: raise ValueError( @@ -244,7 +244,7 @@ class Synthesizer(object): hasattr(self.tts_model, "language_manager") and self.tts_model.language_manager is not None ): if language_name and isinstance(language_name, str): - language_id = self.tts_model.language_manager.language_id_mapping[language_name] + language_id = self.tts_model.language_manager.ids[language_name] elif not language_name: raise ValueError( @@ -260,7 +260,7 @@ class Synthesizer(object): # compute a new d_vector from the given clip. if speaker_wav is not None: - speaker_embedding = self.tts_model.speaker_manager.compute_d_vector_from_clip(speaker_wav) + speaker_embedding = self.tts_model.speaker_manager.compute_embedding_from_clip(speaker_wav) use_gl = self.vocoder_model is None @@ -319,7 +319,7 @@ class Synthesizer(object): if reference_speaker_name and isinstance(reference_speaker_name, str): if self.tts_config.use_d_vector_file: # get the speaker embedding from the saved d_vectors. 
- reference_speaker_embedding = self.tts_model.speaker_manager.get_d_vectors_by_speaker( + reference_speaker_embedding = self.tts_model.speaker_manager.get_embeddings_by_name( reference_speaker_name )[0] reference_speaker_embedding = np.array(reference_speaker_embedding)[ @@ -327,9 +327,9 @@ class Synthesizer(object): ] # [1 x embedding_dim] else: # get speaker idx from the speaker name - reference_speaker_id = self.tts_model.speaker_manager.speaker_ids[reference_speaker_name] + reference_speaker_id = self.tts_model.speaker_manager.ids[reference_speaker_name] else: - reference_speaker_embedding = self.tts_model.speaker_manager.compute_d_vector_from_clip( + reference_speaker_embedding = self.tts_model.speaker_manager.compute_embedding_from_clip( reference_wav ) diff --git a/recipes/multilingual/vits_tts/train_vits_tts.py b/recipes/multilingual/vits_tts/train_vits_tts.py index 94692f00..0e650ade 100644 --- a/recipes/multilingual/vits_tts/train_vits_tts.py +++ b/recipes/multilingual/vits_tts/train_vits_tts.py @@ -119,7 +119,7 @@ train_samples, eval_samples = load_tts_samples( # init speaker manager for multi-speaker training # it maps speaker-id to speaker-name in the model and data-loader speaker_manager = SpeakerManager() -speaker_manager.set_speaker_ids_from_data(train_samples + eval_samples) +speaker_manager.set_ids_from_data(train_samples + eval_samples, parse_key="speaker_name") config.model_args.num_speakers = speaker_manager.num_speakers language_manager = LanguageManager(config=config) diff --git a/recipes/vctk/fast_pitch/train_fast_pitch.py b/recipes/vctk/fast_pitch/train_fast_pitch.py index 05cdc72a..c39932da 100644 --- a/recipes/vctk/fast_pitch/train_fast_pitch.py +++ b/recipes/vctk/fast_pitch/train_fast_pitch.py @@ -81,7 +81,7 @@ train_samples, eval_samples = load_tts_samples( # init speaker manager for multi-speaker training # it maps speaker-id to speaker-name in the model and data-loader speaker_manager = SpeakerManager() -speaker_manager.set_speaker_ids_from_data(train_samples + eval_samples) +speaker_manager.set_ids_from_data(train_samples + eval_samples, parse_key="speaker_name") config.model_args.num_speakers = speaker_manager.num_speakers # init model diff --git a/recipes/vctk/fast_speech/train_fast_speech.py b/recipes/vctk/fast_speech/train_fast_speech.py index a294272a..a3249de1 100644 --- a/recipes/vctk/fast_speech/train_fast_speech.py +++ b/recipes/vctk/fast_speech/train_fast_speech.py @@ -79,7 +79,7 @@ train_samples, eval_samples = load_tts_samples( # init speaker manager for multi-speaker training # it maps speaker-id to speaker-name in the model and data-loader speaker_manager = SpeakerManager() -speaker_manager.set_speaker_ids_from_data(train_samples + eval_samples) +speaker_manager.set_ids_from_data(train_samples + eval_samples, parse_key="speaker_name") config.model_args.num_speakers = speaker_manager.num_speakers # init model diff --git a/recipes/vctk/glow_tts/train_glow_tts.py b/recipes/vctk/glow_tts/train_glow_tts.py index 0bf686b1..23c02efc 100644 --- a/recipes/vctk/glow_tts/train_glow_tts.py +++ b/recipes/vctk/glow_tts/train_glow_tts.py @@ -79,7 +79,7 @@ train_samples, eval_samples = load_tts_samples( # init speaker manager for multi-speaker training # it maps speaker-id to speaker-name in the model and data-loader speaker_manager = SpeakerManager() -speaker_manager.set_speaker_ids_from_data(train_samples + eval_samples) +speaker_manager.set_ids_from_data(train_samples + eval_samples, parse_key="speaker_name") config.num_speakers = 
speaker_manager.num_speakers # init model diff --git a/recipes/vctk/speedy_speech/train_speedy_speech.py b/recipes/vctk/speedy_speech/train_speedy_speech.py index 4208a9b6..bcd0105a 100644 --- a/recipes/vctk/speedy_speech/train_speedy_speech.py +++ b/recipes/vctk/speedy_speech/train_speedy_speech.py @@ -79,7 +79,7 @@ train_samples, eval_samples = load_tts_samples( # init speaker manager for multi-speaker training # it maps speaker-id to speaker-name in the model and data-loader speaker_manager = SpeakerManager() -speaker_manager.set_speaker_ids_from_data(train_samples + eval_samples) +speaker_manager.set_ids_from_data(train_samples + eval_samples, parse_key="speaker_name") config.model_args.num_speakers = speaker_manager.num_speakers # init model diff --git a/recipes/vctk/tacotron-DDC/train_tacotron-DDC.py b/recipes/vctk/tacotron-DDC/train_tacotron-DDC.py index d67038a4..36e28ed7 100644 --- a/recipes/vctk/tacotron-DDC/train_tacotron-DDC.py +++ b/recipes/vctk/tacotron-DDC/train_tacotron-DDC.py @@ -82,7 +82,7 @@ train_samples, eval_samples = load_tts_samples( # init speaker manager for multi-speaker training # it mainly handles speaker-id to speaker-name for the model and the data-loader speaker_manager = SpeakerManager() -speaker_manager.set_speaker_ids_from_data(train_samples + eval_samples) +speaker_manager.set_ids_from_data(train_samples + eval_samples, parse_key="speaker_name") # init model model = Tacotron(config, ap, tokenizer, speaker_manager) diff --git a/recipes/vctk/tacotron2-DDC/train_tacotron2-ddc.py b/recipes/vctk/tacotron2-DDC/train_tacotron2-ddc.py index b860df85..d04d91c0 100644 --- a/recipes/vctk/tacotron2-DDC/train_tacotron2-ddc.py +++ b/recipes/vctk/tacotron2-DDC/train_tacotron2-ddc.py @@ -88,7 +88,7 @@ train_samples, eval_samples = load_tts_samples( # init speaker manager for multi-speaker training # it mainly handles speaker-id to speaker-name for the model and the data-loader speaker_manager = SpeakerManager() -speaker_manager.set_speaker_ids_from_data(train_samples + eval_samples) +speaker_manager.set_ids_from_data(train_samples + eval_samples, parse_key="speaker_name") # init model model = Tacotron2(config, ap, tokenizer, speaker_manager) diff --git a/recipes/vctk/tacotron2/train_tacotron2.py b/recipes/vctk/tacotron2/train_tacotron2.py index d27dd78c..5a0e157a 100644 --- a/recipes/vctk/tacotron2/train_tacotron2.py +++ b/recipes/vctk/tacotron2/train_tacotron2.py @@ -88,7 +88,7 @@ train_samples, eval_samples = load_tts_samples( # init speaker manager for multi-speaker training # it mainly handles speaker-id to speaker-name for the model and the data-loader speaker_manager = SpeakerManager() -speaker_manager.set_speaker_ids_from_data(train_samples + eval_samples) +speaker_manager.set_ids_from_data(train_samples + eval_samples, parse_key="speaker_name") # init model model = Tacotron2(config, ap, tokenizer, speaker_manager) diff --git a/recipes/vctk/vits/train_vits.py b/recipes/vctk/vits/train_vits.py index 61d60ca1..88fd7de9 100644 --- a/recipes/vctk/vits/train_vits.py +++ b/recipes/vctk/vits/train_vits.py @@ -89,7 +89,7 @@ train_samples, eval_samples = load_tts_samples( # init speaker manager for multi-speaker training # it maps speaker-id to speaker-name in the model and data-loader speaker_manager = SpeakerManager() -speaker_manager.set_speaker_ids_from_data(train_samples + eval_samples) +speaker_manager.set_ids_from_data(train_samples + eval_samples, parse_key="speaker_name") config.model_args.num_speakers = speaker_manager.num_speakers # init model diff --git 
a/tests/aux_tests/test_speaker_manager.py b/tests/aux_tests/test_speaker_manager.py index 57ff6c50..7552e0a5 100644 --- a/tests/aux_tests/test_speaker_manager.py +++ b/tests/aux_tests/test_speaker_manager.py @@ -6,7 +6,7 @@ import torch from tests import get_tests_input_path from TTS.config import load_config -from TTS.encoder.utils.generic_utils import setup_speaker_encoder_model +from TTS.encoder.utils.generic_utils import setup_encoder_model from TTS.encoder.utils.io import save_checkpoint from TTS.tts.utils.speakers import SpeakerManager from TTS.utils.audio import AudioProcessor @@ -28,7 +28,7 @@ class SpeakerManagerTest(unittest.TestCase): config.audio.resample = True # create a dummy speaker encoder - model = setup_speaker_encoder_model(config) + model = setup_encoder_model(config) save_checkpoint(model, None, None, get_tests_input_path(), 0) # load audio processor and speaker encoder @@ -38,19 +38,19 @@ class SpeakerManagerTest(unittest.TestCase): # load a sample audio and compute embedding waveform = ap.load_wav(sample_wav_path) mel = ap.melspectrogram(waveform) - d_vector = manager.compute_d_vector(mel) + d_vector = manager.compute_embeddings(mel) assert d_vector.shape[1] == 256 # compute d_vector directly from an input file - d_vector = manager.compute_d_vector_from_clip(sample_wav_path) - d_vector2 = manager.compute_d_vector_from_clip(sample_wav_path) + d_vector = manager.compute_embedding_from_clip(sample_wav_path) + d_vector2 = manager.compute_embedding_from_clip(sample_wav_path) d_vector = torch.FloatTensor(d_vector) d_vector2 = torch.FloatTensor(d_vector2) assert d_vector.shape[0] == 256 assert (d_vector - d_vector2).sum() == 0.0 # compute d_vector from a list of wav files. - d_vector3 = manager.compute_d_vector_from_clip([sample_wav_path, sample_wav_path2]) + d_vector3 = manager.compute_embedding_from_clip([sample_wav_path, sample_wav_path2]) d_vector3 = torch.FloatTensor(d_vector3) assert d_vector3.shape[0] == 256 assert (d_vector - d_vector3).sum() != 0.0 @@ -62,14 +62,14 @@ class SpeakerManagerTest(unittest.TestCase): def test_speakers_file_processing(): manager = SpeakerManager(d_vectors_file_path=d_vectors_file_path) print(manager.num_speakers) - print(manager.d_vector_dim) + print(manager.embedding_dim) print(manager.clip_ids) - d_vector = manager.get_d_vector_by_clip(manager.clip_ids[0]) + d_vector = manager.get_embedding_by_clip(manager.clip_ids[0]) assert len(d_vector) == 256 - d_vectors = manager.get_d_vectors_by_speaker(manager.speaker_names[0]) + d_vectors = manager.get_embeddings_by_name(manager.speaker_names[0]) assert len(d_vectors[0]) == 256 - d_vector1 = manager.get_mean_d_vector(manager.speaker_names[0], num_samples=2, randomize=True) + d_vector1 = manager.get_mean_embedding(manager.speaker_names[0], num_samples=2, randomize=True) assert len(d_vector1) == 256 - d_vector2 = manager.get_mean_d_vector(manager.speaker_names[0], num_samples=2, randomize=False) + d_vector2 = manager.get_mean_embedding(manager.speaker_names[0], num_samples=2, randomize=False) assert len(d_vector2) == 256 assert np.sum(np.array(d_vector1) - np.array(d_vector2)) != 0 diff --git a/tests/tts_tests/test_glow_tts.py b/tests/tts_tests/test_glow_tts.py index 2783e4bd..2a723f10 100644 --- a/tests/tts_tests/test_glow_tts.py +++ b/tests/tts_tests/test_glow_tts.py @@ -86,7 +86,7 @@ class TestGlowTTS(unittest.TestCase): model = GlowTTS(config) model.speaker_manager = speaker_manager model.init_multispeaker(config) - self.assertEqual(model.c_in_channels, speaker_manager.d_vector_dim) + 
self.assertEqual(model.c_in_channels, speaker_manager.embedding_dim) self.assertEqual(model.num_speakers, speaker_manager.num_speakers) def test_unlock_act_norm_layers(self): diff --git a/tests/tts_tests/test_vits.py b/tests/tts_tests/test_vits.py index 05adb9ed..de683c81 100644 --- a/tests/tts_tests/test_vits.py +++ b/tests/tts_tests/test_vits.py @@ -7,7 +7,7 @@ from trainer.logging.tensorboard_logger import TensorboardLogger from tests import assertHasAttr, assertHasNotAttr, get_tests_data_path, get_tests_input_path, get_tests_output_path from TTS.config import load_config -from TTS.encoder.utils.generic_utils import setup_speaker_encoder_model +from TTS.encoder.utils.generic_utils import setup_encoder_model from TTS.tts.configs.vits_config import VitsConfig from TTS.tts.models.vits import Vits, VitsArgs, amp_to_db, db_to_amp, load_audio, spec_to_mel, wav_to_mel, wav_to_spec from TTS.tts.utils.speakers import SpeakerManager @@ -242,9 +242,9 @@ class TestVits(unittest.TestCase): speaker_encoder_config = load_config(SPEAKER_ENCODER_CONFIG) speaker_encoder_config.model_params["use_torch_spec"] = True - speaker_encoder = setup_speaker_encoder_model(speaker_encoder_config).to(device) + speaker_encoder = setup_encoder_model(speaker_encoder_config).to(device) speaker_manager = SpeakerManager() - speaker_manager.speaker_encoder = speaker_encoder + speaker_manager.encoder = speaker_encoder args = VitsArgs( language_ids_file=LANG_FILE, diff --git a/tests/zoo_tests/test_models.py b/tests/zoo_tests/test_models.py index 63d9e7ca..e614ce74 100644 --- a/tests/zoo_tests/test_models.py +++ b/tests/zoo_tests/test_models.py @@ -38,7 +38,7 @@ def test_run_all_models(): language_manager = LanguageManager(language_ids_file_path=language_files[0]) language_id = language_manager.language_names[0] - speaker_id = list(speaker_manager.speaker_ids.keys())[0] + speaker_id = list(speaker_manager.ids.keys())[0] run_cli( f"tts --model_name {model_name} " f'--text "This is an example." 
--out_path "{output_path}" --speaker_idx "{speaker_id}" --language_idx "{language_id}" ' From 164c7dd67618792bfcb3a5605ed222f74b539001 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Fri, 8 Apr 2022 14:47:09 +0200 Subject: [PATCH 31/38] Update requirements coqui_trainer -> trainer (#1478) --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index db47c2cc..50c0d2ac 100644 --- a/requirements.txt +++ b/requirements.txt @@ -24,7 +24,7 @@ matplotlib tensorboardX pyworld # coqui stack -coqui-trainer +trainer coqpit # config management # chinese g2p deps jieba From 27fcb5dabf265e74ee463c3fc11c4709ca5e7b25 Mon Sep 17 00:00:00 2001 From: Reuben Morais Date: Fri, 15 Apr 2022 01:13:32 +0200 Subject: [PATCH 32/38] Add Dockerfile and build/push CI --- .dockerignore | 3 +- .github/workflows/docker.yaml | 56 +++++++++++++++++++++++++++++++++++ Dockerfile | 11 +++++++ 3 files changed, 69 insertions(+), 1 deletion(-) create mode 100644 .github/workflows/docker.yaml create mode 100644 Dockerfile diff --git a/.dockerignore b/.dockerignore index 4032ec6b..2833d344 100644 --- a/.dockerignore +++ b/.dockerignore @@ -1 +1,2 @@ -.git/ \ No newline at end of file +.git/ +Dockerfile diff --git a/.github/workflows/docker.yaml b/.github/workflows/docker.yaml new file mode 100644 index 00000000..457649a2 --- /dev/null +++ b/.github/workflows/docker.yaml @@ -0,0 +1,56 @@ +name: "Docker build and push" +on: + pull_request: + push: + branches: + - main + - dev + tags: + - v* +jobs: + docker-build: + name: "Build and push Docker image" + runs-on: ubuntu-20.04 + strategy: + matrix: + arch: ["amd64"] + steps: + - uses: actions/checkout@v2 + - name: Log in to the Container registry + uses: docker/login-action@v1 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + - name: Compute Docker tags, check VERSION file matches tag + id: compute-tag + run: | + set -ex + base="ghcr.io/coqui-ai/tts" + tags="" # PR build + if [[ "${{ startsWith(github.ref, 'refs/heads/') }}" = "true" ]]; then + # Push to branch + github_ref="${{ github.ref }}" + branch=${github_ref#*refs/heads/} # strip prefix to get branch name + tags="${base}:${branch},${base}:${{ github.sha }}," + elif [[ "${{ startsWith(github.ref, 'refs/tags/') }}" = "true" ]]; then + VERSION="v$(cat TTS/VERSION)" + if [[ "${{ github.ref }}" != "refs/tags/${VERSION}" ]]; then + echo "Pushed tag does not match VERSION file. Aborting push." + exit 1 + fi + tags="${base}:${VERSION},${base}:latest,${base}:${{ github.sha }}" + fi + echo "::set-output name=tags::${tags}" + - name: Set up QEMU + uses: docker/setup-qemu-action@v1 + - name: Set up Docker Buildx + id: buildx + uses: docker/setup-buildx-action@v1 + - name: Build and push + uses: docker/build-push-action@v2 + with: + context: . + platforms: linux/${{ matrix.arch }} + push: ${{ github.event_name == 'push' }} + tags: ${{ steps.compute-tag.outputs.tags }} diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 00000000..8dab3b30 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,11 @@ +FROM nvcr.io/nvidia/pytorch:22.03-py3 +RUN apt-get update && apt-get install -y --no-install-recommends espeak && rm -rf /var/lib/apt/lists/* +WORKDIR /root +COPY requirements.txt /root +COPY requirements.dev.txt /root +COPY requirements.notebooks.txt /root +RUN pip install -r <(cat requirements.txt requirements.dev.txt requirements.notebooks.txt) +COPY . 
/root +RUN make install +ENTRYPOINT ["tts"] +CMD ["--help"] From e8573bfe3e692613920a0199c984deed7f0d9cfe Mon Sep 17 00:00:00 2001 From: jackiexiao <707610215@qq.com> Date: Fri, 15 Apr 2022 20:43:46 +0800 Subject: [PATCH 33/38] Update CONTRIBUTING.md (#1463) fix header ``` ## Call for sharing language models ``` --- CONTRIBUTING.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 7175cf34..81a426e8 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -26,7 +26,8 @@ If you like to contribute code, squash a bug but if you don't know where to star We list all the target improvements for the next version. You can pick one of them and start contributing. - Also feel free to suggest new features, ideas and models. We're always open for new things. -#####Call for sharing language models + +## Call for sharing language models If possible, please consider sharing your pre-trained models in any language (if the licences allow for you to do so). We will include them in our model catalogue for public use and give the proper attribution, whether it be your name, company, website or any other source specified. This model can be shared in two ways: @@ -36,6 +37,7 @@ This model can be shared in two ways: Models are served under `.models.json` file and any model is available under TTS CLI or Server end points. Either way you choose, please make sure you send the models [here](https://github.com/coqui-ai/TTS/issues/380). + ## Sending a ✨**PR**✨ If you have a new feature, a model to implement, or a bug to squash, go ahead and send a ✨**PR**✨. From 4953636b1466a5e9fd5e73aa9afeaaeea8bb19dd Mon Sep 17 00:00:00 2001 From: WeberJulian Date: Tue, 19 Apr 2022 14:18:30 +0200 Subject: [PATCH 34/38] Add African models (#1511) * Add african models * Set default license for all models --- TTS/.models.json | 112 +++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 99 insertions(+), 13 deletions(-) diff --git a/TTS/.models.json b/TTS/.models.json index 24838a5d..4870bc1f 100644 --- a/TTS/.models.json +++ b/TTS/.models.json @@ -18,7 +18,8 @@ "description": "EK1 en-rp tacotron2 by NMStoker", "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ek1--tacotron2.zip", "default_vocoder": "vocoder_models/en/ek1/wavegrad", - "commit": "c802255" + "commit": "c802255", + "license": "apache 2.0" } }, "ljspeech": { @@ -28,7 +29,7 @@ "default_vocoder": "vocoder_models/en/ljspeech/hifigan_v2", "commit": "bae2ad0f", "author": "Eren Gölge @erogol", - "license": "", + "license": "apache 2.0", "contact": "egolge@coqui.com" }, "tacotron2-DDC_ph": { @@ -37,7 +38,7 @@ "default_vocoder": "vocoder_models/en/ljspeech/univnet", "commit": "3900448", "author": "Eren Gölge @erogol", - "license": "", + "license": "apache 2.0", "contact": "egolge@coqui.com" }, "glow-tts": { @@ -57,7 +58,7 @@ "default_vocoder": "vocoder_models/en/ljspeech/hifigan_v2", "commit": "4581e3d", "author": "Eren Gölge @erogol", - "license": "TBD", + "license": "apache 2.0", "contact": "egolge@coqui.com" }, "tacotron2-DCA": { @@ -75,7 +76,7 @@ "default_vocoder": null, "commit": "3900448", "author": "Eren Gölge @erogol", - "license": "TBD", + "license": "apache 2.0", "contact": "egolge@coqui.com" }, "fast_pitch": { @@ -84,7 +85,7 @@ "default_vocoder": "vocoder_models/en/ljspeech/hifigan_v2", "commit": "b27b3ba", "author": "Eren Gölge @erogol", - "license": "TBD", + "license": "apache 2.0", "contact": "egolge@coqui.com" } }, @@ -95,7 +96,7 @@ "default_vocoder": null, "commit": 
"3900448", "author": "Eren @erogol", - "license": "", + "license": "apache 2.0", "contact": "egolge@coqui.ai" }, "fast_pitch":{ @@ -115,7 +116,7 @@ "default_vocoder": "vocoder_models/en/sam/hifigan_v2", "commit": "bae2ad0f", "author": "Eren Gölge @erogol", - "license": "", + "license": "apache 2.0", "contact": "egolge@coqui.com" } } @@ -162,6 +163,7 @@ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--zh-CN--baker--tacotron2-DDC-GST.zip", "commit": "unknown", "author": "@kirianguiller", + "license": "apache 2.0", "default_vocoder": null } } @@ -171,6 +173,7 @@ "tacotron2-DDC": { "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--nl--mai--tacotron2-DDC.zip", "author": "@r-dh", + "license": "apache 2.0", "default_vocoder": "vocoder_models/nl/mai/parallel-wavegan", "stats_file": null, "commit": "540d811" @@ -183,6 +186,7 @@ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--de--thorsten--tacotron2-DCA.zip", "default_vocoder": "vocoder_models/de/thorsten/fullband-melgan", "author": "@thorstenMueller", + "license": "apache 2.0", "commit": "unknown" } } @@ -194,6 +198,7 @@ "default_vocoder": "vocoder_models/ja/kokoro/hifigan_v1", "description": "Tacotron2 with Double Decoder Consistency trained with Kokoro Speech Dataset.", "author": "@kaiidams", + "license": "apache 2.0", "commit": "401fbd89" } } @@ -217,6 +222,7 @@ "default_vocoder": null, "description": "GlowTTS model as explained on https://github.com/coqui-ai/TTS/issues/1148.", "author": "@nicolalandro", + "license": "apache 2.0", "commit": null }, "vits":{ @@ -224,6 +230,7 @@ "default_vocoder": null, "description": "GlowTTS model as explained on https://github.com/coqui-ai/TTS/issues/1148.", "author": "@nicolalandro", + "license": "apache 2.0", "commit": null } }, @@ -233,6 +240,7 @@ "default_vocoder": null, "description": "GlowTTS model as explained on https://github.com/coqui-ai/TTS/issues/1148.", "author": "@nicolalandro", + "license": "apache 2.0", "commit": null }, "vits":{ @@ -240,9 +248,82 @@ "default_vocoder": null, "description": "GlowTTS model as explained on https://github.com/coqui-ai/TTS/issues/1148.", "author": "@nicolalandro", + "license": "apache 2.0", "commit": null } } + }, + "ewe": { + "openbible": { + "vits":{ + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--ewe--openbible--vits.zip", + "default_vocoder": null, + "license": "CC-BY-SA 4.0", + "description": "Original work (audio and text) by Biblica available for free at www.biblica.com and open.bible.", + "author": "@coqui_ai", + "commit": "1b22f03" + } + } + }, + "hau": { + "openbible": { + "vits":{ + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--hau--openbible--vits.zip", + "default_vocoder": null, + "license": "CC-BY-SA 4.0", + "description": "Original work (audio and text) by Biblica available for free at www.biblica.com and open.bible.", + "author": "@coqui_ai", + "commit": "1b22f03" + } + } + }, + "lin": { + "openbible": { + "vits":{ + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--lin--openbible--vits.zip", + "default_vocoder": null, + "license": "CC-BY-SA 4.0", + "description": "Original work (audio and text) by Biblica available for free at www.biblica.com and open.bible.", + "author": "@coqui_ai", + "commit": "1b22f03" + } + } + }, + "tw_akuapem": { + "openbible": { + "vits":{ + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--tw_akuapem--openbible--vits.zip", + 
"default_vocoder": null, + "license": "CC-BY-SA 4.0", + "description": "Original work (audio and text) by Biblica available for free at www.biblica.com and open.bible.", + "author": "@coqui_ai", + "commit": "1b22f03" + } + } + }, + "tw_asante": { + "openbible": { + "vits":{ + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--tw_asante--openbible--vits.zip", + "default_vocoder": null, + "license": "CC-BY-SA 4.0", + "description": "Original work (audio and text) by Biblica available for free at www.biblica.com and open.bible.", + "author": "@coqui_ai", + "commit": "1b22f03" + } + } + }, + "yor": { + "openbible": { + "vits":{ + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--yor--openbible--vits.zip", + "default_vocoder": null, + "license": "CC-BY-SA 4.0", + "description": "Original work (audio and text) by Biblica available for free at www.biblica.com and open.bible.", + "author": "@coqui_ai", + "commit": "1b22f03" + } + } } }, "vocoder_models": { @@ -269,7 +350,8 @@ "wavegrad": { "description": "EK1 en-rp wavegrad by NMStoker", "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--en--ek1--wavegrad.zip", - "commit": "c802255" + "commit": "c802255", + "license": "apache 2.0" } }, "ljspeech": { @@ -285,7 +367,7 @@ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--en--ljspeech--hifigan_v2.zip", "commit": "bae2ad0f", "author": "@erogol", - "license": "", + "license": "apache 2.0", "contact": "egolge@coqui.ai" }, "univnet": { @@ -293,7 +375,7 @@ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--en--ljspeech--univnet_v2.zip", "commit": "4581e3d", "author": "Eren @erogol", - "license": "TBD", + "license": "apache 2.0", "contact": "egolge@coqui.ai" } }, @@ -303,7 +385,7 @@ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--en--vctk--hifigan_v2.zip", "commit": "2f07160", "author": "Edresson Casanova", - "license": "", + "license": "apache 2.0", "contact": "" } }, @@ -313,7 +395,7 @@ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--en--sam--hifigan_v2.zip", "commit": "2f07160", "author": "Eren Gölge @erogol", - "license": "", + "license": "apache 2.0", "contact": "egolge@coqui.ai" } } @@ -323,6 +405,7 @@ "parallel-wavegan": { "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--nl--mai--parallel-wavegan.zip", "author": "@r-dh", + "license": "apache 2.0", "commit": "unknown" } } @@ -332,11 +415,13 @@ "wavegrad": { "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--de--thorsten--wavegrad.zip", "author": "@thorstenMueller", + "license": "apache 2.0", "commit": "unknown" }, "fullband-melgan": { "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--de--thorsten--fullband-melgan.zip", "author": "@thorstenMueller", + "license": "apache 2.0", "commit": "unknown" } } @@ -347,6 +432,7 @@ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--ja--kokoro--hifigan_v1.zip", "description": "HifiGAN model trained for kokoro dataset by @kaiidams", "author": "@kaiidams", + "license": "apache 2.0", "commit": "3900448" } } From 7133f8f47d6c0ed0ce4c3beefeb8112ce94d7f6d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Tue, 19 Apr 2022 14:18:49 +0200 Subject: [PATCH 35/38] Print Model's license when downloading (#1512) * Print model license while downloading * Make style * Add a new license link * Make 
style --- TTS/utils/manage.py | 29 ++++++++++++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/TTS/utils/manage.py b/TTS/utils/manage.py index 674d5a47..0ef3675b 100644 --- a/TTS/utils/manage.py +++ b/TTS/utils/manage.py @@ -4,13 +4,23 @@ import os import zipfile from pathlib import Path from shutil import copyfile, rmtree -from typing import Tuple +from typing import Dict, Tuple import requests from TTS.config import load_config from TTS.utils.generic_utils import get_user_data_dir +LICENSE_URLS = { + "cc by-nc-nd 4.0": "https://creativecommons.org/licenses/by-nc-nd/4.0/", + "mpl": "https://www.mozilla.org/en-US/MPL/2.0/", + "mpl2": "https://www.mozilla.org/en-US/MPL/2.0/", + "mit": "https://choosealicense.com/licenses/mit/", + "apache2.0": "https://choosealicense.com/licenses/apache-2.0/", + "apache2": "https://choosealicense.com/licenses/apache-2.0/", + "cc-by-sa 4.0": "https://creativecommons.org/licenses/by-sa/4.0/", +} + class ModelManager(object): """Manage TTS models defined in .models.json. @@ -108,6 +118,22 @@ class ModelManager(object): for dataset in self.models_dict[model_type][lang]: print(f" >: {model_type}/{lang}/{dataset}") + @staticmethod + def print_model_license(model_item: Dict): + """Print the license of a model + + Args: + model_item (dict): model item in the models.json + """ + if "license" in model_item and model_item["license"].strip() != "": + print(f" > Model's license - {model_item['license']}") + if model_item["license"].lower() in LICENSE_URLS: + print(f" > Check {LICENSE_URLS[model_item['license'].lower()]} for more info.") + else: + print(" > Check https://opensource.org/licenses for more info.") + else: + print(" > Model's license - No license information available") + def download_model(self, model_name): """Download model files given the full model name. 
Model name is in the format @@ -135,6 +161,7 @@ class ModelManager(object): print(f" > Downloading model to {output_path}") # download from github release self._download_zip_file(model_item["github_rls_url"], output_path) + self.print_model_license(model_item=model_item) # find downloaded files output_model_path, output_config_path = self._find_files(output_path) # update paths in the config.json From b45d5c5c60fc4399af67f2281fb92667de7b0b57 Mon Sep 17 00:00:00 2001 From: Yanlong Wang Date: Tue, 19 Apr 2022 20:24:34 +0800 Subject: [PATCH 36/38] Improve docsQA default questions (#1411) --- docs/source/_templates/page.html | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/source/_templates/page.html b/docs/source/_templates/page.html index aab3d977..2c6ef4ee 100644 --- a/docs/source/_templates/page.html +++ b/docs/source/_templates/page.html @@ -13,9 +13,9 @@ From 30bea7d53cacffb8732dcdf51b053952005aea1d Mon Sep 17 00:00:00 2001 From: WeberJulian Date: Tue, 19 Apr 2022 14:27:32 +0200 Subject: [PATCH 37/38] Update manage.py (#1514) --- TTS/utils/manage.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/TTS/utils/manage.py b/TTS/utils/manage.py index 0ef3675b..87cb5592 100644 --- a/TTS/utils/manage.py +++ b/TTS/utils/manage.py @@ -15,8 +15,9 @@ LICENSE_URLS = { "cc by-nc-nd 4.0": "https://creativecommons.org/licenses/by-nc-nd/4.0/", "mpl": "https://www.mozilla.org/en-US/MPL/2.0/", "mpl2": "https://www.mozilla.org/en-US/MPL/2.0/", + "mpl 2.0": "https://www.mozilla.org/en-US/MPL/2.0/", "mit": "https://choosealicense.com/licenses/mit/", - "apache2.0": "https://choosealicense.com/licenses/apache-2.0/", + "apache 2.0": "https://choosealicense.com/licenses/apache-2.0/", "apache2": "https://choosealicense.com/licenses/apache-2.0/", "cc-by-sa 4.0": "https://creativecommons.org/licenses/by-sa/4.0/", } From c410bc58ef3bd07b72ab05d29bbdc2a6df47afea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Wed, 20 Apr 2022 11:46:26 +0200 Subject: [PATCH 38/38] Bump to v0.6.2 --- TTS/VERSION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/TTS/VERSION b/TTS/VERSION index 7ceb0404..b1d7abc0 100644 --- a/TTS/VERSION +++ b/TTS/VERSION @@ -1 +1 @@ -0.6.1 \ No newline at end of file +0.6.2 \ No newline at end of file
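
For reference, a minimal usage sketch of the renamed speaker/embedding manager API introduced by this series, assuming 🐸TTS is importable and that "speakers.json" (a placeholder path) holds precomputed d-vectors from an earlier run; it mirrors the calls exercised in tests/aux_tests/test_speaker_manager.py:

from TTS.tts.utils.speakers import SpeakerManager

# Load precomputed speaker embeddings (formerly "d_vectors") from a JSON file.
# "speakers.json" is a placeholder for a file produced by a previous run.
manager = SpeakerManager(d_vectors_file_path="speakers.json")

print(manager.num_speakers)   # speaker names now live in `manager.ids` (was `speaker_ids`)
print(manager.embedding_dim)  # was `d_vector_dim`

# was `get_d_vector_by_clip` / `get_d_vectors_by_speaker` / `get_mean_d_vector`
clip_embedding = manager.get_embedding_by_clip(manager.clip_ids[0])
all_embeddings = manager.get_embeddings_by_name(manager.speaker_names[0])
mean_embedding = manager.get_mean_embedding(manager.speaker_names[0], num_samples=2, randomize=False)

Computing an embedding straight from audio (compute_embedding_from_clip, formerly compute_d_vector_from_clip) additionally requires an encoder initialised via init_encoder(model_path, config_path); the old names survive only in the constructor arguments and the use_d_vector_file/d_vector_file config fields.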