From c670365507e55ec857505792358d4d43b063fe95 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Tue, 8 Mar 2022 14:20:14 +0100 Subject: [PATCH 01/38] Fix VCTK recipe and formatter --- TTS/tts/datasets/formatters.py | 4 ++-- recipes/vctk/vits/train_vits.py | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/TTS/tts/datasets/formatters.py b/TTS/tts/datasets/formatters.py index aacfc647..fa8d79bc 100644 --- a/TTS/tts/datasets/formatters.py +++ b/TTS/tts/datasets/formatters.py @@ -328,7 +328,7 @@ def vctk(root_path, meta_files=None, wavs_path="wav48_silence_trimmed", mic="mic else: wav_file = os.path.join(root_path, wavs_path, speaker_id, file_id + f"_{mic}.{file_ext}") if os.path.exists(wav_file): - items.append([text, wav_file, "VCTK_" + speaker_id]) + items.append({"text": text, "audio_file": wav_file, "speaker_name": "VCTK_" + speaker_id}) else: print(f" [!] wav files don't exist - {wav_file}") return items @@ -348,7 +348,7 @@ def vctk_old(root_path, meta_files=None, wavs_path="wav48"): with open(meta_file, "r", encoding="utf-8") as file_text: text = file_text.readlines()[0] wav_file = os.path.join(root_path, wavs_path, speaker_id, file_id + ".wav") - items.append([text, wav_file, "VCTK_old_" + speaker_id]) + items.append({"text": text, "audio_file": wav_file, "speaker_name": "VCTK_old_" + speaker_id}) return items diff --git a/recipes/vctk/vits/train_vits.py b/recipes/vctk/vits/train_vits.py index dff4eefc..84e8a058 100644 --- a/recipes/vctk/vits/train_vits.py +++ b/recipes/vctk/vits/train_vits.py @@ -53,6 +53,7 @@ config = VitsConfig( epochs=1000, text_cleaner="english_cleaners", use_phonemes=True, + phoneme_language="en", phoneme_cache_path=os.path.join(output_path, "phoneme_cache"), compute_input_seq_cache=True, print_step=25, From d792b78703b644bdf1f1a06686df73d3da7b90a8 Mon Sep 17 00:00:00 2001 From: Edresson Casanova Date: Wed, 9 Mar 2022 12:18:17 -0300 Subject: [PATCH 02/38] Fix multilingual recipe (#1354) --- .../multilingual/vits_tts/train_vits_tts.py | 31 +++++++++++++------ 1 file changed, 21 insertions(+), 10 deletions(-) diff --git a/recipes/multilingual/vits_tts/train_vits_tts.py b/recipes/multilingual/vits_tts/train_vits_tts.py index ac2c21a2..3f35275a 100644 --- a/recipes/multilingual/vits_tts/train_vits_tts.py +++ b/recipes/multilingual/vits_tts/train_vits_tts.py @@ -6,9 +6,11 @@ from trainer import Trainer, TrainerArgs from TTS.config.shared_configs import BaseAudioConfig from TTS.tts.configs.shared_configs import BaseDatasetConfig from TTS.tts.configs.vits_config import VitsConfig +from TTS.tts.models.vits import CharactersConfig from TTS.tts.datasets import load_tts_samples from TTS.tts.models.vits import Vits, VitsArgs from TTS.tts.utils.languages import LanguageManager +from TTS.tts.utils.text.tokenizer import TTSTokenizer from TTS.tts.utils.speakers import SpeakerManager from TTS.utils.audio import AudioProcessor @@ -73,15 +75,16 @@ config = VitsConfig( max_audio_len=160000, output_path=output_path, datasets=dataset_config, - characters={ - "pad": "_", - "eos": "&", - "bos": "*", - "characters": "!¡'(),-.:;¿?abcdefghijklmnopqrstuvwxyzµßàáâäåæçèéêëìíîïñòóôöùúûüąćęłńœśşźżƒабвгдежзийклмнопрстуфхцчшщъыьэюяёєіїґӧ «°±µ»$%&‘’‚“`”„", - "punctuations": "!¡'(),-.:;¿? 
", - "phonemes": None, - "unique": True, - }, + characters=CharactersConfig( + characters_class="TTS.tts.models.vits.VitsCharacters", + pad="", + eos="", + bos="", + blank="", + characters="!¡'(),-.:;¿?abcdefghijklmnopqrstuvwxyzµßàáâäåæçèéêëìíîïñòóôöùúûüąćęłńœśşźżƒабвгдежзийклмнопрстуфхцчшщъыьэюяёєіїґӧ «°±µ»$%&‘’‚“`”„", + punctuations="!¡'(),-.:;¿? ", + phonemes=None, + ), test_sentences=[ [ "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.", @@ -100,6 +103,9 @@ config = VitsConfig( ], ) +# force the convertion of the custom characters to a config attribute +config.from_dict(config.to_dict()) + # init audio processor ap = AudioProcessor(**config.audio.to_dict()) @@ -115,8 +121,13 @@ config.model_args.num_speakers = speaker_manager.num_speakers language_manager = LanguageManager(config=config) config.model_args.num_languages = language_manager.num_languages +# INITIALIZE THE TOKENIZER +# Tokenizer is used to convert text to sequences of token IDs. +# config is updated with the default characters if not defined in the config. +tokenizer, config = TTSTokenizer.init_from_config(config) + # init model -model = Vits(config, speaker_manager, language_manager) +model = Vits(config, ap, tokenizer, speaker_manager, language_manager) # init the trainer and 🚀 trainer = Trainer( From 48f6bb405ac90295368ec53329e87055fbf809bc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Thu, 10 Mar 2022 11:36:38 +0100 Subject: [PATCH 03/38] Fix recipes as to the recent API changes. (#1367) * Fix recipes -> #1366 * Fix trainer docs --- docs/source/main_classes/trainer_api.md | 16 +--------------- recipes/ljspeech/hifigan/train_hifigan.py | 5 ++--- .../multiband_melgan/train_multiband_melgan.py | 5 ++--- .../ljspeech/tacotron2-DDC/train_tacotron_ddc.py | 3 +-- recipes/ljspeech/univnet/train.py | 5 ++--- recipes/multilingual/vits_tts/train_vits_tts.py | 3 +-- 6 files changed, 9 insertions(+), 28 deletions(-) diff --git a/docs/source/main_classes/trainer_api.md b/docs/source/main_classes/trainer_api.md index a5c3cfb7..f765fff7 100644 --- a/docs/source/main_classes/trainer_api.md +++ b/docs/source/main_classes/trainer_api.md @@ -1,17 +1,3 @@ # Trainer API -The {class}`TTS.trainer.Trainer` provides a lightweight, extensible, and feature-complete training run-time. We optimized it for 🐸 but -can also be used for any DL training in different domains. It supports distributed multi-gpu, mixed-precision (apex or torch.amp) training. - - -## Trainer -```{eval-rst} -.. autoclass:: TTS.trainer.Trainer - :members: -``` - -## TrainingArgs -```{eval-rst} -.. 
autoclass:: TTS.trainer.TrainingArgs - :members: -``` \ No newline at end of file +We made the trainer a seprate project on https://github.com/coqui-ai/Trainer diff --git a/recipes/ljspeech/hifigan/train_hifigan.py b/recipes/ljspeech/hifigan/train_hifigan.py index 1e5bbf30..6a739009 100644 --- a/recipes/ljspeech/hifigan/train_hifigan.py +++ b/recipes/ljspeech/hifigan/train_hifigan.py @@ -37,7 +37,7 @@ ap = AudioProcessor(**config.audio.to_dict()) eval_samples, train_samples = load_wav_data(config.data_path, config.eval_split_size) # init model -model = GAN(config) +model = GAN(config, ap) # init the trainer and 🚀 trainer = Trainer( @@ -46,7 +46,6 @@ trainer = Trainer( output_path, model=model, train_samples=train_samples, - eval_samples=eval_samples, - training_assets={"audio_processor": ap}, + eval_samples=eval_samples ) trainer.fit() diff --git a/recipes/ljspeech/multiband_melgan/train_multiband_melgan.py b/recipes/ljspeech/multiband_melgan/train_multiband_melgan.py index 40ff5a00..d5ca9a76 100644 --- a/recipes/ljspeech/multiband_melgan/train_multiband_melgan.py +++ b/recipes/ljspeech/multiband_melgan/train_multiband_melgan.py @@ -37,7 +37,7 @@ ap = AudioProcessor(**config.audio.to_dict()) eval_samples, train_samples = load_wav_data(config.data_path, config.eval_split_size) # init model -model = GAN(config) +model = GAN(config, ap) # init the trainer and 🚀 trainer = Trainer( @@ -46,7 +46,6 @@ trainer = Trainer( output_path, model=model, train_samples=train_samples, - eval_samples=eval_samples, - training_assets={"audio_processor": ap}, + eval_samples=eval_samples ) trainer.fit() diff --git a/recipes/ljspeech/tacotron2-DDC/train_tacotron_ddc.py b/recipes/ljspeech/tacotron2-DDC/train_tacotron_ddc.py index d00f8ed7..a0ff8b02 100644 --- a/recipes/ljspeech/tacotron2-DDC/train_tacotron_ddc.py +++ b/recipes/ljspeech/tacotron2-DDC/train_tacotron_ddc.py @@ -89,7 +89,6 @@ trainer = Trainer( output_path, model=model, train_samples=train_samples, - eval_samples=eval_samples, - training_assets={"audio_processor": ap}, + eval_samples=eval_samples ) trainer.fit() diff --git a/recipes/ljspeech/univnet/train.py b/recipes/ljspeech/univnet/train.py index 19c91925..592b9a76 100644 --- a/recipes/ljspeech/univnet/train.py +++ b/recipes/ljspeech/univnet/train.py @@ -36,7 +36,7 @@ ap = AudioProcessor(**config.audio.to_dict()) eval_samples, train_samples = load_wav_data(config.data_path, config.eval_split_size) # init model -model = GAN(config) +model = GAN(config, ap) # init the trainer and 🚀 trainer = Trainer( @@ -45,7 +45,6 @@ trainer = Trainer( output_path, model=model, train_samples=train_samples, - eval_samples=eval_samples, - training_assets={"audio_processor": ap}, + eval_samples=eval_samples ) trainer.fit() diff --git a/recipes/multilingual/vits_tts/train_vits_tts.py b/recipes/multilingual/vits_tts/train_vits_tts.py index 3f35275a..c4ed0dda 100644 --- a/recipes/multilingual/vits_tts/train_vits_tts.py +++ b/recipes/multilingual/vits_tts/train_vits_tts.py @@ -136,7 +136,6 @@ trainer = Trainer( output_path, model=model, train_samples=train_samples, - eval_samples=eval_samples, - training_assets={"audio_processor": ap}, + eval_samples=eval_samples ) trainer.fit() From 8a007c8834a6212513e6757a04b29558c534ddf5 Mon Sep 17 00:00:00 2001 From: Yanlong Wang Date: Thu, 10 Mar 2022 18:40:06 +0800 Subject: [PATCH 04/38] feat: add docsqa to docs website (#1363) --- docs/source/_templates/page.html | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100644 docs/source/_templates/page.html diff 
--git a/docs/source/_templates/page.html b/docs/source/_templates/page.html new file mode 100644 index 00000000..b86c33e7 --- /dev/null +++ b/docs/source/_templates/page.html @@ -0,0 +1,23 @@ +{% extends "!page.html" %} +{% block scripts %} + {{ super() }} + + + + + + + +{% endblock %} From 07d96f7991b99ae74e6b2bfe167d0bfa1753bf64 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Thu, 10 Mar 2022 12:17:06 +0100 Subject: [PATCH 05/38] Fix DocQA title --- docs/source/_templates/page.html | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/source/_templates/page.html b/docs/source/_templates/page.html index b86c33e7..aab3d977 100644 --- a/docs/source/_templates/page.html +++ b/docs/source/_templates/page.html @@ -3,10 +3,10 @@ {{ super() }} - - From f381e29b912ba85732bde026a011b74b7731aa0f Mon Sep 17 00:00:00 2001 From: Edresson Casanova Date: Thu, 10 Mar 2022 10:54:51 -0300 Subject: [PATCH 06/38] REBASED: Add support for the speaker encoder training using torch spectrograms (#1348) * Add support for the speaker encoder training using torch spectrograms * Remove useless function in speaker encoder dataset class --- TTS/bin/train_encoder.py | 1 + TTS/speaker_encoder/dataset.py | 26 ++++++++------------------ TTS/tts/datasets/formatters.py | 10 +++++----- 3 files changed, 14 insertions(+), 23 deletions(-) diff --git a/TTS/bin/train_encoder.py b/TTS/bin/train_encoder.py index 5828411c..b7424698 100644 --- a/TTS/bin/train_encoder.py +++ b/TTS/bin/train_encoder.py @@ -46,6 +46,7 @@ def setup_loader(ap: AudioProcessor, is_val: bool = False, verbose: bool = False sample_from_storage_p=c.storage["sample_from_storage_p"], verbose=verbose, augmentation_config=c.audio_augmentation, + use_torch_spec=c.model_params.get("use_torch_spec", False), ) # sampler = DistributedSampler(dataset) if num_gpus > 1 else None diff --git a/TTS/speaker_encoder/dataset.py b/TTS/speaker_encoder/dataset.py index 28a23e2f..07fa9246 100644 --- a/TTS/speaker_encoder/dataset.py +++ b/TTS/speaker_encoder/dataset.py @@ -20,6 +20,7 @@ class SpeakerEncoderDataset(Dataset): skip_speakers=False, verbose=False, augmentation_config=None, + use_torch_spec=None, ): """ Args: @@ -37,6 +38,7 @@ class SpeakerEncoderDataset(Dataset): self.skip_speakers = skip_speakers self.ap = ap self.verbose = verbose + self.use_torch_spec = use_torch_spec self.__parse_items() storage_max_size = storage_size * num_speakers_in_batch self.storage = Storage( @@ -72,22 +74,6 @@ class SpeakerEncoderDataset(Dataset): audio = self.ap.load_wav(filename, sr=self.ap.sample_rate) return audio - def load_data(self, idx): - text, wav_file, speaker_name = self.items[idx] - wav = np.asarray(self.load_wav(wav_file), dtype=np.float32) - mel = self.ap.melspectrogram(wav).astype("float32") - # sample seq_len - - assert text.size > 0, self.items[idx]["audio_file"] - assert wav.size > 0, self.items[idx]["audio_file"] - - sample = { - "mel": mel, - "item_idx": self.items[idx]["audio_file"], - "speaker_name": speaker_name, - } - return sample - def __parse_items(self): self.speaker_to_utters = {} for i in self.items: @@ -241,8 +227,12 @@ class SpeakerEncoderDataset(Dataset): self.gaussian_augmentation_config["max_amplitude"], size=len(wav), ) - mel = self.ap.melspectrogram(wav) - feats_.append(torch.FloatTensor(mel)) + + if not self.use_torch_spec: + mel = self.ap.melspectrogram(wav) + feats_.append(torch.FloatTensor(mel)) + else: + feats_.append(torch.FloatTensor(wav)) labels.append(torch.LongTensor(labels_)) feats.extend(feats_) diff 
--git a/TTS/tts/datasets/formatters.py b/TTS/tts/datasets/formatters.py index fa8d79bc..ac3080c3 100644 --- a/TTS/tts/datasets/formatters.py +++ b/TTS/tts/datasets/formatters.py @@ -334,21 +334,21 @@ def vctk(root_path, meta_files=None, wavs_path="wav48_silence_trimmed", mic="mic return items -def vctk_old(root_path, meta_files=None, wavs_path="wav48"): +def vctk_old(root_path, meta_files=None, wavs_path="wav48", ignored_speakers=None): """homepages.inf.ed.ac.uk/jyamagis/release/VCTK-Corpus.tar.gz""" - test_speakers = meta_files items = [] meta_files = glob(f"{os.path.join(root_path,'txt')}/**/*.txt", recursive=True) for meta_file in meta_files: _, speaker_id, txt_file = os.path.relpath(meta_file, root_path).split(os.sep) file_id = txt_file.split(".")[0] - if isinstance(test_speakers, list): # if is list ignore this speakers ids - if speaker_id in test_speakers: + # ignore speakers + if isinstance(ignored_speakers, list): + if speaker_id in ignored_speakers: continue with open(meta_file, "r", encoding="utf-8") as file_text: text = file_text.readlines()[0] wav_file = os.path.join(root_path, wavs_path, speaker_id, file_id + ".wav") - items.append({"text": text, "audio_file": wav_file, "speaker_name": "VCTK_old_" + speaker_id}) + items.append({"text": text, "audio_file": wav_file, "speaker_name": "VCTK_" + speaker_id}) return items From 917f417ac46e8e5da2e37f4a4b3c555fb97e3b16 Mon Sep 17 00:00:00 2001 From: Edresson Casanova Date: Thu, 10 Mar 2022 10:56:09 -0300 Subject: [PATCH 07/38] Add alphas to control language and speaker balancer (#1216) * Add alphas to control language and speaker balancer * Add docs for speaker and language samplers * Change the Samplers weights to float for save memory * Change the test_samplers to unittest format * Add get_sampler method in BaseTTS * Fix rebase issues * Add language and speaker samplers support for DDP training * Rename distributed sampler wrapper * Remove the DistributedSamplerWrapper and use the one from Trainer * Bugfix after rebase * Move the samplers config to tts config --- TTS/config/shared_configs.py | 1 - TTS/tts/configs/shared_configs.py | 17 +++++ TTS/tts/models/base_tts.py | 58 +++++++++------- TTS/tts/models/vits.py | 27 ++------ TTS/tts/utils/languages.py | 10 +-- TTS/tts/utils/speakers.py | 9 +-- tests/data_tests/test_samplers.py | 66 +++++++++++++------ .../test_vits_multilingual_train-d_vectors.py | 9 ++- 8 files changed, 121 insertions(+), 76 deletions(-) diff --git a/TTS/config/shared_configs.py b/TTS/config/shared_configs.py index 6394b264..3ea49796 100644 --- a/TTS/config/shared_configs.py +++ b/TTS/config/shared_configs.py @@ -258,4 +258,3 @@ class BaseTrainingConfig(TrainerConfig): num_loader_workers: int = 0 num_eval_loader_workers: int = 0 use_noise_augment: bool = False - use_language_weighted_sampler: bool = False diff --git a/TTS/tts/configs/shared_configs.py b/TTS/tts/configs/shared_configs.py index f43c6464..a9b56ed4 100644 --- a/TTS/tts/configs/shared_configs.py +++ b/TTS/tts/configs/shared_configs.py @@ -220,6 +220,18 @@ class BaseTTSConfig(BaseTrainingConfig): eval_split_size (float): If between 0.0 and 1.0 represents the proportion of the dataset to include in the evaluation set. If > 1, represents the absolute number of evaluation samples. Defaults to 0.01 (1%). + + use_speaker_weighted_sampler (bool): + Enable / Disable the batch balancer by speaker. Defaults to ```False```. + + speaker_weighted_sampler_alpha (float): + Number that control the influence of the speaker sampler weights. Defaults to ```1.0```. 
+ + use_language_weighted_sampler (bool): + Enable / Disable the batch balancer by language. Defaults to ```False```. + + language_weighted_sampler_alpha (float): + Number that control the influence of the language sampler weights. Defaults to ```1.0```. """ audio: BaseAudioConfig = field(default_factory=BaseAudioConfig) @@ -262,3 +274,8 @@ class BaseTTSConfig(BaseTrainingConfig): # evaluation eval_split_max_size: int = None eval_split_size: float = 0.01 + # weighted samplers + use_speaker_weighted_sampler: bool = False + speaker_weighted_sampler_alpha: float = 1.0 + use_language_weighted_sampler: bool = False + language_weighted_sampler_alpha: float = 1.0 diff --git a/TTS/tts/models/base_tts.py b/TTS/tts/models/base_tts.py index 4e54b947..222f8519 100644 --- a/TTS/tts/models/base_tts.py +++ b/TTS/tts/models/base_tts.py @@ -7,14 +7,15 @@ import torch.distributed as dist from coqpit import Coqpit from torch import nn from torch.utils.data import DataLoader -from torch.utils.data.distributed import DistributedSampler +from trainer.torch import DistributedSampler, DistributedSamplerWrapper from TTS.model import BaseTrainerModel from TTS.tts.datasets.dataset import TTSDataset -from TTS.tts.utils.languages import LanguageManager, get_language_weighted_sampler -from TTS.tts.utils.speakers import SpeakerManager, get_speaker_weighted_sampler +from TTS.tts.utils.languages import LanguageManager, get_language_balancer_weights +from TTS.tts.utils.speakers import SpeakerManager, get_speaker_manager, get_speaker_balancer_weights from TTS.tts.utils.synthesis import synthesis from TTS.tts.utils.visual import plot_alignment, plot_spectrogram +from torch.utils.data.sampler import WeightedRandomSampler # pylint: skip-file @@ -232,6 +233,36 @@ class BaseTTS(BaseTrainerModel): "language_ids": language_ids, } + def get_sampler(self, config: Coqpit, dataset: TTSDataset, num_gpus=1): + weights = None + data_items = dataset.samples + + if getattr(config, "use_language_weighted_sampler", False): + alpha = getattr(config, "language_weighted_sampler_alpha", 1.0) + print(" > Using Language weighted sampler with alpha:", alpha) + weights = get_language_balancer_weights(data_items) * alpha + + if getattr(config, "use_speaker_weighted_sampler", False): + alpha = getattr(config, "speaker_weighted_sampler_alpha", 1.0) + print(" > Using Speaker weighted sampler with alpha:", alpha) + if weights is not None: + weights += get_speaker_balancer_weights(data_items) * alpha + else: + weights = get_speaker_balancer_weights(data_items) * alpha + + if weights is not None: + sampler = WeightedRandomSampler(weights, len(weights)) + else: + sampler = None + + # sampler for DDP + if sampler is None: + sampler = DistributedSampler(dataset) if num_gpus > 1 else None + else: # If a sampler is already defined use this sampler and DDP sampler together + sampler = DistributedSamplerWrapper(sampler) if num_gpus > 1 else sampler + + return sampler + def get_data_loader( self, config: Coqpit, @@ -300,25 +331,8 @@ class BaseTTS(BaseTrainerModel): # sort input sequences from short to long dataset.preprocess_samples() - # sampler for DDP - sampler = DistributedSampler(dataset) if num_gpus > 1 else None - - # Weighted samplers - # TODO: make this DDP amenable - assert not ( - num_gpus > 1 and getattr(config, "use_language_weighted_sampler", False) - ), "language_weighted_sampler is not supported with DistributedSampler" - assert not ( - num_gpus > 1 and getattr(config, "use_speaker_weighted_sampler", False) - ), "speaker_weighted_sampler is not 
supported with DistributedSampler" - - if sampler is None: - if getattr(config, "use_language_weighted_sampler", False): - print(" > Using Language weighted sampler") - sampler = get_language_weighted_sampler(dataset.samples) - elif getattr(config, "use_speaker_weighted_sampler", False): - print(" > Using Language weighted sampler") - sampler = get_speaker_weighted_sampler(dataset.samples) + # get samplers + sampler = self.get_sampler(config, dataset, num_gpus) loader = DataLoader( dataset, diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py index a43e081c..6aa30dfe 100644 --- a/TTS/tts/models/vits.py +++ b/TTS/tts/models/vits.py @@ -13,7 +13,6 @@ from torch import nn from torch.cuda.amp.autocast_mode import autocast from torch.nn import functional as F from torch.utils.data import DataLoader -from torch.utils.data.distributed import DistributedSampler from trainer.trainer_utils import get_optimizer, get_scheduler from TTS.tts.configs.shared_configs import CharactersConfig @@ -24,8 +23,8 @@ from TTS.tts.layers.vits.networks import PosteriorEncoder, ResidualCouplingBlock from TTS.tts.layers.vits.stochastic_duration_predictor import StochasticDurationPredictor from TTS.tts.models.base_tts import BaseTTS from TTS.tts.utils.helpers import generate_path, maximum_path, rand_segments, segment, sequence_mask -from TTS.tts.utils.languages import LanguageManager, get_language_weighted_sampler -from TTS.tts.utils.speakers import SpeakerManager, get_speaker_weighted_sampler +from TTS.tts.utils.languages import LanguageManager +from TTS.tts.utils.speakers import SpeakerManager from TTS.tts.utils.synthesis import synthesis from TTS.tts.utils.text.characters import BaseCharacters, _characters, _pad, _phonemes, _punctuations from TTS.tts.utils.text.tokenizer import TTSTokenizer @@ -1354,31 +1353,15 @@ class Vits(BaseTTS): # sort input sequences from short to long dataset.preprocess_samples() - # sampler for DDP - sampler = DistributedSampler(dataset) if num_gpus > 1 else None - - # Weighted samplers - # TODO: make this DDP amenable - assert not ( - num_gpus > 1 and getattr(config, "use_language_weighted_sampler", False) - ), "language_weighted_sampler is not supported with DistributedSampler" - assert not ( - num_gpus > 1 and getattr(config, "use_speaker_weighted_sampler", False) - ), "speaker_weighted_sampler is not supported with DistributedSampler" - - if sampler is None: - if getattr(config, "use_language_weighted_sampler", False): - print(" > Using Language weighted sampler") - sampler = get_language_weighted_sampler(dataset.samples) - elif getattr(config, "use_speaker_weighted_sampler", False): - print(" > Using Language weighted sampler") - sampler = get_speaker_weighted_sampler(dataset.samples) + # get samplers + sampler = self.get_sampler(config, dataset, num_gpus) loader = DataLoader( dataset, batch_size=config.eval_batch_size if is_eval else config.batch_size, shuffle=False, # shuffle is done in the dataset. drop_last=False, # setting this False might cause issues in AMP training. 
+ sampler=sampler, collate_fn=dataset.collate_fn, num_workers=config.num_eval_loader_workers if is_eval else config.num_loader_workers, pin_memory=False, diff --git a/TTS/tts/utils/languages.py b/TTS/tts/utils/languages.py index 19708c13..7decabb0 100644 --- a/TTS/tts/utils/languages.py +++ b/TTS/tts/utils/languages.py @@ -6,7 +6,6 @@ import fsspec import numpy as np import torch from coqpit import Coqpit -from torch.utils.data.sampler import WeightedRandomSampler from TTS.config import check_config_and_model_args @@ -128,11 +127,14 @@ def _set_file_path(path): return None -def get_language_weighted_sampler(items: list): +def get_language_balancer_weights(items: list): language_names = np.array([item["language"] for item in items]) unique_language_names = np.unique(language_names).tolist() language_ids = [unique_language_names.index(l) for l in language_names] language_count = np.array([len(np.where(language_names == l)[0]) for l in unique_language_names]) weight_language = 1.0 / language_count - dataset_samples_weight = torch.from_numpy(np.array([weight_language[l] for l in language_ids])).double() - return WeightedRandomSampler(dataset_samples_weight, len(dataset_samples_weight)) + # get weight for each sample + dataset_samples_weight = np.array([weight_language[l] for l in language_ids]) + # normalize + dataset_samples_weight = dataset_samples_weight / np.linalg.norm(dataset_samples_weight) + return torch.from_numpy(dataset_samples_weight).float() diff --git a/TTS/tts/utils/speakers.py b/TTS/tts/utils/speakers.py index 99d653e6..078ce3f1 100644 --- a/TTS/tts/utils/speakers.py +++ b/TTS/tts/utils/speakers.py @@ -7,7 +7,6 @@ import fsspec import numpy as np import torch from coqpit import Coqpit -from torch.utils.data.sampler import WeightedRandomSampler from TTS.config import get_from_config_or_model_args_with_default, load_config from TTS.speaker_encoder.utils.generic_utils import setup_speaker_encoder_model @@ -449,11 +448,13 @@ def get_speaker_manager(c: Coqpit, data: List = None, restore_path: str = None, return speaker_manager -def get_speaker_weighted_sampler(items: list): +def get_speaker_balancer_weights(items: list): speaker_names = np.array([item["speaker_name"] for item in items]) unique_speaker_names = np.unique(speaker_names).tolist() speaker_ids = [unique_speaker_names.index(l) for l in speaker_names] speaker_count = np.array([len(np.where(speaker_names == l)[0]) for l in unique_speaker_names]) weight_speaker = 1.0 / speaker_count - dataset_samples_weight = torch.from_numpy(np.array([weight_speaker[l] for l in speaker_ids])).double() - return WeightedRandomSampler(dataset_samples_weight, len(dataset_samples_weight)) + dataset_samples_weight = np.array([weight_speaker[l] for l in speaker_ids]) + # normalize + dataset_samples_weight = dataset_samples_weight / np.linalg.norm(dataset_samples_weight) + return torch.from_numpy(dataset_samples_weight).float() diff --git a/tests/data_tests/test_samplers.py b/tests/data_tests/test_samplers.py index 497a3fb5..12152fb8 100644 --- a/tests/data_tests/test_samplers.py +++ b/tests/data_tests/test_samplers.py @@ -1,10 +1,13 @@ import functools +import unittest + import torch from TTS.config.shared_configs import BaseDatasetConfig from TTS.tts.datasets import load_tts_samples -from TTS.tts.utils.languages import get_language_weighted_sampler +from TTS.tts.utils.languages import get_language_balancer_weights +from TTS.tts.utils.speakers import get_speaker_balancer_weights # Fixing random state to avoid random fails torch.manual_seed(0) @@ 
-25,34 +28,57 @@ dataset_config_pt = BaseDatasetConfig( language="pt-br", ) -# Adding the EN samples twice to create an unbalanced dataset +# Adding the EN samples twice to create a language unbalanced dataset train_samples, eval_samples = load_tts_samples( [dataset_config_en, dataset_config_en, dataset_config_pt], eval_split=True ) +# gerenate a speaker unbalanced dataset +for i, sample in enumerate(train_samples): + if i < 5: + sample["speaker_name"] = "ljspeech-0" + else: + sample["speaker_name"] = "ljspeech-1" + def is_balanced(lang_1, lang_2): return 0.85 < lang_1 / lang_2 < 1.2 -random_sampler = torch.utils.data.RandomSampler(train_samples) -ids = functools.reduce(lambda a, b: a + b, [list(random_sampler) for i in range(100)]) -en, pt = 0, 0 -for index in ids: - if train_samples[index]["language"] == "en": - en += 1 - else: - pt += 1 +class TestSamplers(unittest.TestCase): + def test_language_random_sampler(self): # pylint: disable=no-self-use + random_sampler = torch.utils.data.RandomSampler(train_samples) + ids = functools.reduce(lambda a, b: a + b, [list(random_sampler) for i in range(100)]) + en, pt = 0, 0 + for index in ids: + if train_samples[index]["language"] == "en": + en += 1 + else: + pt += 1 -assert not is_balanced(en, pt), "Random sampler is supposed to be unbalanced" + assert not is_balanced(en, pt), "Random sampler is supposed to be unbalanced" -weighted_sampler = get_language_weighted_sampler(train_samples) -ids = functools.reduce(lambda a, b: a + b, [list(weighted_sampler) for i in range(100)]) -en, pt = 0, 0 -for index in ids: - if train_samples[index]["language"] == "en": - en += 1 - else: - pt += 1 + def test_language_weighted_random_sampler(self): # pylint: disable=no-self-use + weighted_sampler = torch.utils.data.sampler.WeightedRandomSampler(get_language_balancer_weights(train_samples), len(train_samples)) + ids = functools.reduce(lambda a, b: a + b, [list(weighted_sampler) for i in range(100)]) + en, pt = 0, 0 + for index in ids: + if train_samples[index]["language"] == "en": + en += 1 + else: + pt += 1 -assert is_balanced(en, pt), "Weighted sampler is supposed to be balanced" + assert is_balanced(en, pt), "Language Weighted sampler is supposed to be balanced" + + def test_speaker_weighted_random_sampler(self): # pylint: disable=no-self-use + + weighted_sampler = torch.utils.data.sampler.WeightedRandomSampler(get_speaker_balancer_weights(train_samples), len(train_samples)) + ids = functools.reduce(lambda a, b: a + b, [list(weighted_sampler) for i in range(100)]) + spk1, spk2 = 0, 0 + for index in ids: + if train_samples[index]["speaker_name"] == "ljspeech-0": + spk1 += 1 + else: + spk2 += 1 + + assert is_balanced(spk1, spk2), "Speaker Weighted sampler is supposed to be balanced" diff --git a/tests/tts_tests/test_vits_multilingual_train-d_vectors.py b/tests/tts_tests/test_vits_multilingual_train-d_vectors.py index a8e2020e..e12661a5 100644 --- a/tests/tts_tests/test_vits_multilingual_train-d_vectors.py +++ b/tests/tts_tests/test_vits_multilingual_train-d_vectors.py @@ -45,7 +45,7 @@ config = VitsConfig( ["Be a voice, not an echo.", "ljspeech-0", None, "en"], ["Be a voice, not an echo.", "ljspeech-1", None, "pt-br"], ], - datasets=[dataset_config_en, dataset_config_pt], + datasets=[dataset_config_en, dataset_config_en, dataset_config_en, dataset_config_pt], ) # set audio config config.audio.do_trim_silence = True @@ -71,8 +71,11 @@ config.d_vector_dim = 256 config.model_args.use_sdp = True config.use_sdp = True -# deactivate language sampler 
-config.use_language_weighted_sampler = False +# activate language and speaker samplers +config.use_language_weighted_sampler = True +config.language_weighted_sampler_alpha = 10 +config.use_speaker_weighted_sampler = True +config.speaker_weighted_sampler_alpha = 5 config.save_json(config_path) From dbe9da7f15544b83043f481a99e5bcb23e002dc9 Mon Sep 17 00:00:00 2001 From: Edresson Casanova Date: Thu, 10 Mar 2022 10:57:12 -0300 Subject: [PATCH 08/38] Add Voice conversion inference support (#1337) * Add support for voice conversion inference * Cache d_vectors_by_speaker for fast inference using a bigger speakers.json * Rebase bug fix * Use the average d-vector for inference --- TTS/bin/synthesize.py | 20 ++++-- TTS/tts/models/vits.py | 24 ++++++- TTS/tts/utils/speakers.py | 19 +++++- TTS/tts/utils/synthesis.py | 85 +++++++++++++++++++++++ TTS/utils/synthesizer.py | 134 ++++++++++++++++++++++++++++--------- 5 files changed, 241 insertions(+), 41 deletions(-) diff --git a/TTS/bin/synthesize.py b/TTS/bin/synthesize.py index 509b3da6..fe31c510 100755 --- a/TTS/bin/synthesize.py +++ b/TTS/bin/synthesize.py @@ -195,11 +195,22 @@ If you don't specify any models, then it uses LJSpeech based English model. help="If true save raw spectogram for further (vocoder) processing in out_path.", default=False, ) - + parser.add_argument( + "--reference_wav", + type=str, + help="Reference wav file to convert in the voice of the speaker_idx or speaker_wav", + default=None, + ) + parser.add_argument( + "--reference_speaker_idx", + type=str, + help="speaker ID of the reference_wav speaker (If not provided the embedding will be computed using the Speaker Encoder).", + default=None, + ) args = parser.parse_args() # print the description if either text or list_models is not set - if args.text is None and not args.list_models and not args.list_speaker_idxs and not args.list_language_idxs: + if not args.text and not args.list_models and not args.list_speaker_idxs and not args.list_language_idxs and not args.reference_wav: parser.parse_args(["-h"]) # load model manager @@ -281,10 +292,11 @@ If you don't specify any models, then it uses LJSpeech based English model. return # RUN THE SYNTHESIS - print(" > Text: {}".format(args.text)) + if args.text: + print(" > Text: {}".format(args.text)) # kick it - wav = synthesizer.tts(args.text, args.speaker_idx, args.language_idx, args.speaker_wav) + wav = synthesizer.tts(args.text, args.speaker_idx, args.language_idx, args.speaker_wav, reference_wav=args.reference_wav, reference_speaker_name=args.reference_speaker_idx) # save the results print(" > Saving output to {}".format(args.out_path)) diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py index 6aa30dfe..818b9a54 100644 --- a/TTS/tts/models/vits.py +++ b/TTS/tts/models/vits.py @@ -994,6 +994,25 @@ class Vits(BaseTTS): outputs = {"model_outputs": o, "alignments": attn.squeeze(1), "z": z, "z_p": z_p, "m_p": m_p, "logs_p": logs_p} return outputs + @torch.no_grad() + def inference_voice_conversion(self, reference_wav, speaker_id=None, d_vector=None, reference_speaker_id=None, reference_d_vector=None): + """Inference for voice conversion + + Args: + reference_wav (Tensor): Reference wavform. Tensor of shape [B, T] + speaker_id (Tensor): speaker_id of the target speaker. Tensor of shape [B] + d_vector (Tensor): d_vector embedding of target speaker. Tensor of shape `[B, C]` + reference_speaker_id (Tensor): speaker_id of the reference_wav speaker. 
Tensor of shape [B] + reference_d_vector (Tensor): d_vector embedding of the reference_wav speaker. Tensor of shape `[B, C]` + """ + # compute spectrograms + y = wav_to_spec(reference_wav, self.config.audio.fft_size, self.config.audio.hop_length, self.config.audio.win_length, center=False).transpose(1, 2) + y_lengths = torch.tensor([y.size(-1)]).to(y.device) + speaker_cond_src = reference_speaker_id if reference_speaker_id is not None else reference_d_vector + speaker_cond_tgt = speaker_id if speaker_id is not None else d_vector + # print(y.shape, y_lengths.shape) + wav, _, _ = self.voice_conversion(y, y_lengths, speaker_cond_src, speaker_cond_tgt) + return wav def voice_conversion(self, y, y_lengths, speaker_cond_src, speaker_cond_tgt): """Forward pass for voice conversion @@ -1007,12 +1026,11 @@ class Vits(BaseTTS): speaker_cond_tgt (Tensor): Target speaker ID. Tensor of shape [B,] """ assert self.num_speakers > 0, "num_speakers have to be larger than 0." - # speaker embedding if self.args.use_speaker_embedding and not self.args.use_d_vector_file: g_src = self.emb_g(speaker_cond_src).unsqueeze(-1) g_tgt = self.emb_g(speaker_cond_tgt).unsqueeze(-1) - elif self.args.use_speaker_embedding and self.args.use_d_vector_file: + elif not self.args.use_speaker_embedding and self.args.use_d_vector_file: g_src = F.normalize(speaker_cond_src).unsqueeze(-1) g_tgt = F.normalize(speaker_cond_tgt).unsqueeze(-1) else: @@ -1199,7 +1217,7 @@ class Vits(BaseTTS): if speaker_name is None: d_vector = self.speaker_manager.get_random_d_vector() else: - d_vector = self.speaker_manager.get_mean_d_vector(speaker_name, num_samples=1, randomize=False) + d_vector = self.speaker_manager.get_mean_d_vector(speaker_name, num_samples=None, randomize=False) elif config.use_speaker_embedding: if speaker_name is None: speaker_id = self.speaker_manager.get_random_speaker_id() diff --git a/TTS/tts/utils/speakers.py b/TTS/tts/utils/speakers.py index 078ce3f1..c15a3abf 100644 --- a/TTS/tts/utils/speakers.py +++ b/TTS/tts/utils/speakers.py @@ -65,6 +65,7 @@ class SpeakerManager: self.d_vectors = {} self.speaker_ids = {} + self.d_vectors_by_speakers = {} self.clip_ids = [] self.speaker_encoder = None self.speaker_encoder_ap = None @@ -166,6 +167,8 @@ class SpeakerManager: self.speaker_ids = {name: i for i, name in enumerate(speakers)} self.clip_ids = list(set(sorted(clip_name for clip_name in self.d_vectors.keys()))) + # cache d_vectors_by_speakers for fast inference using a bigger speakers.json + self.d_vectors_by_speakers = self.get_d_vectors_by_speakers() def get_d_vector_by_clip(self, clip_idx: str) -> List: """Get d_vector by clip ID. @@ -187,7 +190,21 @@ class SpeakerManager: Returns: List[List]: all the d_vectors of the given speaker. """ - return [x["embedding"] for x in self.d_vectors.values() if x["name"] == speaker_idx] + return self.d_vectors_by_speakers[speaker_idx] + + def get_d_vectors_by_speakers(self) -> Dict: + """Get all d_vectors by speaker. + + Returns: + Dict: all the d_vectors of each speaker. + """ + d_vectors_by_speakers = {} + for x in self.d_vectors.values(): + if x["name"] not in d_vectors_by_speakers.keys(): + d_vectors_by_speakers[x["name"]] = [x["embedding"]] + else: + d_vectors_by_speakers[x["name"]].append(x["embedding"]) + return d_vectors_by_speakers def get_mean_d_vector(self, speaker_idx: str, num_samples: int = None, randomize: bool = False) -> np.ndarray: """Get mean d_vector of a speaker ID. 
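
A minimal usage sketch of the voice-conversion path this patch introduces
(Vits.inference_voice_conversion, transfer_voice below, and the new
reference_wav / reference_speaker_name arguments on Synthesizer.tts). The
checkpoint paths and speaker names are placeholders, and a multi-speaker
VITS model trained with a d-vector file is assumed:

    from TTS.utils.synthesizer import Synthesizer

    synthesizer = Synthesizer(
        tts_checkpoint="checkpoint.pth.tar",  # placeholder multi-speaker VITS checkpoint
        tts_config_path="config.json",        # placeholder config path
        use_cuda=False,
    )
    # Convert the voice in source.wav into the target speaker's voice.
    wav = synthesizer.tts(
        reference_wav="source.wav",
        speaker_name="VCTK_p225",             # target speaker (placeholder name)
        reference_speaker_name="VCTK_p226",   # speaker of source.wav (optional placeholder)
    )
    synthesizer.save_wav(wav, "converted.wav")

The same path is exposed on the command line through the --reference_wav and
--reference_speaker_idx arguments added to TTS/bin/synthesize.py above.
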
diff --git a/TTS/tts/utils/synthesis.py b/TTS/tts/utils/synthesis.py index b6e19ab4..582fb4f1 100644 --- a/TTS/tts/utils/synthesis.py +++ b/TTS/tts/utils/synthesis.py @@ -205,3 +205,88 @@ def synthesis( "outputs": outputs, } return return_dict + +def transfer_voice( + model, + CONFIG, + use_cuda, + reference_wav, + speaker_id=None, + d_vector=None, + reference_speaker_id=None, + reference_d_vector=None, + do_trim_silence=False, + use_griffin_lim=False, +): + """Synthesize voice for the given text using Griffin-Lim vocoder or just compute output features to be passed to + the vocoder model. + + Args: + model (TTS.tts.models): + The TTS model to synthesize audio with. + + CONFIG (Coqpit): + Model configuration. + + use_cuda (bool): + Enable/disable CUDA. + + reference_wav (str): + Path of reference_wav to be used to voice conversion. + + speaker_id (int): + Speaker ID passed to the speaker embedding layer in multi-speaker model. Defaults to None. + + d_vector (torch.Tensor): + d-vector for multi-speaker models in share :math:`[1, D]`. Defaults to None. + + reference_speaker_id (int): + Reference Speaker ID passed to the speaker embedding layer in multi-speaker model. Defaults to None. + + reference_d_vector (torch.Tensor): + Reference d-vector for multi-speaker models in share :math:`[1, D]`. Defaults to None. + + enable_eos_bos_chars (bool): + enable special chars for end of sentence and start of sentence. Defaults to False. + + do_trim_silence (bool): + trim silence after synthesis. Defaults to False. + """ + # pass tensors to backend + if speaker_id is not None: + speaker_id = id_to_torch(speaker_id, cuda=use_cuda) + + if d_vector is not None: + d_vector = embedding_to_torch(d_vector, cuda=use_cuda) + + if reference_d_vector is not None: + reference_d_vector = embedding_to_torch(reference_d_vector, cuda=use_cuda) + + # load reference_wav audio + reference_wav = embedding_to_torch(model.ap.load_wav(reference_wav, sr=model.ap.sample_rate), cuda=use_cuda) + + if hasattr(model, "module"): + _func = model.module.inference_voice_conversion + else: + _func = model.inference_voice_conversion + model_outputs = _func( + reference_wav, + speaker_id, + d_vector, + reference_speaker_id, + reference_d_vector) + + # convert outputs to numpy + # plot results + wav = None + model_outputs = model_outputs.squeeze() + if model_outputs.ndim == 2: # [T, C_spec] + if use_griffin_lim: + wav = inv_spectrogram(model_outputs, model.ap, CONFIG) + # trim silence + if do_trim_silence: + wav = trim_silence(wav, model.ap) + else: # [T,] + wav = model_outputs + + return wav diff --git a/TTS/utils/synthesizer.py b/TTS/utils/synthesizer.py index d1abc907..687794b4 100644 --- a/TTS/utils/synthesizer.py +++ b/TTS/utils/synthesizer.py @@ -10,7 +10,7 @@ from TTS.tts.models import setup_model as setup_tts_model # pylint: disable=unused-wildcard-import # pylint: disable=wildcard-import -from TTS.tts.utils.synthesis import synthesis, trim_silence +from TTS.tts.utils.synthesis import synthesis, transfer_voice, trim_silence from TTS.utils.audio import AudioProcessor from TTS.vocoder.models import setup_model as setup_vocoder_model from TTS.vocoder.utils.generic_utils import interpolate_vocoder_input @@ -114,10 +114,14 @@ class Synthesizer(object): if not self.encoder_checkpoint: self._set_speaker_encoder_paths_from_tts_config() + self.tts_model.load_checkpoint(self.tts_config, tts_checkpoint, eval=True) if use_cuda: self.tts_model.cuda() + if self.encoder_checkpoint and hasattr(self.tts_model, "speaker_manager"): + 
self.tts_model.speaker_manager.init_speaker_encoder(self.encoder_checkpoint, self.encoder_config) + def _set_speaker_encoder_paths_from_tts_config(self): """Set the encoder paths from the tts model config for models with speaker encoders.""" if hasattr(self.tts_config, "model_args") and hasattr( @@ -169,11 +173,13 @@ class Synthesizer(object): def tts( self, - text: str, + text: str = "", speaker_name: str = "", language_name: str = "", speaker_wav: Union[str, List[str]] = None, style_wav=None, + reference_wav=None, + reference_speaker_name=None, ) -> List[int]: """🐸 TTS magic. Run all the models and generate speech. @@ -183,15 +189,23 @@ class Synthesizer(object): language_name (str, optional): language id for multi-language models. Defaults to "". speaker_wav (Union[str, List[str]], optional): path to the speaker wav. Defaults to None. style_wav ([type], optional): style waveform for GST. Defaults to None. - + reference_wav ([type], optional): reference waveform for voice conversion. Defaults to None. + reference_speaker_name ([type], optional): spekaer id of reference waveform. Defaults to None. Returns: List[int]: [description] """ start_time = time.time() wavs = [] - sens = self.split_into_sentences(text) - print(" > Text splitted to sentences.") - print(sens) + + if not text and not reference_wav: + raise ValueError( + "You need to define either `text` (for sythesis) or a `reference_wav` (for voice conversion) to use the Coqui TTS API." + ) + + if text: + sens = self.split_into_sentences(text) + print(" > Text splitted to sentences.") + print(sens) # handle multi-speaker speaker_embedding = None @@ -199,8 +213,8 @@ class Synthesizer(object): if self.tts_speakers_file or hasattr(self.tts_model.speaker_manager, "speaker_ids"): if speaker_name and isinstance(speaker_name, str): if self.tts_config.use_d_vector_file: - # get the speaker embedding from the saved d_vectors. - speaker_embedding = self.tts_model.speaker_manager.get_d_vectors_by_speaker(speaker_name)[0] + # get the average speaker embedding from the saved d_vectors. + speaker_embedding = self.tts_model.speaker_manager.get_mean_d_vector(speaker_name, num_samples=None, randomize=False) speaker_embedding = np.array(speaker_embedding)[None, :] # [1 x embedding_dim] else: # get speaker idx from the speaker name @@ -209,7 +223,7 @@ class Synthesizer(object): elif not speaker_name and not speaker_wav: raise ValueError( " [!] Look like you use a multi-speaker model. " - "You need to define either a `speaker_name` or a `style_wav` to use a multi-speaker model." + "You need to define either a `speaker_name` or a `speaker_wav` to use a multi-speaker model." 
) else: speaker_embedding = None @@ -246,22 +260,83 @@ class Synthesizer(object): use_gl = self.vocoder_model is None - for sen in sens: - # synthesize voice - outputs = synthesis( - model=self.tts_model, - text=sen, - CONFIG=self.tts_config, - use_cuda=self.use_cuda, - speaker_id=speaker_id, - language_id=language_id, - style_wav=style_wav, - use_griffin_lim=use_gl, - d_vector=speaker_embedding, - ) - waveform = outputs["wav"] - mel_postnet_spec = outputs["outputs"]["model_outputs"][0].detach().cpu().numpy() + if not reference_wav: + for sen in sens: + # synthesize voice + outputs = synthesis( + model=self.tts_model, + text=sen, + CONFIG=self.tts_config, + use_cuda=self.use_cuda, + speaker_id=speaker_id, + language_id=language_id, + style_wav=style_wav, + use_griffin_lim=use_gl, + d_vector=speaker_embedding, + ) + waveform = outputs["wav"] + mel_postnet_spec = outputs["outputs"]["model_outputs"][0].detach().cpu().numpy() + if not use_gl: + # denormalize tts output based on tts audio config + mel_postnet_spec = self.tts_model.ap.denormalize(mel_postnet_spec.T).T + device_type = "cuda" if self.use_cuda else "cpu" + # renormalize spectrogram based on vocoder config + vocoder_input = self.vocoder_ap.normalize(mel_postnet_spec.T) + # compute scale factor for possible sample rate mismatch + scale_factor = [ + 1, + self.vocoder_config["audio"]["sample_rate"] / self.tts_model.ap.sample_rate, + ] + if scale_factor[1] != 1: + print(" > interpolating tts model output.") + vocoder_input = interpolate_vocoder_input(scale_factor, vocoder_input) + else: + vocoder_input = torch.tensor(vocoder_input).unsqueeze(0) # pylint: disable=not-callable + # run vocoder model + # [1, T, C] + waveform = self.vocoder_model.inference(vocoder_input.to(device_type)) + if self.use_cuda and not use_gl: + waveform = waveform.cpu() + if not use_gl: + waveform = waveform.numpy() + waveform = waveform.squeeze() + + # trim silence + if self.tts_config.audio["do_trim_silence"] is True: + waveform = trim_silence(waveform, self.tts_model.ap) + + wavs += list(waveform) + wavs += [0] * 10000 + else: + # get the speaker embedding or speaker id for the reference wav file + reference_speaker_embedding = None + reference_speaker_id = None + if self.tts_speakers_file or hasattr(self.tts_model.speaker_manager, "speaker_ids"): + if reference_speaker_name and isinstance(reference_speaker_name, str): + if self.tts_config.use_d_vector_file: + # get the speaker embedding from the saved d_vectors. 
+ reference_speaker_embedding = self.tts_model.speaker_manager.get_d_vectors_by_speaker(reference_speaker_name)[0] + reference_speaker_embedding = np.array(reference_speaker_embedding)[None, :] # [1 x embedding_dim] + else: + # get speaker idx from the speaker name + reference_speaker_id = self.tts_model.speaker_manager.speaker_ids[reference_speaker_name] + else: + reference_speaker_embedding = self.tts_model.speaker_manager.compute_d_vector_from_clip(reference_wav) + + outputs = transfer_voice( + model=self.tts_model, + CONFIG=self.tts_config, + use_cuda=self.use_cuda, + reference_wav=reference_wav, + speaker_id=speaker_id, + d_vector=speaker_embedding, + use_griffin_lim=use_gl, + reference_speaker_id=reference_speaker_id, + reference_d_vector=reference_speaker_embedding + ) + waveform = outputs if not use_gl: + mel_postnet_spec = outputs[0].detach().cpu().numpy() # denormalize tts output based on tts audio config mel_postnet_spec = self.tts_model.ap.denormalize(mel_postnet_spec.T).T device_type = "cuda" if self.use_cuda else "cpu" @@ -280,18 +355,11 @@ class Synthesizer(object): # run vocoder model # [1, T, C] waveform = self.vocoder_model.inference(vocoder_input.to(device_type)) - if self.use_cuda and not use_gl: + if self.use_cuda: waveform = waveform.cpu() if not use_gl: waveform = waveform.numpy() - waveform = waveform.squeeze() - - # trim silence - if self.tts_config.audio["do_trim_silence"] is True: - waveform = trim_silence(waveform, self.tts_model.ap) - - wavs += list(waveform) - wavs += [0] * 10000 + wavs = waveform.squeeze() # compute stats process_time = time.time() - start_time From b0be825d9244ab4be1d4610913f74589bc5f1e49 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Fri, 11 Mar 2022 10:40:20 +0100 Subject: [PATCH 09/38] Update issue template (#1370) * Add bug_report template * Fix typos --- .github/ISSUE_TEMPLATE/bug_report.md | 58 ------------------ .github/ISSUE_TEMPLATE/bug_report.yaml | 85 ++++++++++++++++++++++++++ 2 files changed, 85 insertions(+), 58 deletions(-) delete mode 100644 .github/ISSUE_TEMPLATE/bug_report.md create mode 100644 .github/ISSUE_TEMPLATE/bug_report.yaml diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md deleted file mode 100644 index 133346f6..00000000 --- a/.github/ISSUE_TEMPLATE/bug_report.md +++ /dev/null @@ -1,58 +0,0 @@ ---- -name: 🐛 Bug report -about: Create a bug report to help 🐸 improve -title: '[Bug] ' -labels: bug -assignees: '' - ---- - -## 🐛 Description - - - -### To Reproduce - - - -### Expected behavior - - - -### Environment - - - -- 🐸TTS Version (e.g., 1.3.0): -- PyTorch Version (e.g., 1.8) -- Python version: -- OS (e.g., Linux): -- CUDA/cuDNN version: -- GPU models and configuration: -- How you installed PyTorch (`conda`, `pip`, source): -- Any other relevant information: - -### Additional context - - diff --git a/.github/ISSUE_TEMPLATE/bug_report.yaml b/.github/ISSUE_TEMPLATE/bug_report.yaml new file mode 100644 index 00000000..34cde7e8 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug_report.yaml @@ -0,0 +1,85 @@ +name: "🐛 Bug report" +description: Create a bug report to help 🐸 improve +title: '[Bug] ' +labels: [ "bug" ] +body: + - type: markdown + attributes: + value: | + Welcome to the 🐸TTS! Thanks for taking the time to fill out this bug report! + + - type: textarea + id: bug-description + attributes: + label: Describe the bug + description: A clear and concise description of what the bug is. If you intend to submit a PR for this issue, tell us in the description. 
Thanks! + placeholder: Bug description + validations: + required: true + + - type: textarea + id: reproduction + attributes: + label: To Reproduce + description: | + Please share your code to reproduce the error. + + Issues are fixed faster if you can provide a working example. + + The best place for sharing code is colab. https://colab.research.google.com/ + So we can directly run your code and reproduce the issue. + + In the worse case, provide steps to reproduce the behavior. + + 1. Run the following command '...' + 2. ... + 3. See error + placeholder: Reproduction + validations: + required: true + + - type: textarea + id: expected-behavior + attributes: + label: Expected behavior + description: "Write down what the expected behaviour" + + - type: textarea + id: logs + attributes: + label: Logs + description: "Please include the relevant logs if you can." + render: shell + + - type: textarea + id: system-info + attributes: + label: Environment + description: | + You can either run `TTS/bin/collect_env_info.py` + + ```bash + wget https://raw.githubusercontent.com/coqui-ai/TTS/main/TTS/bin/collect_env_info.py + python collect_env_info.py + ``` + + or fill in the fields below manually. + render: shell + placeholder: | + - 🐸TTS Version (e.g., 1.3.0): + - PyTorch Version (e.g., 1.8) + - Python version: + - OS (e.g., Linux): + - CUDA/cuDNN version: + - GPU models and configuration: + - How you installed PyTorch (`conda`, `pip`, source): + - Any other relevant information: + validations: + required: true + - type: textarea + id: context + attributes: + label: Additional context + description: Add any other context about the problem here. + validations: + required: false From 36e9ea2f97395bf6e4395557fe5c80260edf62d1 Mon Sep 17 00:00:00 2001 From: Edresson Casanova Date: Fri, 11 Mar 2022 06:43:31 -0300 Subject: [PATCH 10/38] Open bible dataset formatter (#1365) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Add support for voice conversion inference * Cache d_vectors_by_speaker for fast inference using a bigger speakers.json * Rebase bug fix * Use the average d-vector for inference * Fix the bug in find unique chars script * Add OpenBible formatter Co-authored-by: Eren Gölge --- TTS/bin/find_unique_chars.py | 2 +- TTS/tts/datasets/formatters.py | 24 +++++++++++++++++++++++- 2 files changed, 24 insertions(+), 2 deletions(-) diff --git a/TTS/bin/find_unique_chars.py b/TTS/bin/find_unique_chars.py index 4689dcad..ea169748 100644 --- a/TTS/bin/find_unique_chars.py +++ b/TTS/bin/find_unique_chars.py @@ -29,7 +29,7 @@ def main(): items = train_items + eval_items - texts = "".join(item[0] for item in items) + texts = "".join(item["text"] for item in items) chars = set(texts) lower_chars = filter(lambda c: c.islower(), chars) chars_force_lower = [c.lower() for c in chars] diff --git a/TTS/tts/datasets/formatters.py b/TTS/tts/datasets/formatters.py index ac3080c3..573a5deb 100644 --- a/TTS/tts/datasets/formatters.py +++ b/TTS/tts/datasets/formatters.py @@ -348,7 +348,29 @@ def vctk_old(root_path, meta_files=None, wavs_path="wav48", ignored_speakers=Non with open(meta_file, "r", encoding="utf-8") as file_text: text = file_text.readlines()[0] wav_file = os.path.join(root_path, wavs_path, speaker_id, file_id + ".wav") - items.append({"text": text, "audio_file": wav_file, "speaker_name": "VCTK_" + speaker_id}) + items.append({"text": text, "audio_file": wav_file, "speaker_name": "VCTK_old_" + speaker_id}) + return items + + +def open_bible(root_path, 
meta_files="train", ignore_digits_sentences=True, ignored_speakers=None): + """ToDo: Refer the paper when available""" + items = [] + split_dir = meta_files + meta_files = glob(f"{os.path.join(root_path, split_dir)}/**/*.txt", recursive=True) + for meta_file in meta_files: + _, speaker_id, txt_file = os.path.relpath(meta_file, root_path).split(os.sep) + file_id = txt_file.split(".")[0] + # ignore speakers + if isinstance(ignored_speakers, list): + if speaker_id in ignored_speakers: + continue + with open(meta_file, "r", encoding="utf-8") as file_text: + text = file_text.readline().replace("\n", "") + # ignore sentences that contains digits + if ignore_digits_sentences and any(map(str.isdigit, text)): + continue + wav_file = os.path.join(root_path, split_dir, speaker_id, file_id + ".flac") + items.append({"text": text, "audio_file": wav_file, "speaker_name": "OB_" + speaker_id}) return items From f81892483d720688005dab723e990e90a990f8a0 Mon Sep 17 00:00:00 2001 From: Edresson Casanova Date: Fri, 11 Mar 2022 10:43:40 -0300 Subject: [PATCH 11/38] REBASED: Transform Speaker Encoder in a Generic Encoder and Implement Emotion Encoder training support (#1349) * Rename Speaker encoder module to encoder * Add a generic emotion dataset formatter * Transform the Speaker Encoder dataset to a generic dataset and create emotion encoder config * Add class map in emotion config * Add Base encoder config * Add evaluation encoder script * Fix the bug in plot_embeddings * Enable Weight decay for encoder training * Add argumnet to disable storage * Add Perfect Sampler and remove storage * Add evaluation during encoder training * Fix lint checks * Remove useless config parameter * Active evaluation in speaker encoder test and use multispeaker dataset for this test * Unit tests fixs * Remove useless tests for speedup the aux_tests * Use get_optimizer in Encoder * Add BaseEncoder Class * Fix the unitests * Add Perfect Batch Sampler unit test * Add compute encoder accuracy in a function --- TTS/bin/compute_embeddings.py | 22 +- TTS/bin/eval_encoder.py | 88 ++ TTS/bin/train_encoder.py | 334 +++--- TTS/config/__init__.py | 2 +- TTS/{speaker_encoder => encoder}/README.md | 0 TTS/{speaker_encoder => encoder}/__init__.py | 0 .../configs/base_encoder_config.py} | 31 +- TTS/encoder/configs/emotion_encoder_config.py | 12 + TTS/encoder/configs/speaker_encoder_config.py | 11 + TTS/encoder/dataset.py | 149 +++ TTS/{speaker_encoder => encoder}/losses.py | 5 + TTS/encoder/models/base_encoder.py | 145 +++ TTS/encoder/models/lstm.py | 99 ++ .../models/resnet.py | 91 +- .../requirements.txt | 0 .../utils/__init__.py | 0 .../utils/generic_utils.py | 52 +- TTS/{speaker_encoder => encoder}/utils/io.py | 0 .../utils/prepare_voxceleb.py | 0 TTS/encoder/utils/samplers.py | 102 ++ .../utils/training.py | 0 .../utils/visual.py | 14 +- TTS/speaker_encoder/configs/config.json | 118 --- .../configs/config_resnet_angleproto.json | 956 ----------------- .../config_resnet_softmax_angleproto.json | 957 ------------------ TTS/speaker_encoder/dataset.py | 243 ----- TTS/speaker_encoder/models/lstm.py | 189 ---- TTS/speaker_encoder/umap.png | Bin 24616 -> 0 bytes TTS/tts/configs/shared_configs.py | 2 +- TTS/tts/datasets/formatters.py | 20 + TTS/tts/utils/speakers.py | 4 +- tests/aux_tests/test_speaker_encoder.py | 6 +- tests/aux_tests/test_speaker_encoder_train.py | 30 +- tests/aux_tests/test_speaker_manager.py | 4 +- tests/data_tests/test_samplers.py | 49 + tests/inputs/test_glow_tts.json | 4 +- tests/inputs/test_speaker_encoder_config.json | 4 
+- tests/inputs/test_tacotron2_config.json | 4 +- tests/inputs/test_tacotron_config.json | 4 +- tests/tts_tests/test_vits.py | 2 +- 40 files changed, 962 insertions(+), 2791 deletions(-) create mode 100644 TTS/bin/eval_encoder.py rename TTS/{speaker_encoder => encoder}/README.md (100%) rename TTS/{speaker_encoder => encoder}/__init__.py (100%) rename TTS/{speaker_encoder/speaker_encoder_config.py => encoder/configs/base_encoder_config.py} (66%) create mode 100644 TTS/encoder/configs/emotion_encoder_config.py create mode 100644 TTS/encoder/configs/speaker_encoder_config.py create mode 100644 TTS/encoder/dataset.py rename TTS/{speaker_encoder => encoder}/losses.py (97%) create mode 100644 TTS/encoder/models/base_encoder.py create mode 100644 TTS/encoder/models/lstm.py rename TTS/{speaker_encoder => encoder}/models/resnet.py (67%) rename TTS/{speaker_encoder => encoder}/requirements.txt (100%) rename TTS/{speaker_encoder => encoder}/utils/__init__.py (100%) rename TTS/{speaker_encoder => encoder}/utils/generic_utils.py (80%) rename TTS/{speaker_encoder => encoder}/utils/io.py (100%) rename TTS/{speaker_encoder => encoder}/utils/prepare_voxceleb.py (100%) create mode 100644 TTS/encoder/utils/samplers.py rename TTS/{speaker_encoder => encoder}/utils/training.py (100%) rename TTS/{speaker_encoder => encoder}/utils/visual.py (69%) delete mode 100644 TTS/speaker_encoder/configs/config.json delete mode 100644 TTS/speaker_encoder/configs/config_resnet_angleproto.json delete mode 100644 TTS/speaker_encoder/configs/config_resnet_softmax_angleproto.json delete mode 100644 TTS/speaker_encoder/dataset.py delete mode 100644 TTS/speaker_encoder/models/lstm.py delete mode 100644 TTS/speaker_encoder/umap.png diff --git a/TTS/bin/compute_embeddings.py b/TTS/bin/compute_embeddings.py index 50817154..68571fb4 100644 --- a/TTS/bin/compute_embeddings.py +++ b/TTS/bin/compute_embeddings.py @@ -42,33 +42,35 @@ c_dataset = load_config(args.config_dataset_path) meta_data_train, meta_data_eval = load_tts_samples(c_dataset.datasets, eval_split=args.eval) wav_files = meta_data_train + meta_data_eval -speaker_manager = SpeakerManager( +encoder_manager = SpeakerManager( encoder_model_path=args.model_path, encoder_config_path=args.config_path, d_vectors_file_path=args.old_file, use_cuda=args.use_cuda, ) +class_name_key = encoder_manager.speaker_encoder_config.class_name_key + # compute speaker embeddings speaker_mapping = {} for idx, wav_file in enumerate(tqdm(wav_files)): - if isinstance(wav_file, list): - speaker_name = wav_file[2] - wav_file = wav_file[1] + if isinstance(wav_file, dict): + class_name = wav_file[class_name_key] + wav_file = wav_file["audio_file"] else: - speaker_name = None + class_name = None wav_file_name = os.path.basename(wav_file) - if args.old_file is not None and wav_file_name in speaker_manager.clip_ids: + if args.old_file is not None and wav_file_name in encoder_manager.clip_ids: # get the embedding from the old file - embedd = speaker_manager.get_d_vector_by_clip(wav_file_name) + embedd = encoder_manager.get_d_vector_by_clip(wav_file_name) else: # extract the embedding - embedd = speaker_manager.compute_d_vector_from_clip(wav_file) + embedd = encoder_manager.compute_d_vector_from_clip(wav_file) # create speaker_mapping if target dataset is defined speaker_mapping[wav_file_name] = {} - speaker_mapping[wav_file_name]["name"] = speaker_name + speaker_mapping[wav_file_name]["name"] = class_name speaker_mapping[wav_file_name]["embedding"] = embedd if speaker_mapping: @@ -81,5 +83,5 @@ if 
speaker_mapping: os.makedirs(os.path.dirname(mapping_file_path), exist_ok=True) # pylint: disable=W0212 - speaker_manager._save_json(mapping_file_path, speaker_mapping) + encoder_manager._save_json(mapping_file_path, speaker_mapping) print("Speaker embeddings saved at:", mapping_file_path) diff --git a/TTS/bin/eval_encoder.py b/TTS/bin/eval_encoder.py new file mode 100644 index 00000000..a03bfd82 --- /dev/null +++ b/TTS/bin/eval_encoder.py @@ -0,0 +1,88 @@ +import argparse +import torch +from argparse import RawTextHelpFormatter + +from tqdm import tqdm + +from TTS.config import load_config +from TTS.tts.datasets import load_tts_samples +from TTS.tts.utils.speakers import SpeakerManager + +def compute_encoder_accuracy(dataset_items, encoder_manager): + + class_name_key = encoder_manager.speaker_encoder_config.class_name_key + map_classid_to_classname = getattr(encoder_manager.speaker_encoder_config, 'map_classid_to_classname', None) + + class_acc_dict = {} + + # compute embeddings for all wav_files + for item in tqdm(dataset_items): + class_name = item[class_name_key] + wav_file = item["audio_file"] + + # extract the embedding + embedd = encoder_manager.compute_d_vector_from_clip(wav_file) + if encoder_manager.speaker_encoder_criterion is not None and map_classid_to_classname is not None: + embedding = torch.FloatTensor(embedd).unsqueeze(0) + if encoder_manager.use_cuda: + embedding = embedding.cuda() + + class_id = encoder_manager.speaker_encoder_criterion.softmax.inference(embedding).item() + predicted_label = map_classid_to_classname[str(class_id)] + else: + predicted_label = None + + if class_name is not None and predicted_label is not None: + is_equal = int(class_name == predicted_label) + if class_name not in class_acc_dict: + class_acc_dict[class_name] = [is_equal] + else: + class_acc_dict[class_name].append(is_equal) + else: + raise RuntimeError("Error: class_name or/and predicted_label are None") + + acc_avg = 0 + for key, values in class_acc_dict.items(): + acc = sum(values)/len(values) + print("Class", key, "Accuracy:", acc) + acc_avg += acc + + print("Average Accuracy:", acc_avg/len(class_acc_dict)) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="""Compute the accuracy of the encoder.\n\n""" + """ + Example runs: + python TTS/bin/eval_encoder.py emotion_encoder_model.pth.tar emotion_encoder_config.json dataset_config.json + """, + formatter_class=RawTextHelpFormatter, + ) + parser.add_argument("model_path", type=str, help="Path to model checkpoint file.") + parser.add_argument( + "config_path", + type=str, + help="Path to model config file.", + ) + + parser.add_argument( + "config_dataset_path", + type=str, + help="Path to dataset config file.", + ) + parser.add_argument("--use_cuda", type=bool, help="flag to set cuda.", default=True) + parser.add_argument("--eval", type=bool, help="compute eval.", default=True) + + args = parser.parse_args() + + c_dataset = load_config(args.config_dataset_path) + + meta_data_train, meta_data_eval = load_tts_samples(c_dataset.datasets, eval_split=args.eval) + items = meta_data_train + meta_data_eval + + enc_manager = SpeakerManager( + encoder_model_path=args.model_path, encoder_config_path=args.config_path, use_cuda=args.use_cuda + ) + + compute_encoder_accuracy(items, enc_manager) diff --git a/TTS/bin/train_encoder.py b/TTS/bin/train_encoder.py index b7424698..af3e6ec4 100644 --- a/TTS/bin/train_encoder.py +++ b/TTS/bin/train_encoder.py @@ -10,16 +10,16 @@ import torch from torch.utils.data import DataLoader 
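The per-class bookkeeping in compute_encoder_accuracy above amounts to a macro-averaged (balanced) accuracy: every class contributes equally to the final number, no matter how many clips it has. A minimal standalone sketch of the same computation, using made-up class names and predictions:

    # Sketch of the macro-averaged accuracy computed in eval_encoder.py.
    # The (ground truth, prediction) pairs below are hypothetical.
    from collections import defaultdict

    pairs = [
        ("happy", "happy"), ("happy", "sad"),
        ("sad", "sad"), ("sad", "sad"),
    ]

    class_acc = defaultdict(list)
    for truth, pred in pairs:
        class_acc[truth].append(int(truth == pred))

    per_class = {name: sum(hits) / len(hits) for name, hits in class_acc.items()}
    macro_acc = sum(per_class.values()) / len(per_class)
    print(per_class)  # {'happy': 0.5, 'sad': 1.0}
    print(macro_acc)  # 0.75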
from trainer.torch import NoamLR -from TTS.speaker_encoder.dataset import SpeakerEncoderDataset -from TTS.speaker_encoder.losses import AngleProtoLoss, GE2ELoss, SoftmaxAngleProtoLoss -from TTS.speaker_encoder.utils.generic_utils import save_best_model, setup_speaker_encoder_model -from TTS.speaker_encoder.utils.training import init_training -from TTS.speaker_encoder.utils.visual import plot_embeddings +from TTS.encoder.dataset import EncoderDataset +from TTS.encoder.utils.generic_utils import save_best_model, save_checkpoint, setup_speaker_encoder_model +from TTS.encoder.utils.samplers import PerfectBatchSampler +from TTS.encoder.utils.training import init_training +from TTS.encoder.utils.visual import plot_embeddings from TTS.tts.datasets import load_tts_samples from TTS.utils.audio import AudioProcessor -from TTS.utils.generic_utils import count_parameters, remove_experiment_folder, set_init_dict -from TTS.utils.io import load_fsspec -from TTS.utils.radam import RAdam +from TTS.utils.generic_utils import count_parameters, remove_experiment_folder +from TTS.utils.io import copy_model_files +from trainer.trainer_utils import get_optimizer from TTS.utils.training import check_update torch.backends.cudnn.enabled = True @@ -32,164 +32,238 @@ print(" > Number of GPUs: ", num_gpus) def setup_loader(ap: AudioProcessor, is_val: bool = False, verbose: bool = False): + num_utter_per_class = c.num_utter_per_class if not is_val else c.eval_num_utter_per_class + num_classes_in_batch = c.num_classes_in_batch if not is_val else c.eval_num_classes_in_batch + + dataset = EncoderDataset( + c, + ap, + meta_data_eval if is_val else meta_data_train, + voice_len=c.voice_len, + num_utter_per_class=num_utter_per_class, + num_classes_in_batch=num_classes_in_batch, + verbose=verbose, + augmentation_config=c.audio_augmentation if not is_val else None, + use_torch_spec=c.model_params.get("use_torch_spec", False), + ) + # get classes list + classes = dataset.get_class_list() + + sampler = PerfectBatchSampler( + dataset.items, + classes, + batch_size=num_classes_in_batch*num_utter_per_class, # total batch size + num_classes_in_batch=num_classes_in_batch, + num_gpus=1, + shuffle=not is_val, + drop_last=True) + + if len(classes) < num_classes_in_batch: + if is_val: + raise RuntimeError(f"config.eval_num_classes_in_batch ({num_classes_in_batch}) need to be <= {len(classes)} (Number total of Classes in the Eval dataset) !") + raise RuntimeError(f"config.num_classes_in_batch ({num_classes_in_batch}) need to be <= {len(classes)} (Number total of Classes in the Train dataset) !") + + # set the classes to avoid get wrong class_id when the number of training and eval classes are not equal if is_val: - loader = None - else: - dataset = SpeakerEncoderDataset( - ap, - meta_data_eval if is_val else meta_data_train, - voice_len=c.voice_len, - num_utter_per_speaker=c.num_utters_per_speaker, - num_speakers_in_batch=c.num_speakers_in_batch, - skip_speakers=c.skip_speakers, - storage_size=c.storage["storage_size"], - sample_from_storage_p=c.storage["sample_from_storage_p"], - verbose=verbose, - augmentation_config=c.audio_augmentation, - use_torch_spec=c.model_params.get("use_torch_spec", False), - ) + dataset.set_classes(train_classes) - # sampler = DistributedSampler(dataset) if num_gpus > 1 else None - loader = DataLoader( - dataset, - batch_size=c.num_speakers_in_batch, - shuffle=False, - num_workers=c.num_loader_workers, - collate_fn=dataset.collate_fn, - ) - return loader, dataset.get_num_speakers() + loader = DataLoader( + 
dataset, + num_workers=c.num_loader_workers, + batch_sampler=sampler, + collate_fn=dataset.collate_fn, + ) + return loader, classes, dataset.get_map_classid_to_classname() -def train(model, optimizer, scheduler, criterion, data_loader, global_step): +def evaluation(model, criterion, data_loader, global_step): + eval_loss = 0 + for _, data in enumerate(data_loader): + with torch.no_grad(): + # setup input data + inputs, labels = data + + # agroup samples of each class in the batch. perfect sampler produces [3,2,1,3,2,1] we need [3,3,2,2,1,1] + labels = torch.transpose(labels.view(c.eval_num_utter_per_class, c.eval_num_classes_in_batch), 0, 1).reshape(labels.shape) + inputs = torch.transpose(inputs.view(c.eval_num_utter_per_class, c.eval_num_classes_in_batch, -1), 0, 1).reshape(inputs.shape) + + # dispatch data to GPU + if use_cuda: + inputs = inputs.cuda(non_blocking=True) + labels = labels.cuda(non_blocking=True) + + # forward pass model + outputs = model(inputs) + + # loss computation + loss = criterion(outputs.view(c.eval_num_classes_in_batch, outputs.shape[0] // c.eval_num_classes_in_batch, -1), labels) + + eval_loss += loss.item() + + eval_avg_loss = eval_loss/len(data_loader) + # save stats + dashboard_logger.eval_stats(global_step, {"loss": eval_avg_loss}) + # plot the last batch in the evaluation + figures = { + "UMAP Plot": plot_embeddings(outputs.detach().cpu().numpy(), c.num_classes_in_batch), + } + dashboard_logger.eval_figures(global_step, figures) + return eval_avg_loss + +def train(model, optimizer, scheduler, criterion, data_loader, eval_data_loader, global_step): model.train() - epoch_time = 0 best_loss = float("inf") - avg_loss = 0 - avg_loss_all = 0 avg_loader_time = 0 end_time = time.time() + for epoch in range(c.epochs): + tot_loss = 0 + epoch_time = 0 + for _, data in enumerate(data_loader): + start_time = time.time() - for _, data in enumerate(data_loader): - start_time = time.time() + # setup input data + inputs, labels = data + # agroup samples of each class in the batch. 
perfect sampler produces [3,2,1,3,2,1] we need [3,3,2,2,1,1] + labels = torch.transpose(labels.view(c.num_utter_per_class, c.num_classes_in_batch), 0, 1).reshape(labels.shape) + inputs = torch.transpose(inputs.view(c.num_utter_per_class, c.num_classes_in_batch, -1), 0, 1).reshape(inputs.shape) + # ToDo: move it to a unit test + # labels_converted = torch.transpose(labels.view(c.num_utter_per_class, c.num_classes_in_batch), 0, 1).reshape(labels.shape) + # inputs_converted = torch.transpose(inputs.view(c.num_utter_per_class, c.num_classes_in_batch, -1), 0, 1).reshape(inputs.shape) + # idx = 0 + # for j in range(0, c.num_classes_in_batch, 1): + # for i in range(j, len(labels), c.num_classes_in_batch): + # if not torch.all(labels[i].eq(labels_converted[idx])) or not torch.all(inputs[i].eq(inputs_converted[idx])): + # print("Invalid") + # print(labels) + # exit() + # idx += 1 + # labels = labels_converted + # inputs = inputs_converted - # setup input data - inputs, labels = data - loader_time = time.time() - end_time - global_step += 1 + loader_time = time.time() - end_time + global_step += 1 - # setup lr - if c.lr_decay: - scheduler.step() - optimizer.zero_grad() + # setup lr + if c.lr_decay: + scheduler.step() + optimizer.zero_grad() - # dispatch data to GPU - if use_cuda: - inputs = inputs.cuda(non_blocking=True) - labels = labels.cuda(non_blocking=True) + # dispatch data to GPU + if use_cuda: + inputs = inputs.cuda(non_blocking=True) + labels = labels.cuda(non_blocking=True) - # forward pass model - outputs = model(inputs) + # forward pass model + outputs = model(inputs) - # loss computation - loss = criterion(outputs.view(c.num_speakers_in_batch, outputs.shape[0] // c.num_speakers_in_batch, -1), labels) - loss.backward() - grad_norm, _ = check_update(model, c.grad_clip) - optimizer.step() + # loss computation + loss = criterion(outputs.view(c.num_classes_in_batch, outputs.shape[0] // c.num_classes_in_batch, -1), labels) + loss.backward() + grad_norm, _ = check_update(model, c.grad_clip) + optimizer.step() - step_time = time.time() - start_time - epoch_time += step_time + step_time = time.time() - start_time + epoch_time += step_time - # Averaged Loss and Averaged Loader Time - avg_loss = 0.01 * loss.item() + 0.99 * avg_loss if avg_loss != 0 else loss.item() - num_loader_workers = c.num_loader_workers if c.num_loader_workers > 0 else 1 - avg_loader_time = ( - 1 / num_loader_workers * loader_time + (num_loader_workers - 1) / num_loader_workers * avg_loader_time - if avg_loader_time != 0 - else loader_time - ) - current_lr = optimizer.param_groups[0]["lr"] + # acumulate the total epoch loss + tot_loss += loss.item() - if global_step % c.steps_plot_stats == 0: - # Plot Training Epoch Stats - train_stats = { - "loss": avg_loss, - "lr": current_lr, - "grad_norm": grad_norm, - "step_time": step_time, - "avg_loader_time": avg_loader_time, - } - dashboard_logger.train_epoch_stats(global_step, train_stats) - figures = { - # FIXME: not constant - "UMAP Plot": plot_embeddings(outputs.detach().cpu().numpy(), 10), - } - dashboard_logger.train_figures(global_step, figures) - - if global_step % c.print_step == 0: - print( - " | > Step:{} Loss:{:.5f} AvgLoss:{:.5f} GradNorm:{:.5f} " - "StepTime:{:.2f} LoaderTime:{:.2f} AvGLoaderTime:{:.2f} LR:{:.6f}".format( - global_step, loss.item(), avg_loss, grad_norm, step_time, loader_time, avg_loader_time, current_lr - ), - flush=True, + # Averaged Loader Time + num_loader_workers = c.num_loader_workers if c.num_loader_workers > 0 else 1 + avg_loader_time = ( + 1 / 
num_loader_workers * loader_time + (num_loader_workers - 1) / num_loader_workers * avg_loader_time + if avg_loader_time != 0 + else loader_time ) - avg_loss_all += avg_loss + current_lr = optimizer.param_groups[0]["lr"] - if global_step >= c.max_train_step or global_step % c.save_step == 0: - # save best model only - best_loss = save_best_model(model, optimizer, criterion, avg_loss, best_loss, OUT_PATH, global_step) - avg_loss_all = 0 - if global_step >= c.max_train_step: - break + if global_step % c.steps_plot_stats == 0: + # Plot Training Epoch Stats + train_stats = { + "loss": loss.item(), + "lr": current_lr, + "grad_norm": grad_norm, + "step_time": step_time, + "avg_loader_time": avg_loader_time, + } + dashboard_logger.train_epoch_stats(global_step, train_stats) + figures = { + "UMAP Plot": plot_embeddings(outputs.detach().cpu().numpy(), c.num_classes_in_batch), + } + dashboard_logger.train_figures(global_step, figures) - end_time = time.time() + if global_step % c.print_step == 0: + print( + " | > Step:{} Loss:{:.5f} GradNorm:{:.5f} " + "StepTime:{:.2f} LoaderTime:{:.2f} AvGLoaderTime:{:.2f} LR:{:.6f}".format( + global_step, loss.item(), grad_norm, step_time, loader_time, avg_loader_time, current_lr + ), + flush=True, + ) - return avg_loss, global_step + if global_step % c.save_step == 0: + # save model + save_checkpoint(model, optimizer, criterion, loss.item(), OUT_PATH, global_step, epoch) + + end_time = time.time() + + print("") + print( + ">>> Epoch:{} AvgLoss: {:.5f} GradNorm:{:.5f} " + "EpochTime:{:.2f} AvGLoaderTime:{:.2f} ".format( + epoch, tot_loss/len(data_loader), grad_norm, epoch_time, avg_loader_time + ), + flush=True, + ) + # evaluation + if c.run_eval: + model.eval() + eval_loss = evaluation(model, criterion, eval_data_loader, global_step) + print("\n\n") + print("--> EVAL PERFORMANCE") + print( + " | > Epoch:{} AvgLoss: {:.5f} ".format( + epoch, eval_loss + ), + flush=True, + ) + # save the best checkpoint + best_loss = save_best_model(model, optimizer, criterion, eval_loss, best_loss, OUT_PATH, global_step, epoch) + model.train() + + return best_loss, global_step def main(args): # pylint: disable=redefined-outer-name # pylint: disable=global-variable-undefined global meta_data_train global meta_data_eval + global train_classes ap = AudioProcessor(**c.audio) model = setup_speaker_encoder_model(c) - optimizer = RAdam(model.parameters(), lr=c.lr) + optimizer = get_optimizer(c.optimizer, c.optimizer_params, c.lr, model) # pylint: disable=redefined-outer-name - meta_data_train, meta_data_eval = load_tts_samples(c.datasets, eval_split=False) + meta_data_train, meta_data_eval = load_tts_samples(c.datasets, eval_split=True) - data_loader, num_speakers = setup_loader(ap, is_val=False, verbose=True) - - if c.loss == "ge2e": - criterion = GE2ELoss(loss_method="softmax") - elif c.loss == "angleproto": - criterion = AngleProtoLoss() - elif c.loss == "softmaxproto": - criterion = SoftmaxAngleProtoLoss(c.model_params["proj_dim"], num_speakers) + train_data_loader, train_classes, map_classid_to_classname = setup_loader(ap, is_val=False, verbose=True) + if c.run_eval: + eval_data_loader, _, _ = setup_loader(ap, is_val=True, verbose=True) else: - raise Exception("The %s not is a loss supported" % c.loss) + eval_data_loader = None + + num_classes = len(train_classes) + criterion = model.get_criterion(c, num_classes) + + if c.loss == "softmaxproto" and c.model != "speaker_encoder": + c.map_classid_to_classname = map_classid_to_classname + copy_model_files(c, OUT_PATH) if 
args.restore_path: - checkpoint = load_fsspec(args.restore_path) - try: - model.load_state_dict(checkpoint["model"]) - - if "criterion" in checkpoint: - criterion.load_state_dict(checkpoint["criterion"]) - - except (KeyError, RuntimeError): - print(" > Partial model initialization.") - model_dict = model.state_dict() - model_dict = set_init_dict(model_dict, checkpoint["model"], c) - model.load_state_dict(model_dict) - del model_dict - for group in optimizer.param_groups: - group["lr"] = c.lr - - print(" > Model restored from step %d" % checkpoint["step"], flush=True) - args.restore_step = checkpoint["step"] + criterion, args.restore_step = model.load_checkpoint(c, args.restore_path, eval=False, use_cuda=use_cuda, criterion=criterion) + print(" > Model restored from step %d" % args.restore_step, flush=True) else: args.restore_step = 0 @@ -206,7 +280,7 @@ def main(args): # pylint: disable=redefined-outer-name criterion.cuda() global_step = args.restore_step - _, global_step = train(model, optimizer, scheduler, criterion, data_loader, global_step) + _, global_step = train(model, optimizer, scheduler, criterion, train_data_loader, eval_data_loader, global_step) if __name__ == "__main__": diff --git a/TTS/config/__init__.py b/TTS/config/__init__.py index 5c905295..6b0778c5 100644 --- a/TTS/config/__init__.py +++ b/TTS/config/__init__.py @@ -37,7 +37,7 @@ def register_config(model_name: str) -> Coqpit: """ config_class = None config_name = model_name + "_config" - paths = ["TTS.tts.configs", "TTS.vocoder.configs", "TTS.speaker_encoder"] + paths = ["TTS.tts.configs", "TTS.vocoder.configs", "TTS.encoder.configs"] for path in paths: try: config_class = find_module(path, config_name) diff --git a/TTS/speaker_encoder/README.md b/TTS/encoder/README.md similarity index 100% rename from TTS/speaker_encoder/README.md rename to TTS/encoder/README.md diff --git a/TTS/speaker_encoder/__init__.py b/TTS/encoder/__init__.py similarity index 100% rename from TTS/speaker_encoder/__init__.py rename to TTS/encoder/__init__.py diff --git a/TTS/speaker_encoder/speaker_encoder_config.py b/TTS/encoder/configs/base_encoder_config.py similarity index 66% rename from TTS/speaker_encoder/speaker_encoder_config.py rename to TTS/encoder/configs/base_encoder_config.py index 8212acc7..02b88d66 100644 --- a/TTS/speaker_encoder/speaker_encoder_config.py +++ b/TTS/encoder/configs/base_encoder_config.py @@ -7,10 +7,10 @@ from TTS.config.shared_configs import BaseAudioConfig, BaseDatasetConfig, BaseTr @dataclass -class SpeakerEncoderConfig(BaseTrainingConfig): - """Defines parameters for Speaker Encoder model.""" +class BaseEncoderConfig(BaseTrainingConfig): + """Defines parameters for a Generic Encoder model.""" - model: str = "speaker_encoder" + model: str = None audio: BaseAudioConfig = field(default_factory=BaseAudioConfig) datasets: List[BaseDatasetConfig] = field(default_factory=lambda: [BaseDatasetConfig()]) # model params @@ -27,34 +27,33 @@ class SpeakerEncoderConfig(BaseTrainingConfig): audio_augmentation: Dict = field(default_factory=lambda: {}) - storage: Dict = field( - default_factory=lambda: { - "sample_from_storage_p": 0.66, # the probability with which we'll sample from the DataSet in-memory storage - "storage_size": 15, # the size of the in-memory storage with respect to a single batch - } - ) - # training params - max_train_step: int = 1000000 # end training when number of training steps reaches this value. 
+ epochs: int = 10000 loss: str = "angleproto" grad_clip: float = 3.0 lr: float = 0.0001 + optimizer: str = "radam" + optimizer_params: Dict = field(default_factory=lambda: { + "betas": [0.9, 0.999], + "weight_decay": 0 + }) lr_decay: bool = False warmup_steps: int = 4000 - wd: float = 1e-6 # logging params tb_model_param_stats: bool = False steps_plot_stats: int = 10 - checkpoint: bool = True save_step: int = 1000 print_step: int = 20 + run_eval: bool = False # data loader - num_speakers_in_batch: int = MISSING - num_utters_per_speaker: int = MISSING + num_classes_in_batch: int = MISSING + num_utter_per_class: int = MISSING + eval_num_classes_in_batch: int = None + eval_num_utter_per_class: int = None + num_loader_workers: int = MISSING - skip_speakers: bool = False voice_len: float = 1.6 def check_values(self): diff --git a/TTS/encoder/configs/emotion_encoder_config.py b/TTS/encoder/configs/emotion_encoder_config.py new file mode 100644 index 00000000..5eda2671 --- /dev/null +++ b/TTS/encoder/configs/emotion_encoder_config.py @@ -0,0 +1,12 @@ +from dataclasses import asdict, dataclass + +from TTS.encoder.configs.base_encoder_config import BaseEncoderConfig + + +@dataclass +class EmotionEncoderConfig(BaseEncoderConfig): + """Defines parameters for Emotion Encoder model.""" + + model: str = "emotion_encoder" + map_classid_to_classname: dict = None + class_name_key: str = "emotion_name" diff --git a/TTS/encoder/configs/speaker_encoder_config.py b/TTS/encoder/configs/speaker_encoder_config.py new file mode 100644 index 00000000..6dceb002 --- /dev/null +++ b/TTS/encoder/configs/speaker_encoder_config.py @@ -0,0 +1,11 @@ +from dataclasses import asdict, dataclass + +from TTS.encoder.configs.base_encoder_config import BaseEncoderConfig + + +@dataclass +class SpeakerEncoderConfig(BaseEncoderConfig): + """Defines parameters for Speaker Encoder model.""" + + model: str = "speaker_encoder" + class_name_key: str = "speaker_name" diff --git a/TTS/encoder/dataset.py b/TTS/encoder/dataset.py new file mode 100644 index 00000000..a4db021b --- /dev/null +++ b/TTS/encoder/dataset.py @@ -0,0 +1,149 @@ +import random + +import torch +from torch.utils.data import Dataset + +from TTS.encoder.utils.generic_utils import AugmentWAV + +class EncoderDataset(Dataset): + def __init__( + self, + config, + ap, + meta_data, + voice_len=1.6, + num_classes_in_batch=64, + num_utter_per_class=10, + verbose=False, + augmentation_config=None, + use_torch_spec=None, + ): + """ + Args: + ap (TTS.tts.utils.AudioProcessor): audio processor object. + meta_data (list): list of dataset instances. + seq_len (int): voice segment length in seconds. + verbose (bool): print diagnostic information. 
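The two config subclasses above only pin down model and class_name_key; the rest of the "generic encoder" behaviour follows from that key, since the formatters now return dicts and the dataset and evaluation code look the class label up via config.class_name_key. A small sketch of the idea, with hypothetical items and file names:

    # Hypothetical dataset items, shaped like the dicts the formatters now return.
    speaker_item = {"text": "hello", "audio_file": "clip_001.wav", "speaker_name": "VCTK_p225"}
    emotion_item = {"text": "hello", "audio_file": "clip_002.wav", "emotion_name": "happy"}

    # SpeakerEncoderConfig.class_name_key -> "speaker_name"
    # EmotionEncoderConfig.class_name_key -> "emotion_name"
    for item, key in [(speaker_item, "speaker_name"), (emotion_item, "emotion_name")]:
        print(item[key])  # VCTK_p225, then happy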
+ """ + super().__init__() + self.config = config + self.items = meta_data + self.sample_rate = ap.sample_rate + self.seq_len = int(voice_len * self.sample_rate) + self.num_utter_per_class = num_utter_per_class + self.ap = ap + self.verbose = verbose + self.use_torch_spec = use_torch_spec + self.classes, self.items = self.__parse_items() + + self.classname_to_classid = {key: i for i, key in enumerate(self.classes)} + + # Data Augmentation + self.augmentator = None + self.gaussian_augmentation_config = None + if augmentation_config: + self.data_augmentation_p = augmentation_config["p"] + if self.data_augmentation_p and ("additive" in augmentation_config or "rir" in augmentation_config): + self.augmentator = AugmentWAV(ap, augmentation_config) + + if "gaussian" in augmentation_config.keys(): + self.gaussian_augmentation_config = augmentation_config["gaussian"] + + if self.verbose: + print("\n > DataLoader initialization") + print(f" | > Classes per Batch: {num_classes_in_batch}") + print(f" | > Number of instances : {len(self.items)}") + print(f" | > Sequence length: {self.seq_len}") + print(f" | > Num Classes: {len(self.classes)}") + print(f" | > Classes: {self.classes}") + + + def load_wav(self, filename): + audio = self.ap.load_wav(filename, sr=self.ap.sample_rate) + return audio + + def __parse_items(self): + class_to_utters = {} + for item in self.items: + path_ = item["audio_file"] + class_name = item[self.config.class_name_key] + if class_name in class_to_utters.keys(): + class_to_utters[class_name].append(path_) + else: + class_to_utters[class_name] = [ + path_, + ] + + # skip classes with number of samples >= self.num_utter_per_class + class_to_utters = { + k: v for (k, v) in class_to_utters.items() if len(v) >= self.num_utter_per_class + } + + classes = list(class_to_utters.keys()) + classes.sort() + + new_items = [] + for item in self.items: + path_ = item["audio_file"] + class_name = item["emotion_name"] if self.config.model == "emotion_encoder" else item["speaker_name"] + # ignore filtered classes + if class_name not in classes: + continue + # ignore small audios + if self.load_wav(path_).shape[0] - self.seq_len <= 0: + continue + + new_items.append({"wav_file_path": path_, "class_name": class_name}) + + return classes, new_items + + def __len__(self): + return len(self.items) + + def get_num_classes(self): + return len(self.classes) + + def get_class_list(self): + return self.classes + def set_classes(self, classes): + self.classes = classes + self.classname_to_classid = {key: i for i, key in enumerate(self.classes)} + + + def get_map_classid_to_classname(self): + return dict((c_id, c_n) for c_n, c_id in self.classname_to_classid.items()) + + def __getitem__(self, idx): + return self.items[idx] + + def collate_fn(self, batch): + # get the batch class_ids + labels = [] + feats = [] + for item in batch: + utter_path = item["wav_file_path"] + class_name = item["class_name"] + + # get classid + class_id = self.classname_to_classid[class_name] + # load wav file + wav = self.load_wav(utter_path) + offset = random.randint(0, wav.shape[0] - self.seq_len) + wav = wav[offset : offset + self.seq_len] + + if self.augmentator is not None and self.data_augmentation_p: + if random.random() < self.data_augmentation_p: + wav = self.augmentator.apply_one(wav) + + if not self.use_torch_spec: + mel = self.ap.melspectrogram(wav) + feats.append(torch.FloatTensor(mel)) + else: + feats.append(torch.FloatTensor(wav)) + + labels.append(class_id) + + feats = torch.stack(feats) + labels = 
torch.LongTensor(labels) + + return feats, labels diff --git a/TTS/speaker_encoder/losses.py b/TTS/encoder/losses.py similarity index 97% rename from TTS/speaker_encoder/losses.py rename to TTS/encoder/losses.py index 8ba917b7..de65d8d6 100644 --- a/TTS/speaker_encoder/losses.py +++ b/TTS/encoder/losses.py @@ -189,6 +189,11 @@ class SoftmaxLoss(nn.Module): return L + def inference(self, embedding): + x = self.fc(embedding) + activations = torch.nn.functional.softmax(x, dim=1).squeeze(0) + class_id = torch.argmax(activations) + return class_id class SoftmaxAngleProtoLoss(nn.Module): """ diff --git a/TTS/encoder/models/base_encoder.py b/TTS/encoder/models/base_encoder.py new file mode 100644 index 00000000..c35c636d --- /dev/null +++ b/TTS/encoder/models/base_encoder.py @@ -0,0 +1,145 @@ +import torch +import torchaudio +import numpy as np +from torch import nn + +from TTS.utils.io import load_fsspec +from TTS.encoder.losses import AngleProtoLoss, GE2ELoss, SoftmaxAngleProtoLoss +from TTS.utils.generic_utils import set_init_dict +from coqpit import Coqpit + +class PreEmphasis(nn.Module): + def __init__(self, coefficient=0.97): + super().__init__() + self.coefficient = coefficient + self.register_buffer("filter", torch.FloatTensor([-self.coefficient, 1.0]).unsqueeze(0).unsqueeze(0)) + + def forward(self, x): + assert len(x.size()) == 2 + + x = torch.nn.functional.pad(x.unsqueeze(1), (1, 0), "reflect") + return torch.nn.functional.conv1d(x, self.filter).squeeze(1) + +class BaseEncoder(nn.Module): + """Base `encoder` class. Every new `encoder` model must inherit this. + + It defines common `encoder` specific functions. + """ + + # pylint: disable=W0102 + def __init__(self): + super(BaseEncoder, self).__init__() + + def get_torch_mel_spectrogram_class(self, audio_config): + return torch.nn.Sequential( + PreEmphasis(audio_config["preemphasis"]), + # TorchSTFT( + # n_fft=audio_config["fft_size"], + # hop_length=audio_config["hop_length"], + # win_length=audio_config["win_length"], + # sample_rate=audio_config["sample_rate"], + # window="hamming_window", + # mel_fmin=0.0, + # mel_fmax=None, + # use_htk=True, + # do_amp_to_db=False, + # n_mels=audio_config["num_mels"], + # power=2.0, + # use_mel=True, + # mel_norm=None, + # ) + torchaudio.transforms.MelSpectrogram( + sample_rate=audio_config["sample_rate"], + n_fft=audio_config["fft_size"], + win_length=audio_config["win_length"], + hop_length=audio_config["hop_length"], + window_fn=torch.hamming_window, + n_mels=audio_config["num_mels"], + ) + ) + + @torch.no_grad() + def inference(self, x, l2_norm=True): + return self.forward(x, l2_norm) + + @torch.no_grad() + def compute_embedding(self, x, num_frames=250, num_eval=10, return_mean=True, l2_norm=True): + """ + Generate embeddings for a batch of utterances + x: 1xTxD + """ + # map to the waveform size + if self.use_torch_spec: + num_frames = num_frames * self.audio_config["hop_length"] + + max_len = x.shape[1] + + if max_len < num_frames: + num_frames = max_len + + offsets = np.linspace(0, max_len - num_frames, num=num_eval) + + frames_batch = [] + for offset in offsets: + offset = int(offset) + end_offset = int(offset + num_frames) + frames = x[:, offset:end_offset] + frames_batch.append(frames) + + frames_batch = torch.cat(frames_batch, dim=0) + embeddings = self.inference(frames_batch, l2_norm=l2_norm) + + if return_mean: + embeddings = torch.mean(embeddings, dim=0, keepdim=True) + return embeddings + + def get_criterion(self, c: Coqpit, num_classes=None): + if c.loss == "ge2e": + criterion = 
GE2ELoss(loss_method="softmax") + elif c.loss == "angleproto": + criterion = AngleProtoLoss() + elif c.loss == "softmaxproto": + criterion = SoftmaxAngleProtoLoss(c.model_params["proj_dim"], num_classes) + else: + raise Exception("The %s not is a loss supported" % c.loss) + return criterion + + def load_checkpoint(self, config: Coqpit, checkpoint_path: str, eval: bool = False, use_cuda: bool = False, criterion=None): + state = load_fsspec(checkpoint_path, map_location=torch.device("cpu")) + try: + self.load_state_dict(state["model"]) + except (KeyError, RuntimeError) as error: + # If eval raise the error + if eval: + raise error + + print(" > Partial model initialization.") + model_dict = self.state_dict() + model_dict = set_init_dict(model_dict, state["model"], c) + self.load_state_dict(model_dict) + del model_dict + + # load the criterion for restore_path + if criterion is not None and "criterion" in state: + try: + criterion.load_state_dict(state["criterion"]) + except (KeyError, RuntimeError) as error: + print(" > Criterion load ignored because of:", error) + + # instance and load the criterion for the encoder classifier in inference time + if eval and criterion is None and "criterion" in state and getattr(config, 'map_classid_to_classname', None) is not None: + criterion = self.get_criterion(config, len(config.map_classid_to_classname)) + criterion.load_state_dict(state["criterion"]) + + if use_cuda: + self.cuda() + if criterion is not None: + criterion = criterion.cuda() + + if eval: + self.eval() + assert not self.training + + if not eval: + return criterion, state["step"] + return criterion diff --git a/TTS/encoder/models/lstm.py b/TTS/encoder/models/lstm.py new file mode 100644 index 00000000..51852b5b --- /dev/null +++ b/TTS/encoder/models/lstm.py @@ -0,0 +1,99 @@ +import torch +from torch import nn + +from TTS.encoder.models.base_encoder import BaseEncoder + + +class LSTMWithProjection(nn.Module): + def __init__(self, input_size, hidden_size, proj_size): + super().__init__() + self.input_size = input_size + self.hidden_size = hidden_size + self.proj_size = proj_size + self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True) + self.linear = nn.Linear(hidden_size, proj_size, bias=False) + + def forward(self, x): + self.lstm.flatten_parameters() + o, (_, _) = self.lstm(x) + return self.linear(o) + + +class LSTMWithoutProjection(nn.Module): + def __init__(self, input_dim, lstm_dim, proj_dim, num_lstm_layers): + super().__init__() + self.lstm = nn.LSTM(input_size=input_dim, hidden_size=lstm_dim, num_layers=num_lstm_layers, batch_first=True) + self.linear = nn.Linear(lstm_dim, proj_dim, bias=True) + self.relu = nn.ReLU() + + def forward(self, x): + _, (hidden, _) = self.lstm(x) + return self.relu(self.linear(hidden[-1])) + + +class LSTMSpeakerEncoder(BaseEncoder): + def __init__( + self, + input_dim, + proj_dim=256, + lstm_dim=768, + num_lstm_layers=3, + use_lstm_with_projection=True, + use_torch_spec=False, + audio_config=None, + ): + super().__init__() + self.use_lstm_with_projection = use_lstm_with_projection + self.use_torch_spec = use_torch_spec + self.audio_config = audio_config + self.proj_dim = proj_dim + + layers = [] + # choise LSTM layer + if use_lstm_with_projection: + layers.append(LSTMWithProjection(input_dim, lstm_dim, proj_dim)) + for _ in range(num_lstm_layers - 1): + layers.append(LSTMWithProjection(proj_dim, lstm_dim, proj_dim)) + self.layers = nn.Sequential(*layers) + else: + self.layers = LSTMWithoutProjection(input_dim, lstm_dim, proj_dim, num_lstm_layers) 
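Once the forward pass further below is in place, a quick shape check of the LSTM encoder can be run with the constructor defaults shown here; the batch size, number of mel bands, and frame count in this sketch are made-up values:

    import torch
    from TTS.encoder.models.lstm import LSTMSpeakerEncoder

    # Hypothetical batch: 4 mel-spectrogram clips, 80 mel bands, 100 frames each.
    model = LSTMSpeakerEncoder(input_dim=80, proj_dim=256, lstm_dim=768, num_lstm_layers=3)
    x = torch.randn(4, 80, 100)   # (N, D_spec, T), as expected when use_torch_spec=False
    d = model(x)                  # forward() L2-normalises by default
    print(d.shape)                # torch.Size([4, 256])
    print(d.norm(dim=1))          # each row has (approximately) unit norm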
+ + self.instancenorm = nn.InstanceNorm1d(input_dim) + + if self.use_torch_spec: + self.torch_spec = self.get_torch_mel_spectrogram_class(audio_config) + else: + self.torch_spec = None + + self._init_layers() + + def _init_layers(self): + for name, param in self.layers.named_parameters(): + if "bias" in name: + nn.init.constant_(param, 0.0) + elif "weight" in name: + nn.init.xavier_normal_(param) + + def forward(self, x, l2_norm=True): + """Forward pass of the model. + + Args: + x (Tensor): Raw waveform signal or spectrogram frames. If input is a waveform, `torch_spec` must be `True` + to compute the spectrogram on-the-fly. + l2_norm (bool): Whether to L2-normalize the outputs. + + Shapes: + - x: :math:`(N, 1, T_{in})` or :math:`(N, D_{spec}, T_{in})` + """ + with torch.no_grad(): + with torch.cuda.amp.autocast(enabled=False): + if self.use_torch_spec: + x.squeeze_(1) + x = self.torch_spec(x) + x = self.instancenorm(x).transpose(1, 2) + d = self.layers(x) + if self.use_lstm_with_projection: + d = d[:, -1] + if l2_norm: + d = torch.nn.functional.normalize(d, p=2, dim=1) + return d diff --git a/TTS/speaker_encoder/models/resnet.py b/TTS/encoder/models/resnet.py similarity index 67% rename from TTS/speaker_encoder/models/resnet.py rename to TTS/encoder/models/resnet.py index a799fc52..c4ba9537 100644 --- a/TTS/speaker_encoder/models/resnet.py +++ b/TTS/encoder/models/resnet.py @@ -1,24 +1,8 @@ -import numpy as np import torch -import torchaudio from torch import nn # from TTS.utils.audio import TorchSTFT -from TTS.utils.io import load_fsspec - - -class PreEmphasis(nn.Module): - def __init__(self, coefficient=0.97): - super().__init__() - self.coefficient = coefficient - self.register_buffer("filter", torch.FloatTensor([-self.coefficient, 1.0]).unsqueeze(0).unsqueeze(0)) - - def forward(self, x): - assert len(x.size()) == 2 - - x = torch.nn.functional.pad(x.unsqueeze(1), (1, 0), "reflect") - return torch.nn.functional.conv1d(x, self.filter).squeeze(1) - +from TTS.encoder.models.base_encoder import BaseEncoder class SELayer(nn.Module): def __init__(self, channel, reduction=8): @@ -71,7 +55,7 @@ class SEBasicBlock(nn.Module): return out -class ResNetSpeakerEncoder(nn.Module): +class ResNetSpeakerEncoder(BaseEncoder): """Implementation of the model H/ASP without batch normalization in speaker embedding. 
This model was proposed in: https://arxiv.org/abs/2009.14153 Adapted from: https://github.com/clovaai/voxceleb_trainer """ @@ -110,32 +94,7 @@ class ResNetSpeakerEncoder(nn.Module): self.instancenorm = nn.InstanceNorm1d(input_dim) if self.use_torch_spec: - self.torch_spec = torch.nn.Sequential( - PreEmphasis(audio_config["preemphasis"]), - # TorchSTFT( - # n_fft=audio_config["fft_size"], - # hop_length=audio_config["hop_length"], - # win_length=audio_config["win_length"], - # sample_rate=audio_config["sample_rate"], - # window="hamming_window", - # mel_fmin=0.0, - # mel_fmax=None, - # use_htk=True, - # do_amp_to_db=False, - # n_mels=audio_config["num_mels"], - # power=2.0, - # use_mel=True, - # mel_norm=None, - # ) - torchaudio.transforms.MelSpectrogram( - sample_rate=audio_config["sample_rate"], - n_fft=audio_config["fft_size"], - win_length=audio_config["win_length"], - hop_length=audio_config["hop_length"], - window_fn=torch.hamming_window, - n_mels=audio_config["num_mels"], - ), - ) + self.torch_spec = self.get_torch_mel_spectrogram_class(audio_config) else: self.torch_spec = None @@ -238,47 +197,3 @@ class ResNetSpeakerEncoder(nn.Module): if l2_norm: x = torch.nn.functional.normalize(x, p=2, dim=1) return x - - @torch.no_grad() - def inference(self, x, l2_norm=False): - return self.forward(x, l2_norm) - - @torch.no_grad() - def compute_embedding(self, x, num_frames=250, num_eval=10, return_mean=True, l2_norm=True): - """ - Generate embeddings for a batch of utterances - x: 1xTxD - """ - # map to the waveform size - if self.use_torch_spec: - num_frames = num_frames * self.audio_config["hop_length"] - - max_len = x.shape[1] - - if max_len < num_frames: - num_frames = max_len - - offsets = np.linspace(0, max_len - num_frames, num=num_eval) - - frames_batch = [] - for offset in offsets: - offset = int(offset) - end_offset = int(offset + num_frames) - frames = x[:, offset:end_offset] - frames_batch.append(frames) - - frames_batch = torch.cat(frames_batch, dim=0) - embeddings = self.inference(frames_batch, l2_norm=l2_norm) - - if return_mean: - embeddings = torch.mean(embeddings, dim=0, keepdim=True) - return embeddings - - def load_checkpoint(self, config: dict, checkpoint_path: str, eval: bool = False, use_cuda: bool = False): - state = load_fsspec(checkpoint_path, map_location=torch.device("cpu")) - self.load_state_dict(state["model"]) - if use_cuda: - self.cuda() - if eval: - self.eval() - assert not self.training diff --git a/TTS/speaker_encoder/requirements.txt b/TTS/encoder/requirements.txt similarity index 100% rename from TTS/speaker_encoder/requirements.txt rename to TTS/encoder/requirements.txt diff --git a/TTS/speaker_encoder/utils/__init__.py b/TTS/encoder/utils/__init__.py similarity index 100% rename from TTS/speaker_encoder/utils/__init__.py rename to TTS/encoder/utils/__init__.py diff --git a/TTS/speaker_encoder/utils/generic_utils.py b/TTS/encoder/utils/generic_utils.py similarity index 80% rename from TTS/speaker_encoder/utils/generic_utils.py rename to TTS/encoder/utils/generic_utils.py index 4ab4e923..17f1c3d9 100644 --- a/TTS/speaker_encoder/utils/generic_utils.py +++ b/TTS/encoder/utils/generic_utils.py @@ -3,60 +3,15 @@ import glob import os import random import re -from multiprocessing import Manager import numpy as np from scipy import signal -from TTS.speaker_encoder.models.lstm import LSTMSpeakerEncoder -from TTS.speaker_encoder.models.resnet import ResNetSpeakerEncoder +from TTS.encoder.models.lstm import LSTMSpeakerEncoder +from TTS.encoder.models.resnet import 
ResNetSpeakerEncoder from TTS.utils.io import save_fsspec -class Storage(object): - def __init__(self, maxsize, storage_batchs, num_speakers_in_batch, num_threads=8): - # use multiprocessing for threading safe - self.storage = Manager().list() - self.maxsize = maxsize - self.num_speakers_in_batch = num_speakers_in_batch - self.num_threads = num_threads - self.ignore_last_batch = False - - if storage_batchs >= 3: - self.ignore_last_batch = True - - # used for fast random sample - self.safe_storage_size = self.maxsize - self.num_threads - if self.ignore_last_batch: - self.safe_storage_size -= self.num_speakers_in_batch - - def __len__(self): - return len(self.storage) - - def full(self): - return len(self.storage) >= self.maxsize - - def append(self, item): - # if storage is full, remove an item - if self.full(): - self.storage.pop(0) - - self.storage.append(item) - - def get_random_sample(self): - # safe storage size considering all threads remove one item from storage in same time - storage_size = len(self.storage) - self.num_threads - - if self.ignore_last_batch: - storage_size -= self.num_speakers_in_batch - - return self.storage[random.randint(0, storage_size)] - - def get_random_sample_fast(self): - """Call this method only when storage is full""" - return self.storage[random.randint(0, self.safe_storage_size)] - - class AugmentWAV(object): def __init__(self, ap, augmentation_config): @@ -209,7 +164,7 @@ def save_checkpoint(model, optimizer, criterion, model_loss, out_path, current_s save_fsspec(state, checkpoint_path) -def save_best_model(model, optimizer, criterion, model_loss, best_loss, out_path, current_step): +def save_best_model(model, optimizer, criterion, model_loss, best_loss, out_path, current_step, epoch): if model_loss < best_loss: new_state_dict = model.state_dict() state = { @@ -217,6 +172,7 @@ def save_best_model(model, optimizer, criterion, model_loss, best_loss, out_path "optimizer": optimizer.state_dict(), "criterion": criterion.state_dict(), "step": current_step, + "epoch": epoch, "loss": model_loss, "date": datetime.date.today().strftime("%B %d, %Y"), } diff --git a/TTS/speaker_encoder/utils/io.py b/TTS/encoder/utils/io.py similarity index 100% rename from TTS/speaker_encoder/utils/io.py rename to TTS/encoder/utils/io.py diff --git a/TTS/speaker_encoder/utils/prepare_voxceleb.py b/TTS/encoder/utils/prepare_voxceleb.py similarity index 100% rename from TTS/speaker_encoder/utils/prepare_voxceleb.py rename to TTS/encoder/utils/prepare_voxceleb.py diff --git a/TTS/encoder/utils/samplers.py b/TTS/encoder/utils/samplers.py new file mode 100644 index 00000000..947f5da0 --- /dev/null +++ b/TTS/encoder/utils/samplers.py @@ -0,0 +1,102 @@ +import random +from torch.utils.data.sampler import Sampler, SubsetRandomSampler + + +class SubsetSampler(Sampler): + """ + Samples elements sequentially from a given list of indices. + + Args: + indices (list): a sequence of indices + """ + + def __init__(self, indices): + super().__init__(indices) + self.indices = indices + + def __iter__(self): + return (self.indices[i] for i in range(len(self.indices))) + + def __len__(self): + return len(self.indices) + + +class PerfectBatchSampler(Sampler): + """ + Samples a mini-batch of indices for a balanced class batching + + Args: + dataset_items(list): dataset items to sample from. + classes (list): list of classes of dataset_items to sample from. + batch_size (int): total number of samples to be sampled in a mini-batch. + num_gpus (int): number of GPU in the data parallel mode. 
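PerfectBatchSampler yields each mini-batch class-interleaved: with, say, 3 classes per batch and 2 utterances per class the labels come out as [c0, c1, c2, c0, c1, c2]. The train() and evaluation() loops above then regroup the batch so that all utterances of a class are contiguous before the loss views it as (num_classes_in_batch, num_utter_per_class, -1). A small sketch of that regrouping, with made-up sizes and features:

    import torch

    # Hypothetical batch: 3 classes per batch, 2 utterances per class,
    # laid out class-interleaved as the sampler produces them.
    num_classes_in_batch, num_utter_per_class = 3, 2
    labels = torch.tensor([0, 1, 2, 0, 1, 2])
    feats = torch.arange(6 * 4, dtype=torch.float32).view(6, 4)  # (batch, feat_dim)

    # Same transpose trick as in train()/evaluation().
    labels_g = torch.transpose(
        labels.view(num_utter_per_class, num_classes_in_batch), 0, 1
    ).reshape(labels.shape)
    feats_g = torch.transpose(
        feats.view(num_utter_per_class, num_classes_in_batch, -1), 0, 1
    ).reshape(feats.shape)

    print(labels_g)  # tensor([0, 0, 1, 1, 2, 2])
    loss_input = feats_g.view(num_classes_in_batch, feats_g.shape[0] // num_classes_in_batch, -1)
    print(loss_input.shape)  # torch.Size([3, 2, 4])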
+ shuffle (bool): if True, samples randomly, otherwise samples sequentially. + drop_last (bool): if True, drops last incomplete batch. + """ + + def __init__(self, dataset_items, classes, batch_size, num_classes_in_batch, num_gpus=1, shuffle=True, drop_last=False, label_key="class_name"): + super().__init__(dataset_items) + assert batch_size % (num_classes_in_batch * num_gpus) == 0, ( + 'Batch size must be divisible by number of classes times the number of data parallel devices (if enabled).') + + label_indices = {} + for idx, item in enumerate(dataset_items): + label = item[label_key] + if label not in label_indices.keys(): + label_indices[label] = [idx] + else: + label_indices[label].append(idx) + + if shuffle: + self._samplers = [SubsetRandomSampler(label_indices[key]) for key in classes] + else: + self._samplers = [SubsetSampler(label_indices[key]) for key in classes] + + self._batch_size = batch_size + self._drop_last = drop_last + self._dp_devices = num_gpus + self._num_classes_in_batch = num_classes_in_batch + + def __iter__(self): + + batch = [] + if self._num_classes_in_batch != len(self._samplers): + valid_samplers_idx = random.sample(range(len(self._samplers)), self._num_classes_in_batch) + else: + valid_samplers_idx = None + + iters = [iter(s) for s in self._samplers] + done = False + + while True: + b = [] + for i, it in enumerate(iters): + if valid_samplers_idx is not None and i not in valid_samplers_idx: + continue + idx = next(it, None) + if idx is None: + done = True + break + b.append(idx) + if done: + break + batch += b + if len(batch) == self._batch_size: + yield batch + batch = [] + if valid_samplers_idx is not None: + valid_samplers_idx = random.sample(range(len(self._samplers)), self._num_classes_in_batch) + + if not self._drop_last: + if len(batch) > 0: + groups = len(batch) // self._num_classes_in_batch + if groups % self._dp_devices == 0: + yield batch + else: + batch = batch[:(groups // self._dp_devices) * self._dp_devices * self._num_classes_in_batch] + if len(batch) > 0: + yield batch + + def __len__(self): + class_batch_size = self._batch_size // self._num_classes_in_batch + return min(((len(s) + class_batch_size - 1) // class_batch_size) for s in self._samplers) diff --git a/TTS/speaker_encoder/utils/training.py b/TTS/encoder/utils/training.py similarity index 100% rename from TTS/speaker_encoder/utils/training.py rename to TTS/encoder/utils/training.py diff --git a/TTS/speaker_encoder/utils/visual.py b/TTS/encoder/utils/visual.py similarity index 69% rename from TTS/speaker_encoder/utils/visual.py rename to TTS/encoder/utils/visual.py index 4f40f68c..f2db2f3f 100644 --- a/TTS/speaker_encoder/utils/visual.py +++ b/TTS/encoder/utils/visual.py @@ -29,14 +29,18 @@ colormap = ( ) -def plot_embeddings(embeddings, num_utter_per_speaker): - embeddings = embeddings[: 10 * num_utter_per_speaker] +def plot_embeddings(embeddings, num_classes_in_batch): + num_utter_per_class = embeddings.shape[0] // num_classes_in_batch + + # if necessary get just the first 10 classes + if num_classes_in_batch > 10: + num_classes_in_batch = 10 + embeddings = embeddings[: num_classes_in_batch * num_utter_per_class] + model = umap.UMAP() projection = model.fit_transform(embeddings) - num_speakers = embeddings.shape[0] // num_utter_per_speaker - ground_truth = np.repeat(np.arange(num_speakers), num_utter_per_speaker) + ground_truth = np.repeat(np.arange(num_classes_in_batch), num_utter_per_class) colors = [colormap[i] for i in ground_truth] - fig, ax = plt.subplots(figsize=(16, 10)) _ = 
ax.scatter(projection[:, 0], projection[:, 1], c=colors) plt.gca().set_aspect("equal", "datalim") diff --git a/TTS/speaker_encoder/configs/config.json b/TTS/speaker_encoder/configs/config.json deleted file mode 100644 index 30d83e51..00000000 --- a/TTS/speaker_encoder/configs/config.json +++ /dev/null @@ -1,118 +0,0 @@ - -{ - "model_name": "lstm", - "run_name": "mueller91", - "run_description": "train speaker encoder with voxceleb1, voxceleb2 and libriSpeech ", - "audio":{ - // Audio processing parameters - "num_mels": 40, // size of the mel spec frame. - "fft_size": 400, // number of stft frequency levels. Size of the linear spectogram frame. - "sample_rate": 16000, // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled. - "win_length": 400, // stft window length in ms. - "hop_length": 160, // stft window hop-lengh in ms. - "frame_length_ms": null, // stft window length in ms.If null, 'win_length' is used. - "frame_shift_ms": null, // stft window hop-lengh in ms. If null, 'hop_length' is used. - "preemphasis": 0.98, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis. - "min_level_db": -100, // normalization range - "ref_level_db": 20, // reference level db, theoretically 20db is the sound of air. - "power": 1.5, // value to sharpen wav signals after GL algorithm. - "griffin_lim_iters": 60,// #griffin-lim iterations. 30-60 is a good range. Larger the value, slower the generation. - // Normalization parameters - "signal_norm": true, // normalize the spec values in range [0, 1] - "symmetric_norm": true, // move normalization to range [-1, 1] - "max_norm": 4.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm] - "clip_norm": true, // clip normalized values into the range. - "mel_fmin": 0.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!! - "mel_fmax": 8000.0, // maximum freq level for mel-spec. Tune for dataset!! - "do_trim_silence": true, // enable trimming of slience of audio as you load it. LJspeech (false), TWEB (false), Nancy (true) - "trim_db": 60, // threshold for timming silence. Set this according to your dataset. - "stats_path": null // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based notmalization is used and other normalization params are ignored - }, - "reinit_layers": [], - "loss": "angleproto", // "ge2e" to use Generalized End-to-End loss and "angleproto" to use Angular Prototypical loss (new SOTA) - "grad_clip": 3.0, // upper limit for gradients for clipping. - "epochs": 1000, // total number of epochs to train. - "lr": 0.0001, // Initial learning rate. If Noam decay is active, maximum learning rate. - "lr_decay": false, // if true, Noam learning rate decaying is applied through training. - "warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr" - "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging. - "steps_plot_stats": 10, // number of steps to plot embeddings. - "num_speakers_in_batch": 64, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'. 
- "num_utters_per_speaker": 10, // - "skip_speakers": false, // skip speakers with samples less than "num_utters_per_speaker" - - "voice_len": 1.6, // number of seconds for each training instance - "num_loader_workers": 8, // number of training data loader processes. Don't set it too big. 4-8 are good values. - "wd": 0.000001, // Weight decay weight. - "checkpoint": true, // If true, it saves checkpoints per "save_step" - "save_step": 1000, // Number of training steps expected to save traning stats and checkpoints. - "print_step": 20, // Number of steps to log traning on console. - "output_path": "../../MozillaTTSOutput/checkpoints/voxceleb_librispeech/speaker_encoder/", // DATASET-RELATED: output path for all training outputs. - "model": { - "input_dim": 40, - "proj_dim": 256, - "lstm_dim": 768, - "num_lstm_layers": 3, - "use_lstm_with_projection": true - }, - - "audio_augmentation": { - "p": 0, - //add a gaussian noise to the data in order to increase robustness - "gaussian":{ // as the insertion of Gaussian noise is quick to be calculated, we added it after loading the wav file, this way, even audios that were reused with the cache can receive this noise - "p": 1, // propability of apply this method, 0 is disable - "min_amplitude": 0.0, - "max_amplitude": 1e-5 - } - }, - "storage": { - "sample_from_storage_p": 0.66, // the probability with which we'll sample from the DataSet in-memory storage - "storage_size": 15, // the size of the in-memory storage with respect to a single batch - "additive_noise": 1e-5 // add very small gaussian noise to the data in order to increase robustness - }, - "datasets": - [ - { - "name": "vctk_slim", - "path": "../../../audio-datasets/en/VCTK-Corpus/", - "meta_file_train": null, - "meta_file_val": null - }, - { - "name": "libri_tts", - "path": "../../../audio-datasets/en/LibriTTS/train-clean-100", - "meta_file_train": null, - "meta_file_val": null - }, - { - "name": "libri_tts", - "path": "../../../audio-datasets/en/LibriTTS/train-clean-360", - "meta_file_train": null, - "meta_file_val": null - }, - { - "name": "libri_tts", - "path": "../../../audio-datasets/en/LibriTTS/train-other-500", - "meta_file_train": null, - "meta_file_val": null - }, - { - "name": "voxceleb1", - "path": "../../../audio-datasets/en/voxceleb1/", - "meta_file_train": null, - "meta_file_val": null - }, - { - "name": "voxceleb2", - "path": "../../../audio-datasets/en/voxceleb2/", - "meta_file_train": null, - "meta_file_val": null - }, - { - "name": "common_voice", - "path": "../../../audio-datasets/en/MozillaCommonVoice", - "meta_file_train": "train.tsv", - "meta_file_val": "test.tsv" - } - ] -} \ No newline at end of file diff --git a/TTS/speaker_encoder/configs/config_resnet_angleproto.json b/TTS/speaker_encoder/configs/config_resnet_angleproto.json deleted file mode 100644 index c26d29ce..00000000 --- a/TTS/speaker_encoder/configs/config_resnet_angleproto.json +++ /dev/null @@ -1,956 +0,0 @@ -{ - "model": "speaker_encoder", - "run_name": "speaker_encoder", - "run_description": "resnet speaker encoder trained with commonvoice all languages dev and train, Voxceleb 1 dev and Voxceleb 2 dev", - // AUDIO PARAMETERS - "audio":{ - // Audio processing parameters - "num_mels": 80, // size of the mel spec frame. - "fft_size": 1024, // number of stft frequency levels. Size of the linear spectogram frame. - "sample_rate": 16000, // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled. - "win_length": 1024, // stft window length in ms. 
- "hop_length": 256, // stft window hop-lengh in ms. - "frame_length_ms": null, // stft window length in ms.If null, 'win_length' is used. - "frame_shift_ms": null, // stft window hop-lengh in ms. If null, 'hop_length' is used. - "preemphasis": 0.98, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis. - "min_level_db": -100, // normalization range - "ref_level_db": 20, // reference level db, theoretically 20db is the sound of air. - "power": 1.5, // value to sharpen wav signals after GL algorithm. - "griffin_lim_iters": 60,// #griffin-lim iterations. 30-60 is a good range. Larger the value, slower the generation. - "stft_pad_mode": "reflect", - // Normalization parameters - "signal_norm": true, // normalize the spec values in range [0, 1] - "symmetric_norm": true, // move normalization to range [-1, 1] - "max_norm": 4.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm] - "clip_norm": true, // clip normalized values into the range. - "mel_fmin": 0.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!! - "mel_fmax": 8000.0, // maximum freq level for mel-spec. Tune for dataset!! - "spec_gain": 20.0, - "do_trim_silence": false, // enable trimming of slience of audio as you load it. LJspeech (false), TWEB (false), Nancy (true) - "trim_db": 60, // threshold for timming silence. Set this according to your dataset. - "stats_path": null // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based notmalization is used and other normalization params are ignored - }, - "reinit_layers": [], - - "loss": "angleproto", // "ge2e" to use Generalized End-to-End loss, "angleproto" to use Angular Prototypical loss and "softmaxproto" to use Softmax with Angular Prototypical loss - "grad_clip": 3.0, // upper limit for gradients for clipping. - "max_train_step": 1000000, // total number of steps to train. - "lr": 0.0001, // Initial learning rate. If Noam decay is active, maximum learning rate. - "lr_decay": false, // if true, Noam learning rate decaying is applied through training. - "warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr" - "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging. - "steps_plot_stats": 100, // number of steps to plot embeddings. - - // Speakers config - "num_speakers_in_batch": 200, // Batch size for training. - "num_utters_per_speaker": 2, // - "skip_speakers": true, // skip speakers with samples less than "num_utters_per_speaker" - "voice_len": 2, // number of seconds for each training instance - - "num_loader_workers": 4, // number of training data loader processes. Don't set it too big. 4-8 are good values. - "wd": 0.000001, // Weight decay weight. - "checkpoint": true, // If true, it saves checkpoints per "save_step" - "save_step": 1000, // Number of training steps expected to save the best checkpoints in training. - "print_step": 50, // Number of steps to log traning on console. - "output_path": "../checkpoints/speaker_encoder/angleproto/resnet_voxceleb1_and_voxceleb2-and-common-voice-all-using-angleproto/", // DATASET-RELATED: output path for all training outputs. 
- - "audio_augmentation": { - "p": 0.5, // propability of apply this method, 0 is disable rir and additive noise augmentation - "rir":{ - "rir_path": "/workspace/store/ecasanova/ComParE/RIRS_NOISES/simulated_rirs/", - "conv_mode": "full" - }, - "additive":{ - "sounds_path": "/workspace/store/ecasanova/ComParE/musan/", - // list of each of the directories in your data augmentation, if a directory is in "sounds_path" but is not listed here it will be ignored - "speech":{ - "min_snr_in_db": 13, - "max_snr_in_db": 20, - "min_num_noises": 2, - "max_num_noises": 3 - }, - "noise":{ - "min_snr_in_db": 0, - "max_snr_in_db": 15, - "min_num_noises": 1, - "max_num_noises": 1 - }, - "music":{ - "min_snr_in_db": 5, - "max_snr_in_db": 15, - "min_num_noises": 1, - "max_num_noises": 1 - } - }, - //add a gaussian noise to the data in order to increase robustness - "gaussian":{ // as the insertion of Gaussian noise is quick to be calculated, we added it after loading the wav file, this way, even audios that were reused with the cache can receive this noise - "p": 0.5, // propability of apply this method, 0 is disable - "min_amplitude": 0.0, - "max_amplitude": 1e-5 - } - }, - "model_params": { - "model_name": "resnet", - "input_dim": 80, - "proj_dim": 512 - }, - "storage": { - "sample_from_storage_p": 0.5, // the probability with which we'll sample from the DataSet in-memory storage - "storage_size": 35 // the size of the in-memory storage with respect to a single batch - }, - "datasets": - [ - { - "name": "voxceleb2", - "path": "/workspace/scratch/ecasanova/datasets/VoxCeleb/vox2_dev_aac/", - "meta_file_train": null, - "meta_file_val": null - }, - { - "name": "voxceleb1", - "path": "/workspace/scratch/ecasanova/datasets/VoxCeleb/vox1_dev_wav/", - "meta_file_train": null, - "meta_file_val": null - }, - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/fi", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/fi", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/zh-CN", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/zh-CN", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/rm-sursilv", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/rm-sursilv", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/lt", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/lt", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ka", - "meta_file_train": "train.tsv", - "meta_file_val": 
null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ka", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/sv-SE", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/sv-SE", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/pl", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/pl", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ru", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ru", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/mn", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/mn", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/nl", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/nl", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/sl", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/sl", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/es", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/es", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/pt", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/pt", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/hi", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": 
"/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/hi", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ja", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ja", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ia", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ia", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/br", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/br", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/id", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/id", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/dv", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/dv", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ta", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ta", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/or", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/or", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/zh-HK", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/zh-HK", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/de", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/de", - 
"meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/uk", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/uk", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/en", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/en", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/fa", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/fa", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/vi", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/vi", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ab", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ab", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/sah", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/sah", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/vot", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/vot", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/fr", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/fr", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/tr", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/tr", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - 
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/lg", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/lg", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/mt", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/mt", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/rw", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/rw", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/hu", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/hu", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/rm-vallader", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/rm-vallader", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/el", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/el", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/tt", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/tt", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/zh-TW", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/zh-TW", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/et", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/et", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": 
"/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/fy-NL", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/fy-NL", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/cs", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/cs", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/as", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/as", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ro", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ro", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/eo", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/eo", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/pa-IN", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/pa-IN", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/th", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/th", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/it", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/it", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ga-IE", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ga-IE", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": 
"/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/cnh", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/cnh", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ky", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ky", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ar", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ar", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/eu", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/eu", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ca", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ca", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/kab", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/kab", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/cy", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/cy", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/cv", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/cv", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/hsb", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/hsb", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/lv", - 
"meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/lv", - "meta_file_train": "dev.tsv", - "meta_file_val": null - } - - ] -} \ No newline at end of file diff --git a/TTS/speaker_encoder/configs/config_resnet_softmax_angleproto.json b/TTS/speaker_encoder/configs/config_resnet_softmax_angleproto.json deleted file mode 100644 index ccbd751a..00000000 --- a/TTS/speaker_encoder/configs/config_resnet_softmax_angleproto.json +++ /dev/null @@ -1,957 +0,0 @@ - -{ - "model": "speaker_encoder", - "run_name": "speaker_encoder", - "run_description": "resnet speaker encoder trained with commonvoice all languages dev and train, Voxceleb 1 dev and Voxceleb 2 dev", - // AUDIO PARAMETERS - "audio":{ - // Audio processing parameters - "num_mels": 80, // size of the mel spec frame. - "fft_size": 1024, // number of stft frequency levels. Size of the linear spectogram frame. - "sample_rate": 16000, // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled. - "win_length": 1024, // stft window length in ms. - "hop_length": 256, // stft window hop-lengh in ms. - "frame_length_ms": null, // stft window length in ms.If null, 'win_length' is used. - "frame_shift_ms": null, // stft window hop-lengh in ms. If null, 'hop_length' is used. - "preemphasis": 0.98, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis. - "min_level_db": -100, // normalization range - "ref_level_db": 20, // reference level db, theoretically 20db is the sound of air. - "power": 1.5, // value to sharpen wav signals after GL algorithm. - "griffin_lim_iters": 60,// #griffin-lim iterations. 30-60 is a good range. Larger the value, slower the generation. - "stft_pad_mode": "reflect", - // Normalization parameters - "signal_norm": true, // normalize the spec values in range [0, 1] - "symmetric_norm": true, // move normalization to range [-1, 1] - "max_norm": 4.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm] - "clip_norm": true, // clip normalized values into the range. - "mel_fmin": 0.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!! - "mel_fmax": 8000.0, // maximum freq level for mel-spec. Tune for dataset!! - "spec_gain": 20.0, - "do_trim_silence": false, // enable trimming of slience of audio as you load it. LJspeech (false), TWEB (false), Nancy (true) - "trim_db": 60, // threshold for timming silence. Set this according to your dataset. - "stats_path": null // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based notmalization is used and other normalization params are ignored - }, - "reinit_layers": [], - - "loss": "softmaxproto", // "ge2e" to use Generalized End-to-End loss, "angleproto" to use Angular Prototypical loss and "softmaxproto" to use Softmax with Angular Prototypical loss - "grad_clip": 3.0, // upper limit for gradients for clipping. - "max_train_step": 1000000, // total number of steps to train. - "lr": 0.0001, // Initial learning rate. If Noam decay is active, maximum learning rate. - "lr_decay": false, // if true, Noam learning rate decaying is applied through training. - "warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr" - "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. 
Might be memory consuming, but good for debugging. - "steps_plot_stats": 100, // number of steps to plot embeddings. - - // Speakers config - "num_speakers_in_batch": 200, // Batch size for training. - "num_utters_per_speaker": 2, // - "skip_speakers": true, // skip speakers with samples less than "num_utters_per_speaker" - "voice_len": 2, // number of seconds for each training instance - - "num_loader_workers": 8, // number of training data loader processes. Don't set it too big. 4-8 are good values. - "wd": 0.000001, // Weight decay weight. - "checkpoint": true, // If true, it saves checkpoints per "save_step" - "save_step": 1000, // Number of training steps expected to save the best checkpoints in training. - "print_step": 50, // Number of steps to log traning on console. - "output_path": "../../../checkpoints/speaker_encoder/resnet_voxceleb1_and_voxceleb2-and-common-voice-all/", // DATASET-RELATED: output path for all training outputs. - - "audio_augmentation": { - "p": 0.5, // propability of apply this method, 0 is disable rir and additive noise augmentation - "rir":{ - "rir_path": "/workspace/store/ecasanova/ComParE/RIRS_NOISES/simulated_rirs/", - "conv_mode": "full" - }, - "additive":{ - "sounds_path": "/workspace/store/ecasanova/ComParE/musan/", - // list of each of the directories in your data augmentation, if a directory is in "sounds_path" but is not listed here it will be ignored - "speech":{ - "min_snr_in_db": 13, - "max_snr_in_db": 20, - "min_num_noises": 2, - "max_num_noises": 3 - }, - "noise":{ - "min_snr_in_db": 0, - "max_snr_in_db": 15, - "min_num_noises": 1, - "max_num_noises": 1 - }, - "music":{ - "min_snr_in_db": 5, - "max_snr_in_db": 15, - "min_num_noises": 1, - "max_num_noises": 1 - } - }, - //add a gaussian noise to the data in order to increase robustness - "gaussian":{ // as the insertion of Gaussian noise is quick to be calculated, we added it after loading the wav file, this way, even audios that were reused with the cache can receive this noise - "p": 0.5, // propability of apply this method, 0 is disable - "min_amplitude": 0.0, - "max_amplitude": 1e-5 - } - }, - "model_params": { - "model_name": "resnet", - "input_dim": 80, - "proj_dim": 512 - }, - "storage": { - "sample_from_storage_p": 0.66, // the probability with which we'll sample from the DataSet in-memory storage - "storage_size": 35 // the size of the in-memory storage with respect to a single batch - }, - "datasets": - [ - { - "name": "voxceleb2", - "path": "/workspace/scratch/ecasanova/datasets/VoxCeleb/vox2_dev_aac/", - "meta_file_train": null, - "meta_file_val": null - }, - { - "name": "voxceleb1", - "path": "/workspace/scratch/ecasanova/datasets/VoxCeleb/vox1_dev_wav/", - "meta_file_train": null, - "meta_file_val": null - }, - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/fi", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/fi", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/zh-CN", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/zh-CN", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": 
"common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/rm-sursilv", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/rm-sursilv", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/lt", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/lt", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ka", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ka", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/sv-SE", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/sv-SE", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/pl", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/pl", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ru", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ru", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/mn", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/mn", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/nl", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/nl", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/sl", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/sl", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": 
"/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/es", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/es", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/pt", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/pt", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/hi", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/hi", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ja", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ja", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ia", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ia", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/br", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/br", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/id", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/id", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/dv", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/dv", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ta", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ta", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/or", - 
"meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/or", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/zh-HK", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/zh-HK", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/de", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/de", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/uk", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/uk", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/en", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/en", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/fa", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/fa", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/vi", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/vi", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ab", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ab", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/sah", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/sah", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/vot", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": 
"common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/vot", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/fr", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/fr", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/tr", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/tr", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/lg", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/lg", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/mt", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/mt", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/rw", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/rw", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/hu", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/hu", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/rm-vallader", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/rm-vallader", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/el", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/el", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/tt", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": 
"/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/tt", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/zh-TW", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/zh-TW", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/et", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/et", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/fy-NL", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/fy-NL", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/cs", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/cs", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/as", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/as", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ro", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ro", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/eo", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/eo", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/pa-IN", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/pa-IN", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/th", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": 
"/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/th", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/it", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/it", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ga-IE", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ga-IE", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/cnh", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/cnh", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ky", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ky", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ar", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ar", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/eu", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/eu", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ca", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ca", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/kab", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/kab", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/cy", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/cy", 
- "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/cv", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/cv", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/hsb", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/hsb", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/lv", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/lv", - "meta_file_train": "dev.tsv", - "meta_file_val": null - } - - ] -} \ No newline at end of file diff --git a/TTS/speaker_encoder/dataset.py b/TTS/speaker_encoder/dataset.py deleted file mode 100644 index 07fa9246..00000000 --- a/TTS/speaker_encoder/dataset.py +++ /dev/null @@ -1,243 +0,0 @@ -import random - -import numpy as np -import torch -from torch.utils.data import Dataset - -from TTS.speaker_encoder.utils.generic_utils import AugmentWAV, Storage - - -class SpeakerEncoderDataset(Dataset): - def __init__( - self, - ap, - meta_data, - voice_len=1.6, - num_speakers_in_batch=64, - storage_size=1, - sample_from_storage_p=0.5, - num_utter_per_speaker=10, - skip_speakers=False, - verbose=False, - augmentation_config=None, - use_torch_spec=None, - ): - """ - Args: - ap (TTS.tts.utils.AudioProcessor): audio processor object. - meta_data (list): list of dataset instances. - seq_len (int): voice segment length in seconds. - verbose (bool): print diagnostic information. 
- """ - super().__init__() - self.items = meta_data - self.sample_rate = ap.sample_rate - self.seq_len = int(voice_len * self.sample_rate) - self.num_speakers_in_batch = num_speakers_in_batch - self.num_utter_per_speaker = num_utter_per_speaker - self.skip_speakers = skip_speakers - self.ap = ap - self.verbose = verbose - self.use_torch_spec = use_torch_spec - self.__parse_items() - storage_max_size = storage_size * num_speakers_in_batch - self.storage = Storage( - maxsize=storage_max_size, storage_batchs=storage_size, num_speakers_in_batch=num_speakers_in_batch - ) - self.sample_from_storage_p = float(sample_from_storage_p) - - speakers_aux = list(self.speakers) - speakers_aux.sort() - self.speakerid_to_classid = {key: i for i, key in enumerate(speakers_aux)} - - # Augmentation - self.augmentator = None - self.gaussian_augmentation_config = None - if augmentation_config: - self.data_augmentation_p = augmentation_config["p"] - if self.data_augmentation_p and ("additive" in augmentation_config or "rir" in augmentation_config): - self.augmentator = AugmentWAV(ap, augmentation_config) - - if "gaussian" in augmentation_config.keys(): - self.gaussian_augmentation_config = augmentation_config["gaussian"] - - if self.verbose: - print("\n > DataLoader initialization") - print(f" | > Speakers per Batch: {num_speakers_in_batch}") - print(f" | > Storage Size: {storage_max_size} instances, each with {num_utter_per_speaker} utters") - print(f" | > Sample_from_storage_p : {self.sample_from_storage_p}") - print(f" | > Number of instances : {len(self.items)}") - print(f" | > Sequence length: {self.seq_len}") - print(f" | > Num speakers: {len(self.speakers)}") - - def load_wav(self, filename): - audio = self.ap.load_wav(filename, sr=self.ap.sample_rate) - return audio - - def __parse_items(self): - self.speaker_to_utters = {} - for i in self.items: - path_ = i["audio_file"] - speaker_ = i["speaker_name"] - if speaker_ in self.speaker_to_utters.keys(): - self.speaker_to_utters[speaker_].append(path_) - else: - self.speaker_to_utters[speaker_] = [ - path_, - ] - - if self.skip_speakers: - self.speaker_to_utters = { - k: v for (k, v) in self.speaker_to_utters.items() if len(v) >= self.num_utter_per_speaker - } - - self.speakers = [k for (k, v) in self.speaker_to_utters.items()] - - def __len__(self): - return int(1e10) - - def get_num_speakers(self): - return len(self.speakers) - - def __sample_speaker(self, ignore_speakers=None): - speaker = random.sample(self.speakers, 1)[0] - # if list of speakers_id is provide make sure that it's will be ignored - if ignore_speakers and self.speakerid_to_classid[speaker] in ignore_speakers: - while True: - speaker = random.sample(self.speakers, 1)[0] - if self.speakerid_to_classid[speaker] not in ignore_speakers: - break - - if self.num_utter_per_speaker > len(self.speaker_to_utters[speaker]): - utters = random.choices(self.speaker_to_utters[speaker], k=self.num_utter_per_speaker) - else: - utters = random.sample(self.speaker_to_utters[speaker], self.num_utter_per_speaker) - return speaker, utters - - def __sample_speaker_utterances(self, speaker): - """ - Sample all M utterances for the given speaker. 
- """ - wavs = [] - labels = [] - for _ in range(self.num_utter_per_speaker): - # TODO:dummy but works - while True: - # remove speakers that have num_utter less than 2 - if len(self.speaker_to_utters[speaker]) > 1: - utter = random.sample(self.speaker_to_utters[speaker], 1)[0] - else: - if speaker in self.speakers: - self.speakers.remove(speaker) - - speaker, _ = self.__sample_speaker() - continue - - wav = self.load_wav(utter) - if wav.shape[0] - self.seq_len > 0: - break - - if utter in self.speaker_to_utters[speaker]: - self.speaker_to_utters[speaker].remove(utter) - - if self.augmentator is not None and self.data_augmentation_p: - if random.random() < self.data_augmentation_p: - wav = self.augmentator.apply_one(wav) - - wavs.append(wav) - labels.append(self.speakerid_to_classid[speaker]) - return wavs, labels - - def __getitem__(self, idx): - speaker, _ = self.__sample_speaker() - speaker_id = self.speakerid_to_classid[speaker] - return speaker, speaker_id - - def __load_from_disk_and_storage(self, speaker): - # don't sample from storage, but from HDD - wavs_, labels_ = self.__sample_speaker_utterances(speaker) - # put the newly loaded item into storage - self.storage.append((wavs_, labels_)) - return wavs_, labels_ - - def collate_fn(self, batch): - # get the batch speaker_ids - batch = np.array(batch) - speakers_id_in_batch = set(batch[:, 1].astype(np.int32)) - - labels = [] - feats = [] - speakers = set() - - for speaker, speaker_id in batch: - speaker_id = int(speaker_id) - - # ensure that an speaker appears only once in the batch - if speaker_id in speakers: - - # remove current speaker - if speaker_id in speakers_id_in_batch: - speakers_id_in_batch.remove(speaker_id) - - speaker, _ = self.__sample_speaker(ignore_speakers=speakers_id_in_batch) - speaker_id = self.speakerid_to_classid[speaker] - speakers_id_in_batch.add(speaker_id) - - if random.random() < self.sample_from_storage_p and self.storage.full(): - # sample from storage (if full) - wavs_, labels_ = self.storage.get_random_sample_fast() - - # force choose the current speaker or other not in batch - # It's necessary for ideal training with AngleProto and GE2E losses - if labels_[0] in speakers_id_in_batch and labels_[0] != speaker_id: - attempts = 0 - while True: - wavs_, labels_ = self.storage.get_random_sample_fast() - if labels_[0] == speaker_id or labels_[0] not in speakers_id_in_batch: - break - - attempts += 1 - # Try 5 times after that load from disk - if attempts >= 5: - wavs_, labels_ = self.__load_from_disk_and_storage(speaker) - break - else: - # don't sample from storage, but from HDD - wavs_, labels_ = self.__load_from_disk_and_storage(speaker) - - # append speaker for control - speakers.add(labels_[0]) - - # remove current speaker and append other - if speaker_id in speakers_id_in_batch: - speakers_id_in_batch.remove(speaker_id) - - speakers_id_in_batch.add(labels_[0]) - - # get a random subset of each of the wavs and extract mel spectrograms. 
- feats_ = [] - for wav in wavs_: - offset = random.randint(0, wav.shape[0] - self.seq_len) - wav = wav[offset : offset + self.seq_len] - # add random gaussian noise - if self.gaussian_augmentation_config and self.gaussian_augmentation_config["p"]: - if random.random() < self.gaussian_augmentation_config["p"]: - wav += np.random.normal( - self.gaussian_augmentation_config["min_amplitude"], - self.gaussian_augmentation_config["max_amplitude"], - size=len(wav), - ) - - if not self.use_torch_spec: - mel = self.ap.melspectrogram(wav) - feats_.append(torch.FloatTensor(mel)) - else: - feats_.append(torch.FloatTensor(wav)) - - labels.append(torch.LongTensor(labels_)) - feats.extend(feats_) - - feats = torch.stack(feats) - labels = torch.stack(labels) - - return feats, labels diff --git a/TTS/speaker_encoder/models/lstm.py b/TTS/speaker_encoder/models/lstm.py deleted file mode 100644 index ec394cdb..00000000 --- a/TTS/speaker_encoder/models/lstm.py +++ /dev/null @@ -1,189 +0,0 @@ -import numpy as np -import torch -import torchaudio -from torch import nn - -from TTS.speaker_encoder.models.resnet import PreEmphasis -from TTS.utils.io import load_fsspec - - -class LSTMWithProjection(nn.Module): - def __init__(self, input_size, hidden_size, proj_size): - super().__init__() - self.input_size = input_size - self.hidden_size = hidden_size - self.proj_size = proj_size - self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True) - self.linear = nn.Linear(hidden_size, proj_size, bias=False) - - def forward(self, x): - self.lstm.flatten_parameters() - o, (_, _) = self.lstm(x) - return self.linear(o) - - -class LSTMWithoutProjection(nn.Module): - def __init__(self, input_dim, lstm_dim, proj_dim, num_lstm_layers): - super().__init__() - self.lstm = nn.LSTM(input_size=input_dim, hidden_size=lstm_dim, num_layers=num_lstm_layers, batch_first=True) - self.linear = nn.Linear(lstm_dim, proj_dim, bias=True) - self.relu = nn.ReLU() - - def forward(self, x): - _, (hidden, _) = self.lstm(x) - return self.relu(self.linear(hidden[-1])) - - -class LSTMSpeakerEncoder(nn.Module): - def __init__( - self, - input_dim, - proj_dim=256, - lstm_dim=768, - num_lstm_layers=3, - use_lstm_with_projection=True, - use_torch_spec=False, - audio_config=None, - ): - super().__init__() - self.use_lstm_with_projection = use_lstm_with_projection - self.use_torch_spec = use_torch_spec - self.audio_config = audio_config - self.proj_dim = proj_dim - - layers = [] - # choise LSTM layer - if use_lstm_with_projection: - layers.append(LSTMWithProjection(input_dim, lstm_dim, proj_dim)) - for _ in range(num_lstm_layers - 1): - layers.append(LSTMWithProjection(proj_dim, lstm_dim, proj_dim)) - self.layers = nn.Sequential(*layers) - else: - self.layers = LSTMWithoutProjection(input_dim, lstm_dim, proj_dim, num_lstm_layers) - - self.instancenorm = nn.InstanceNorm1d(input_dim) - - if self.use_torch_spec: - self.torch_spec = torch.nn.Sequential( - PreEmphasis(audio_config["preemphasis"]), - # TorchSTFT( - # n_fft=audio_config["fft_size"], - # hop_length=audio_config["hop_length"], - # win_length=audio_config["win_length"], - # sample_rate=audio_config["sample_rate"], - # window="hamming_window", - # mel_fmin=0.0, - # mel_fmax=None, - # use_htk=True, - # do_amp_to_db=False, - # n_mels=audio_config["num_mels"], - # power=2.0, - # use_mel=True, - # mel_norm=None, - # ) - torchaudio.transforms.MelSpectrogram( - sample_rate=audio_config["sample_rate"], - n_fft=audio_config["fft_size"], - win_length=audio_config["win_length"], - 
hop_length=audio_config["hop_length"], - window_fn=torch.hamming_window, - n_mels=audio_config["num_mels"], - ), - ) - else: - self.torch_spec = None - - self._init_layers() - - def _init_layers(self): - for name, param in self.layers.named_parameters(): - if "bias" in name: - nn.init.constant_(param, 0.0) - elif "weight" in name: - nn.init.xavier_normal_(param) - - def forward(self, x, l2_norm=True): - """Forward pass of the model. - - Args: - x (Tensor): Raw waveform signal or spectrogram frames. If input is a waveform, `torch_spec` must be `True` - to compute the spectrogram on-the-fly. - l2_norm (bool): Whether to L2-normalize the outputs. - - Shapes: - - x: :math:`(N, 1, T_{in})` or :math:`(N, D_{spec}, T_{in})` - """ - with torch.no_grad(): - with torch.cuda.amp.autocast(enabled=False): - if self.use_torch_spec: - x.squeeze_(1) - x = self.torch_spec(x) - x = self.instancenorm(x).transpose(1, 2) - d = self.layers(x) - if self.use_lstm_with_projection: - d = d[:, -1] - if l2_norm: - d = torch.nn.functional.normalize(d, p=2, dim=1) - return d - - @torch.no_grad() - def inference(self, x, l2_norm=True): - d = self.forward(x, l2_norm=l2_norm) - return d - - def compute_embedding(self, x, num_frames=250, num_eval=10, return_mean=True): - """ - Generate embeddings for a batch of utterances - x: 1xTxD - """ - max_len = x.shape[1] - - if max_len < num_frames: - num_frames = max_len - - offsets = np.linspace(0, max_len - num_frames, num=num_eval) - - frames_batch = [] - for offset in offsets: - offset = int(offset) - end_offset = int(offset + num_frames) - frames = x[:, offset:end_offset] - frames_batch.append(frames) - - frames_batch = torch.cat(frames_batch, dim=0) - embeddings = self.inference(frames_batch) - - if return_mean: - embeddings = torch.mean(embeddings, dim=0, keepdim=True) - - return embeddings - - def batch_compute_embedding(self, x, seq_lens, num_frames=160, overlap=0.5): - """ - Generate embeddings for a batch of utterances - x: BxTxD - """ - num_overlap = num_frames * overlap - max_len = x.shape[1] - embed = None - num_iters = seq_lens / (num_frames - num_overlap) - cur_iter = 0 - for offset in range(0, max_len, num_frames - num_overlap): - cur_iter += 1 - end_offset = min(x.shape[1], offset + num_frames) - frames = x[:, offset:end_offset] - if embed is None: - embed = self.inference(frames) - else: - embed[cur_iter <= num_iters, :] += self.inference(frames[cur_iter <= num_iters, :, :]) - return embed / num_iters - - # pylint: disable=unused-argument, redefined-builtin - def load_checkpoint(self, config: dict, checkpoint_path: str, eval: bool = False, use_cuda: bool = False): - state = load_fsspec(checkpoint_path, map_location=torch.device("cpu")) - self.load_state_dict(state["model"]) - if use_cuda: - self.cuda() - if eval: - self.eval() - assert not self.training diff --git a/TTS/speaker_encoder/umap.png b/TTS/speaker_encoder/umap.png deleted file mode 100644 index ca8aefeac8cbe616983b35e968c9c9133eb41ede..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 24616 zcma&OWmHvR*EYHd38hm)K^g(+PU#To?(UH8P`bNAx=TuGBhuZufh{229h*1{pZER7 z8Q*z+9L8n<8`fI)J@1*sWp0s;sGcY^{CyptYL9146v zb(Gd}27xfco0vDH=KF8G)!m|%zo zhVni_tl}HSUCdBeK@86scs*+DW^Dg^Kum!uHnw7%FGC!G_LA}rxKJ*maPc~`jd@$( zm~4A%YfEvdSU!{E`uag^+%sqUtx$=4qCt+DH2p_$aq*-iJQFx%}u%%7MmRmsBq|+m^jsS|6cgM48rP*K#e%& zucMxY(aa8iJ?5W`uDEW59DXjD*5s4yxmi?@Mme35W3JL|nbQ0c|0-BCqnMWz!=x9n z_d0taVb2#KFd(0fD|1wn_2a28ZPS-a@R}nJKR-m%OIBF$W}Z;TT@(LYkdwReecC3A 
zXwf<;r0W9HyL$N(7sRQTt9M8e{{uUX%PjJ8+6Tk`77z z=4W7@f(b+Sh*Ym?gU^BmpzG$1CyMC?d*B6OVP(2d04hk5us2W+V{1u~*@?GI*;7$-#Y|p5=oM!$sCQubEj`>Yn!KUiSPxQBhHW-CNBFG=0u3^__#T v`K&)xvQ7+L3|h!*T71O+;zRzrLbi<(A List[List[str]]: # pylint: disable=unused-argument """Normalizes the Baker meta data file to TTS format diff --git a/TTS/tts/utils/speakers.py b/TTS/tts/utils/speakers.py index c15a3abf..1a5da94a 100644 --- a/TTS/tts/utils/speakers.py +++ b/TTS/tts/utils/speakers.py @@ -9,7 +9,7 @@ import torch from coqpit import Coqpit from TTS.config import get_from_config_or_model_args_with_default, load_config -from TTS.speaker_encoder.utils.generic_utils import setup_speaker_encoder_model +from TTS.encoder.utils.generic_utils import setup_speaker_encoder_model from TTS.utils.audio import AudioProcessor @@ -269,7 +269,7 @@ class SpeakerManager: """ self.speaker_encoder_config = load_config(config_path) self.speaker_encoder = setup_speaker_encoder_model(self.speaker_encoder_config) - self.speaker_encoder.load_checkpoint(config_path, model_path, eval=True, use_cuda=self.use_cuda) + self.speaker_encoder_criterion = self.speaker_encoder.load_checkpoint(self.speaker_encoder_config, model_path, eval=True, use_cuda=self.use_cuda) self.speaker_encoder_ap = AudioProcessor(**self.speaker_encoder_config.audio) def compute_d_vector_from_clip(self, wav_file: Union[str, List[str]]) -> list: diff --git a/tests/aux_tests/test_speaker_encoder.py b/tests/aux_tests/test_speaker_encoder.py index 97b3b92f..f2875cc1 100644 --- a/tests/aux_tests/test_speaker_encoder.py +++ b/tests/aux_tests/test_speaker_encoder.py @@ -3,9 +3,9 @@ import unittest import torch as T from tests import get_tests_input_path -from TTS.speaker_encoder.losses import AngleProtoLoss, GE2ELoss, SoftmaxAngleProtoLoss -from TTS.speaker_encoder.models.lstm import LSTMSpeakerEncoder -from TTS.speaker_encoder.models.resnet import ResNetSpeakerEncoder +from TTS.encoder.losses import AngleProtoLoss, GE2ELoss, SoftmaxAngleProtoLoss +from TTS.encoder.models.lstm import LSTMSpeakerEncoder +from TTS.encoder.models.resnet import ResNetSpeakerEncoder file_path = get_tests_input_path() diff --git a/tests/aux_tests/test_speaker_encoder_train.py b/tests/aux_tests/test_speaker_encoder_train.py index 7901fe5a..d9d6d71e 100644 --- a/tests/aux_tests/test_speaker_encoder_train.py +++ b/tests/aux_tests/test_speaker_encoder_train.py @@ -4,14 +4,14 @@ import shutil from tests import get_device_id, get_tests_output_path, run_cli from TTS.config.shared_configs import BaseAudioConfig -from TTS.speaker_encoder.speaker_encoder_config import SpeakerEncoderConfig +from TTS.encoder.configs.speaker_encoder_config import SpeakerEncoderConfig def run_test_train(): command = ( f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_encoder.py --config_path {config_path} " f"--coqpit.output_path {output_path} " - "--coqpit.datasets.0.name ljspeech " + "--coqpit.datasets.0.name ljspeech_test " "--coqpit.datasets.0.meta_file_train metadata.csv " "--coqpit.datasets.0.meta_file_val metadata.csv " "--coqpit.datasets.0.path tests/data/ljspeech " @@ -24,17 +24,21 @@ output_path = os.path.join(get_tests_output_path(), "train_outputs") config = SpeakerEncoderConfig( batch_size=4, - num_speakers_in_batch=1, - num_utters_per_speaker=10, - num_loader_workers=0, - max_train_step=2, + num_classes_in_batch=4, + num_utter_per_class=2, + eval_num_classes_in_batch=4, + eval_num_utter_per_class=2, + num_loader_workers=1, + epochs=1, print_step=1, - save_step=1, + save_step=2, print_eval=True, + run_eval=True, 
audio=BaseAudioConfig(num_mels=80), ) config.audio.do_trim_silence = True config.audio.trim_db = 60 +config.loss = "ge2e" config.save_json(config_path) print(config) @@ -69,14 +73,14 @@ run_cli(command_train) shutil.rmtree(continue_path) # test model with ge2e loss function -config.loss = "ge2e" -config.save_json(config_path) -run_test_train() +# config.loss = "ge2e" +# config.save_json(config_path) +# run_test_train() # test model with angleproto loss function -config.loss = "angleproto" -config.save_json(config_path) -run_test_train() +# config.loss = "angleproto" +# config.save_json(config_path) +# run_test_train() # test model with softmaxproto loss function config.loss = "softmaxproto" diff --git a/tests/aux_tests/test_speaker_manager.py b/tests/aux_tests/test_speaker_manager.py index fff49b13..5fafb56a 100644 --- a/tests/aux_tests/test_speaker_manager.py +++ b/tests/aux_tests/test_speaker_manager.py @@ -6,8 +6,8 @@ import torch from tests import get_tests_input_path from TTS.config import load_config -from TTS.speaker_encoder.utils.generic_utils import setup_speaker_encoder_model -from TTS.speaker_encoder.utils.io import save_checkpoint +from TTS.encoder.utils.generic_utils import setup_speaker_encoder_model +from TTS.encoder.utils.io import save_checkpoint from TTS.tts.utils.speakers import SpeakerManager from TTS.utils.audio import AudioProcessor diff --git a/tests/data_tests/test_samplers.py b/tests/data_tests/test_samplers.py index 12152fb8..c888c629 100644 --- a/tests/data_tests/test_samplers.py +++ b/tests/data_tests/test_samplers.py @@ -8,6 +8,7 @@ from TTS.config.shared_configs import BaseDatasetConfig from TTS.tts.datasets import load_tts_samples from TTS.tts.utils.languages import get_language_balancer_weights from TTS.tts.utils.speakers import get_speaker_balancer_weights +from TTS.encoder.utils.samplers import PerfectBatchSampler # Fixing random state to avoid random fails torch.manual_seed(0) @@ -82,3 +83,51 @@ class TestSamplers(unittest.TestCase): spk2 += 1 assert is_balanced(spk1, spk2), "Speaker Weighted sampler is supposed to be balanced" + + def test_perfect_sampler(self): # pylint: disable=no-self-use + classes = set() + for item in train_samples: + classes.add(item["speaker_name"]) + + sampler = PerfectBatchSampler( + train_samples, + classes, + batch_size=2 * 3, # total batch size + num_classes_in_batch=2, + label_key="speaker_name", + shuffle=False, + drop_last=True) + batchs = functools.reduce(lambda a, b: a + b, [list(sampler) for i in range(100)]) + for batch in batchs: + spk1, spk2 = 0, 0 + # for in each batch + for index in batch: + if train_samples[index]["speaker_name"] == "ljspeech-0": + spk1 += 1 + else: + spk2 += 1 + assert spk1 == spk2, "PerfectBatchSampler is supposed to be perfectly balanced" + + def test_perfect_sampler_shuffle(self): # pylint: disable=no-self-use + classes = set() + for item in train_samples: + classes.add(item["speaker_name"]) + + sampler = PerfectBatchSampler( + train_samples, + classes, + batch_size=2 * 3, # total batch size + num_classes_in_batch=2, + label_key="speaker_name", + shuffle=True, + drop_last=False) + batchs = functools.reduce(lambda a, b: a + b, [list(sampler) for i in range(100)]) + for batch in batchs: + spk1, spk2 = 0, 0 + # for in each batch + for index in batch: + if train_samples[index]["speaker_name"] == "ljspeech-0": + spk1 += 1 + else: + spk2 += 1 + assert spk1 == spk2, "PerfectBatchSampler is supposed to be perfectly balanced" diff --git a/tests/inputs/test_glow_tts.json 
b/tests/inputs/test_glow_tts.json index 6dd86057..64b09828 100644 --- a/tests/inputs/test_glow_tts.json +++ b/tests/inputs/test_glow_tts.json @@ -66,8 +66,8 @@ "use_mas": false, // use Monotonic Alignment Search if true. Otherwise use pre-computed attention alignments. // TRAINING - "batch_size": 2, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'. - "eval_batch_size":1, + "batch_size": 8, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'. + "eval_batch_size": 8, "r": 1, // Number of decoder frames to predict per iteration. Set the initial values if gradual training is enabled. "loss_masking": true, // enable / disable loss masking against the sequence padding. "data_dep_init_iter": 1, diff --git a/tests/inputs/test_speaker_encoder_config.json b/tests/inputs/test_speaker_encoder_config.json index 09a2f6a4..bfcc17ab 100644 --- a/tests/inputs/test_speaker_encoder_config.json +++ b/tests/inputs/test_speaker_encoder_config.json @@ -36,8 +36,8 @@ "warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr" "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging. "steps_plot_stats": 10, // number of steps to plot embeddings. - "num_speakers_in_batch": 64, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'. - "num_utters_per_speaker": 10, // + "num_classes_in_batch": 64, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'. + "num_utter_per_class": 10, // "num_loader_workers": 8, // number of training data loader processes. Don't set it too big. 4-8 are good values. "wd": 0.000001, // Weight decay weight. "checkpoint": true, // If true, it saves checkpoints per "save_step" diff --git a/tests/inputs/test_tacotron2_config.json b/tests/inputs/test_tacotron2_config.json index 6c82891d..69b23560 100644 --- a/tests/inputs/test_tacotron2_config.json +++ b/tests/inputs/test_tacotron2_config.json @@ -61,8 +61,8 @@ "reinit_layers": [], // give a list of layer names to restore from the given checkpoint. If not defined, it reloads all heuristically matching layers. // TRAINING - "batch_size": 1, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'. - "eval_batch_size":1, + "batch_size": 8, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'. + "eval_batch_size": 8, "r": 7, // Number of decoder frames to predict per iteration. Set the initial values if gradual training is enabled. "gradual_training": [[0, 7, 4], [1, 5, 2]], //set gradual training steps [first_step, r, batch_size]. If it is null, gradual training is disabled. For Tacotron, you might need to reduce the 'batch_size' as you proceeed. "loss_masking": true, // enable / disable loss masking against the sequence padding. diff --git a/tests/inputs/test_tacotron_config.json b/tests/inputs/test_tacotron_config.json index b60ed35e..90e07fc7 100644 --- a/tests/inputs/test_tacotron_config.json +++ b/tests/inputs/test_tacotron_config.json @@ -61,8 +61,8 @@ "reinit_layers": [], // give a list of layer names to restore from the given checkpoint. If not defined, it reloads all heuristically matching layers. 
// TRAINING - "batch_size": 1, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'. - "eval_batch_size":1, + "batch_size": 8, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'. + "eval_batch_size": 8, "r": 7, // Number of decoder frames to predict per iteration. Set the initial values if gradual training is enabled. "gradual_training": [[0, 7, 4], [1, 5, 2]], //set gradual training steps [first_step, r, batch_size]. If it is null, gradual training is disabled. For Tacotron, you might need to reduce the 'batch_size' as you proceeed. "loss_masking": true, // enable / disable loss masking against the sequence padding. diff --git a/tests/tts_tests/test_vits.py b/tests/tts_tests/test_vits.py index 384234e5..81d2ebbd 100644 --- a/tests/tts_tests/test_vits.py +++ b/tests/tts_tests/test_vits.py @@ -7,7 +7,7 @@ from trainer.logging.tensorboard_logger import TensorboardLogger from tests import assertHasAttr, assertHasNotAttr, get_tests_data_path, get_tests_input_path, get_tests_output_path from TTS.config import load_config -from TTS.speaker_encoder.utils.generic_utils import setup_speaker_encoder_model +from TTS.encoder.utils.generic_utils import setup_speaker_encoder_model from TTS.tts.configs.vits_config import VitsConfig from TTS.tts.models.vits import Vits, VitsArgs, amp_to_db, db_to_amp, load_audio, spec_to_mel, wav_to_mel, wav_to_spec from TTS.tts.utils.speakers import SpeakerManager From 24b57f6a0e45b2b6b502f8bb679ff506494c2e47 Mon Sep 17 00:00:00 2001 From: WeberJulian Date: Wed, 16 Mar 2022 11:51:37 +0100 Subject: [PATCH 12/38] Fix typo workflow text (#1403) --- .github/workflows/text_tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/text_tests.yml b/.github/workflows/text_tests.yml index e06a25ad..66197e0b 100644 --- a/.github/workflows/text_tests.yml +++ b/.github/workflows/text_tests.yml @@ -1,4 +1,4 @@ -name: tts-tests +name: text-tests on: push: From f40b833659fa7ab7b99dfdfe54314674edc949c3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Wed, 16 Mar 2022 12:05:17 +0100 Subject: [PATCH 13/38] Add CITATION.cff (#1404) --- CITATION.cff | 20 ++++++++++++++++++++ MANIFEST.in | 1 + 2 files changed, 21 insertions(+) create mode 100644 CITATION.cff diff --git a/CITATION.cff b/CITATION.cff new file mode 100644 index 00000000..6b0c8f19 --- /dev/null +++ b/CITATION.cff @@ -0,0 +1,20 @@ +cff-version: 1.2.0 +message: "If you want to cite 🐸💬, feel free to use this (but only if you loved it 😊)" +title: "Coqui TTS" +abstract: "A deep learning toolkit for Text-to-Speech, battle-tested in research and production" +date-released: 2021-01-01 +authors: + - family-names: "Eren" + given-names: "Gölge" + - name: "The Coqui TTS Team" +version: 1.4 +doi: 10.5281/zenodo.6334862 +license: "MPL-2.0" +url: "https://www.coqui.ai" +repository-code: "https://github.com/coqui-ai/TTS" +keywords: + - machine learning + - deep learning + - artificial intelligence + - text to speech + - TTS \ No newline at end of file diff --git a/MANIFEST.in b/MANIFEST.in index 0d8b4b4c..82ecadcb 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,6 +1,7 @@ include README.md include LICENSE.txt include requirements.*.txt +include *.cff include requirements.txt include TTS/VERSION recursive-include TTS *.json From 690c96ed28fabafe587d0da63f1fbb5037a27083 Mon Sep 17 00:00:00 2001 From: WeberJulian Date: Wed, 16 Mar 2022 12:13:22 
+0100 Subject: [PATCH 14/38] Fix default phonemizer for ja and zh (#1399) --- TTS/tts/utils/text/phonemizers/__init__.py | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/TTS/tts/utils/text/phonemizers/__init__.py b/TTS/tts/utils/text/phonemizers/__init__.py index 5dc117c4..90a526a7 100644 --- a/TTS/tts/utils/text/phonemizers/__init__.py +++ b/TTS/tts/utils/text/phonemizers/__init__.py @@ -12,16 +12,9 @@ GRUUT_LANGS = list(Gruut.supported_languages()) # Dict setting default phonemizers for each language -DEF_LANG_TO_PHONEMIZER = { - "ja-jp": JA_JP_Phonemizer.name(), - "zh-cn": ZH_CN_Phonemizer.name(), -} - - # Add Gruut languages _ = [Gruut.name()] * len(GRUUT_LANGS) -_new_dict = dict(list(zip(GRUUT_LANGS, _))) -DEF_LANG_TO_PHONEMIZER.update(_new_dict) +DEF_LANG_TO_PHONEMIZER = dict(list(zip(GRUUT_LANGS, _))) # Add ESpeak languages and override any existing ones @@ -29,8 +22,10 @@ _ = [ESpeak.name()] * len(ESPEAK_LANGS) _new_dict = dict(list(zip(list(ESPEAK_LANGS), _))) DEF_LANG_TO_PHONEMIZER.update(_new_dict) +# Force default for some languages DEF_LANG_TO_PHONEMIZER["en"] = DEF_LANG_TO_PHONEMIZER["en-us"] - +DEF_LANG_TO_PHONEMIZER["ja-jp"] = JA_JP_Phonemizer.name() +DEF_LANG_TO_PHONEMIZER["zh-cn"] = ZH_CN_Phonemizer.name() def get_phonemizer_by_name(name: str, **kwargs) -> BasePhonemizer: """Initiate a phonemizer by name From 0870a4faa2fd36e95aafb9fcdf6b31d43b6fa6d4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Wed, 16 Mar 2022 12:13:55 +0100 Subject: [PATCH 15/38] Make style (#1405) --- TTS/bin/distribute.py | 2 +- TTS/bin/eval_encoder.py | 9 +-- TTS/bin/synthesize.py | 17 ++++- TTS/bin/train_encoder.py | 57 ++++++++++----- TTS/encoder/configs/base_encoder_config.py | 5 +- TTS/encoder/dataset.py | 8 +-- TTS/encoder/losses.py | 1 + TTS/encoder/models/base_encoder.py | 69 +++++++++++-------- TTS/encoder/models/resnet.py | 1 + TTS/encoder/utils/samplers.py | 20 ++++-- TTS/tts/models/base_tts.py | 6 +- TTS/tts/models/vits.py | 13 +++- TTS/tts/utils/speakers.py | 4 +- TTS/tts/utils/synthesis.py | 8 +-- TTS/utils/synthesizer.py | 42 ++++++----- recipes/ljspeech/hifigan/train_hifigan.py | 7 +- .../train_multiband_melgan.py | 7 +- .../tacotron2-DDC/train_tacotron_ddc.py | 7 +- recipes/ljspeech/univnet/train.py | 7 +- .../multilingual/vits_tts/train_vits_tts.py | 12 +--- tests/data_tests/test_samplers.py | 21 +++--- 21 files changed, 184 insertions(+), 139 deletions(-) diff --git a/TTS/bin/distribute.py b/TTS/bin/distribute.py index 97e2f0e3..b5552e32 100644 --- a/TTS/bin/distribute.py +++ b/TTS/bin/distribute.py @@ -35,7 +35,7 @@ def main(): command += unargs command.append("") - # run processes + # run a processes per GPU processes = [] for i in range(num_gpus): my_env = os.environ.copy() diff --git a/TTS/bin/eval_encoder.py b/TTS/bin/eval_encoder.py index a03bfd82..de9e5865 100644 --- a/TTS/bin/eval_encoder.py +++ b/TTS/bin/eval_encoder.py @@ -1,17 +1,18 @@ import argparse -import torch from argparse import RawTextHelpFormatter +import torch from tqdm import tqdm from TTS.config import load_config from TTS.tts.datasets import load_tts_samples from TTS.tts.utils.speakers import SpeakerManager + def compute_encoder_accuracy(dataset_items, encoder_manager): class_name_key = encoder_manager.speaker_encoder_config.class_name_key - map_classid_to_classname = getattr(encoder_manager.speaker_encoder_config, 'map_classid_to_classname', None) + map_classid_to_classname = getattr(encoder_manager.speaker_encoder_config, "map_classid_to_classname", None) 
class_acc_dict = {} @@ -43,11 +44,11 @@ def compute_encoder_accuracy(dataset_items, encoder_manager): acc_avg = 0 for key, values in class_acc_dict.items(): - acc = sum(values)/len(values) + acc = sum(values) / len(values) print("Class", key, "Accuracy:", acc) acc_avg += acc - print("Average Accuracy:", acc_avg/len(class_acc_dict)) + print("Average Accuracy:", acc_avg / len(class_acc_dict)) if __name__ == "__main__": diff --git a/TTS/bin/synthesize.py b/TTS/bin/synthesize.py index fe31c510..8b3f53db 100755 --- a/TTS/bin/synthesize.py +++ b/TTS/bin/synthesize.py @@ -210,7 +210,13 @@ If you don't specify any models, then it uses LJSpeech based English model. args = parser.parse_args() # print the description if either text or list_models is not set - if not args.text and not args.list_models and not args.list_speaker_idxs and not args.list_language_idxs and not args.reference_wav: + if ( + not args.text + and not args.list_models + and not args.list_speaker_idxs + and not args.list_language_idxs + and not args.reference_wav + ): parser.parse_args(["-h"]) # load model manager @@ -296,7 +302,14 @@ If you don't specify any models, then it uses LJSpeech based English model. print(" > Text: {}".format(args.text)) # kick it - wav = synthesizer.tts(args.text, args.speaker_idx, args.language_idx, args.speaker_wav, reference_wav=args.reference_wav, reference_speaker_name=args.reference_speaker_idx) + wav = synthesizer.tts( + args.text, + args.speaker_idx, + args.language_idx, + args.speaker_wav, + reference_wav=args.reference_wav, + reference_speaker_name=args.reference_speaker_idx, + ) # save the results print(" > Saving output to {}".format(args.out_path)) diff --git a/TTS/bin/train_encoder.py b/TTS/bin/train_encoder.py index af3e6ec4..b8d38bac 100644 --- a/TTS/bin/train_encoder.py +++ b/TTS/bin/train_encoder.py @@ -9,6 +9,7 @@ import traceback import torch from torch.utils.data import DataLoader from trainer.torch import NoamLR +from trainer.trainer_utils import get_optimizer from TTS.encoder.dataset import EncoderDataset from TTS.encoder.utils.generic_utils import save_best_model, save_checkpoint, setup_speaker_encoder_model @@ -19,7 +20,6 @@ from TTS.tts.datasets import load_tts_samples from TTS.utils.audio import AudioProcessor from TTS.utils.generic_utils import count_parameters, remove_experiment_folder from TTS.utils.io import copy_model_files -from trainer.trainer_utils import get_optimizer from TTS.utils.training import check_update torch.backends.cudnn.enabled = True @@ -52,16 +52,21 @@ def setup_loader(ap: AudioProcessor, is_val: bool = False, verbose: bool = False sampler = PerfectBatchSampler( dataset.items, classes, - batch_size=num_classes_in_batch*num_utter_per_class, # total batch size + batch_size=num_classes_in_batch * num_utter_per_class, # total batch size num_classes_in_batch=num_classes_in_batch, num_gpus=1, shuffle=not is_val, - drop_last=True) + drop_last=True, + ) if len(classes) < num_classes_in_batch: if is_val: - raise RuntimeError(f"config.eval_num_classes_in_batch ({num_classes_in_batch}) need to be <= {len(classes)} (Number total of Classes in the Eval dataset) !") - raise RuntimeError(f"config.num_classes_in_batch ({num_classes_in_batch}) need to be <= {len(classes)} (Number total of Classes in the Train dataset) !") + raise RuntimeError( + f"config.eval_num_classes_in_batch ({num_classes_in_batch}) need to be <= {len(classes)} (Number total of Classes in the Eval dataset) !" 
+ ) + raise RuntimeError( + f"config.num_classes_in_batch ({num_classes_in_batch}) need to be <= {len(classes)} (Number total of Classes in the Train dataset) !" + ) # set the classes to avoid get wrong class_id when the number of training and eval classes are not equal if is_val: @@ -76,6 +81,7 @@ def setup_loader(ap: AudioProcessor, is_val: bool = False, verbose: bool = False return loader, classes, dataset.get_map_classid_to_classname() + def evaluation(model, criterion, data_loader, global_step): eval_loss = 0 for _, data in enumerate(data_loader): @@ -84,8 +90,12 @@ def evaluation(model, criterion, data_loader, global_step): inputs, labels = data # agroup samples of each class in the batch. perfect sampler produces [3,2,1,3,2,1] we need [3,3,2,2,1,1] - labels = torch.transpose(labels.view(c.eval_num_utter_per_class, c.eval_num_classes_in_batch), 0, 1).reshape(labels.shape) - inputs = torch.transpose(inputs.view(c.eval_num_utter_per_class, c.eval_num_classes_in_batch, -1), 0, 1).reshape(inputs.shape) + labels = torch.transpose( + labels.view(c.eval_num_utter_per_class, c.eval_num_classes_in_batch), 0, 1 + ).reshape(labels.shape) + inputs = torch.transpose( + inputs.view(c.eval_num_utter_per_class, c.eval_num_classes_in_batch, -1), 0, 1 + ).reshape(inputs.shape) # dispatch data to GPU if use_cuda: @@ -96,20 +106,23 @@ def evaluation(model, criterion, data_loader, global_step): outputs = model(inputs) # loss computation - loss = criterion(outputs.view(c.eval_num_classes_in_batch, outputs.shape[0] // c.eval_num_classes_in_batch, -1), labels) + loss = criterion( + outputs.view(c.eval_num_classes_in_batch, outputs.shape[0] // c.eval_num_classes_in_batch, -1), labels + ) eval_loss += loss.item() - eval_avg_loss = eval_loss/len(data_loader) + eval_avg_loss = eval_loss / len(data_loader) # save stats dashboard_logger.eval_stats(global_step, {"loss": eval_avg_loss}) # plot the last batch in the evaluation figures = { - "UMAP Plot": plot_embeddings(outputs.detach().cpu().numpy(), c.num_classes_in_batch), + "UMAP Plot": plot_embeddings(outputs.detach().cpu().numpy(), c.num_classes_in_batch), } dashboard_logger.eval_figures(global_step, figures) return eval_avg_loss + def train(model, optimizer, scheduler, criterion, data_loader, eval_data_loader, global_step): model.train() best_loss = float("inf") @@ -124,8 +137,12 @@ def train(model, optimizer, scheduler, criterion, data_loader, eval_data_loader, # setup input data inputs, labels = data # agroup samples of each class in the batch. 
perfect sampler produces [3,2,1,3,2,1] we need [3,3,2,2,1,1] - labels = torch.transpose(labels.view(c.num_utter_per_class, c.num_classes_in_batch), 0, 1).reshape(labels.shape) - inputs = torch.transpose(inputs.view(c.num_utter_per_class, c.num_classes_in_batch, -1), 0, 1).reshape(inputs.shape) + labels = torch.transpose(labels.view(c.num_utter_per_class, c.num_classes_in_batch), 0, 1).reshape( + labels.shape + ) + inputs = torch.transpose(inputs.view(c.num_utter_per_class, c.num_classes_in_batch, -1), 0, 1).reshape( + inputs.shape + ) # ToDo: move it to a unit test # labels_converted = torch.transpose(labels.view(c.num_utter_per_class, c.num_classes_in_batch), 0, 1).reshape(labels.shape) # inputs_converted = torch.transpose(inputs.view(c.num_utter_per_class, c.num_classes_in_batch, -1), 0, 1).reshape(inputs.shape) @@ -157,7 +174,9 @@ def train(model, optimizer, scheduler, criterion, data_loader, eval_data_loader, outputs = model(inputs) # loss computation - loss = criterion(outputs.view(c.num_classes_in_batch, outputs.shape[0] // c.num_classes_in_batch, -1), labels) + loss = criterion( + outputs.view(c.num_classes_in_batch, outputs.shape[0] // c.num_classes_in_batch, -1), labels + ) loss.backward() grad_norm, _ = check_update(model, c.grad_clip) optimizer.step() @@ -211,7 +230,7 @@ def train(model, optimizer, scheduler, criterion, data_loader, eval_data_loader, print( ">>> Epoch:{} AvgLoss: {:.5f} GradNorm:{:.5f} " "EpochTime:{:.2f} AvGLoaderTime:{:.2f} ".format( - epoch, tot_loss/len(data_loader), grad_norm, epoch_time, avg_loader_time + epoch, tot_loss / len(data_loader), grad_norm, epoch_time, avg_loader_time ), flush=True, ) @@ -222,10 +241,8 @@ def train(model, optimizer, scheduler, criterion, data_loader, eval_data_loader, print("\n\n") print("--> EVAL PERFORMANCE") print( - " | > Epoch:{} AvgLoss: {:.5f} ".format( - epoch, eval_loss - ), - flush=True, + " | > Epoch:{} AvgLoss: {:.5f} ".format(epoch, eval_loss), + flush=True, ) # save the best checkpoint best_loss = save_best_model(model, optimizer, criterion, eval_loss, best_loss, OUT_PATH, global_step, epoch) @@ -262,7 +279,9 @@ def main(args): # pylint: disable=redefined-outer-name copy_model_files(c, OUT_PATH) if args.restore_path: - criterion, args.restore_step = model.load_checkpoint(c, args.restore_path, eval=False, use_cuda=use_cuda, criterion=criterion) + criterion, args.restore_step = model.load_checkpoint( + c, args.restore_path, eval=False, use_cuda=use_cuda, criterion=criterion + ) print(" > Model restored from step %d" % args.restore_step, flush=True) else: args.restore_step = 0 diff --git a/TTS/encoder/configs/base_encoder_config.py b/TTS/encoder/configs/base_encoder_config.py index 02b88d66..ebbaa045 100644 --- a/TTS/encoder/configs/base_encoder_config.py +++ b/TTS/encoder/configs/base_encoder_config.py @@ -33,10 +33,7 @@ class BaseEncoderConfig(BaseTrainingConfig): grad_clip: float = 3.0 lr: float = 0.0001 optimizer: str = "radam" - optimizer_params: Dict = field(default_factory=lambda: { - "betas": [0.9, 0.999], - "weight_decay": 0 - }) + optimizer_params: Dict = field(default_factory=lambda: {"betas": [0.9, 0.999], "weight_decay": 0}) lr_decay: bool = False warmup_steps: int = 4000 diff --git a/TTS/encoder/dataset.py b/TTS/encoder/dataset.py index a4db021b..582b1fe9 100644 --- a/TTS/encoder/dataset.py +++ b/TTS/encoder/dataset.py @@ -5,6 +5,7 @@ from torch.utils.data import Dataset from TTS.encoder.utils.generic_utils import AugmentWAV + class EncoderDataset(Dataset): def __init__( self, @@ -57,7 +58,6 @@ class 
EncoderDataset(Dataset): print(f" | > Num Classes: {len(self.classes)}") print(f" | > Classes: {self.classes}") - def load_wav(self, filename): audio = self.ap.load_wav(filename, sr=self.ap.sample_rate) return audio @@ -75,9 +75,7 @@ class EncoderDataset(Dataset): ] # skip classes with number of samples >= self.num_utter_per_class - class_to_utters = { - k: v for (k, v) in class_to_utters.items() if len(v) >= self.num_utter_per_class - } + class_to_utters = {k: v for (k, v) in class_to_utters.items() if len(v) >= self.num_utter_per_class} classes = list(class_to_utters.keys()) classes.sort() @@ -105,11 +103,11 @@ class EncoderDataset(Dataset): def get_class_list(self): return self.classes + def set_classes(self, classes): self.classes = classes self.classname_to_classid = {key: i for i, key in enumerate(self.classes)} - def get_map_classid_to_classname(self): return dict((c_id, c_n) for c_n, c_id in self.classname_to_classid.items()) diff --git a/TTS/encoder/losses.py b/TTS/encoder/losses.py index de65d8d6..5b5aa0fc 100644 --- a/TTS/encoder/losses.py +++ b/TTS/encoder/losses.py @@ -195,6 +195,7 @@ class SoftmaxLoss(nn.Module): class_id = torch.argmax(activations) return class_id + class SoftmaxAngleProtoLoss(nn.Module): """ Implementation of the Softmax AnglePrototypical loss as defined in https://arxiv.org/abs/2009.14153 diff --git a/TTS/encoder/models/base_encoder.py b/TTS/encoder/models/base_encoder.py index c35c636d..ac7d7dd5 100644 --- a/TTS/encoder/models/base_encoder.py +++ b/TTS/encoder/models/base_encoder.py @@ -1,12 +1,13 @@ +import numpy as np import torch import torchaudio -import numpy as np +from coqpit import Coqpit from torch import nn -from TTS.utils.io import load_fsspec from TTS.encoder.losses import AngleProtoLoss, GE2ELoss, SoftmaxAngleProtoLoss from TTS.utils.generic_utils import set_init_dict -from coqpit import Coqpit +from TTS.utils.io import load_fsspec + class PreEmphasis(nn.Module): def __init__(self, coefficient=0.97): @@ -20,6 +21,7 @@ class PreEmphasis(nn.Module): x = torch.nn.functional.pad(x.unsqueeze(1), (1, 0), "reflect") return torch.nn.functional.conv1d(x, self.filter).squeeze(1) + class BaseEncoder(nn.Module): """Base `encoder` class. Every new `encoder` model must inherit this. 
@@ -32,31 +34,31 @@ class BaseEncoder(nn.Module): def get_torch_mel_spectrogram_class(self, audio_config): return torch.nn.Sequential( - PreEmphasis(audio_config["preemphasis"]), - # TorchSTFT( - # n_fft=audio_config["fft_size"], - # hop_length=audio_config["hop_length"], - # win_length=audio_config["win_length"], - # sample_rate=audio_config["sample_rate"], - # window="hamming_window", - # mel_fmin=0.0, - # mel_fmax=None, - # use_htk=True, - # do_amp_to_db=False, - # n_mels=audio_config["num_mels"], - # power=2.0, - # use_mel=True, - # mel_norm=None, - # ) - torchaudio.transforms.MelSpectrogram( - sample_rate=audio_config["sample_rate"], - n_fft=audio_config["fft_size"], - win_length=audio_config["win_length"], - hop_length=audio_config["hop_length"], - window_fn=torch.hamming_window, - n_mels=audio_config["num_mels"], - ) - ) + PreEmphasis(audio_config["preemphasis"]), + # TorchSTFT( + # n_fft=audio_config["fft_size"], + # hop_length=audio_config["hop_length"], + # win_length=audio_config["win_length"], + # sample_rate=audio_config["sample_rate"], + # window="hamming_window", + # mel_fmin=0.0, + # mel_fmax=None, + # use_htk=True, + # do_amp_to_db=False, + # n_mels=audio_config["num_mels"], + # power=2.0, + # use_mel=True, + # mel_norm=None, + # ) + torchaudio.transforms.MelSpectrogram( + sample_rate=audio_config["sample_rate"], + n_fft=audio_config["fft_size"], + win_length=audio_config["win_length"], + hop_length=audio_config["hop_length"], + window_fn=torch.hamming_window, + n_mels=audio_config["num_mels"], + ), + ) @torch.no_grad() def inference(self, x, l2_norm=True): @@ -104,7 +106,9 @@ class BaseEncoder(nn.Module): raise Exception("The %s not is a loss supported" % c.loss) return criterion - def load_checkpoint(self, config: Coqpit, checkpoint_path: str, eval: bool = False, use_cuda: bool = False, criterion=None): + def load_checkpoint( + self, config: Coqpit, checkpoint_path: str, eval: bool = False, use_cuda: bool = False, criterion=None + ): state = load_fsspec(checkpoint_path, map_location=torch.device("cpu")) try: self.load_state_dict(state["model"]) @@ -127,7 +131,12 @@ class BaseEncoder(nn.Module): print(" > Criterion load ignored because of:", error) # instance and load the criterion for the encoder classifier in inference time - if eval and criterion is None and "criterion" in state and getattr(config, 'map_classid_to_classname', None) is not None: + if ( + eval + and criterion is None + and "criterion" in state + and getattr(config, "map_classid_to_classname", None) is not None + ): criterion = self.get_criterion(config, len(config.map_classid_to_classname)) criterion.load_state_dict(state["criterion"]) diff --git a/TTS/encoder/models/resnet.py b/TTS/encoder/models/resnet.py index c4ba9537..84e9967f 100644 --- a/TTS/encoder/models/resnet.py +++ b/TTS/encoder/models/resnet.py @@ -4,6 +4,7 @@ from torch import nn # from TTS.utils.audio import TorchSTFT from TTS.encoder.models.base_encoder import BaseEncoder + class SELayer(nn.Module): def __init__(self, channel, reduction=8): super(SELayer, self).__init__() diff --git a/TTS/encoder/utils/samplers.py b/TTS/encoder/utils/samplers.py index 947f5da0..08256b34 100644 --- a/TTS/encoder/utils/samplers.py +++ b/TTS/encoder/utils/samplers.py @@ -1,4 +1,5 @@ import random + from torch.utils.data.sampler import Sampler, SubsetRandomSampler @@ -34,10 +35,21 @@ class PerfectBatchSampler(Sampler): drop_last (bool): if True, drops last incomplete batch. 
""" - def __init__(self, dataset_items, classes, batch_size, num_classes_in_batch, num_gpus=1, shuffle=True, drop_last=False, label_key="class_name"): + def __init__( + self, + dataset_items, + classes, + batch_size, + num_classes_in_batch, + num_gpus=1, + shuffle=True, + drop_last=False, + label_key="class_name", + ): super().__init__(dataset_items) - assert batch_size % (num_classes_in_batch * num_gpus) == 0, ( - 'Batch size must be divisible by number of classes times the number of data parallel devices (if enabled).') + assert ( + batch_size % (num_classes_in_batch * num_gpus) == 0 + ), "Batch size must be divisible by number of classes times the number of data parallel devices (if enabled)." label_indices = {} for idx, item in enumerate(dataset_items): @@ -93,7 +105,7 @@ class PerfectBatchSampler(Sampler): if groups % self._dp_devices == 0: yield batch else: - batch = batch[:(groups // self._dp_devices) * self._dp_devices * self._num_classes_in_batch] + batch = batch[: (groups // self._dp_devices) * self._dp_devices * self._num_classes_in_batch] if len(batch) > 0: yield batch diff --git a/TTS/tts/models/base_tts.py b/TTS/tts/models/base_tts.py index 222f8519..945c031f 100644 --- a/TTS/tts/models/base_tts.py +++ b/TTS/tts/models/base_tts.py @@ -7,15 +7,15 @@ import torch.distributed as dist from coqpit import Coqpit from torch import nn from torch.utils.data import DataLoader +from torch.utils.data.sampler import WeightedRandomSampler from trainer.torch import DistributedSampler, DistributedSamplerWrapper from TTS.model import BaseTrainerModel from TTS.tts.datasets.dataset import TTSDataset from TTS.tts.utils.languages import LanguageManager, get_language_balancer_weights -from TTS.tts.utils.speakers import SpeakerManager, get_speaker_manager, get_speaker_balancer_weights +from TTS.tts.utils.speakers import SpeakerManager, get_speaker_balancer_weights, get_speaker_manager from TTS.tts.utils.synthesis import synthesis from TTS.tts.utils.visual import plot_alignment, plot_spectrogram -from torch.utils.data.sampler import WeightedRandomSampler # pylint: skip-file @@ -258,7 +258,7 @@ class BaseTTS(BaseTrainerModel): # sampler for DDP if sampler is None: sampler = DistributedSampler(dataset) if num_gpus > 1 else None - else: # If a sampler is already defined use this sampler and DDP sampler together + else: # If a sampler is already defined use this sampler and DDP sampler together sampler = DistributedSamplerWrapper(sampler) if num_gpus > 1 else sampler return sampler diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py index 818b9a54..afadbadd 100644 --- a/TTS/tts/models/vits.py +++ b/TTS/tts/models/vits.py @@ -994,8 +994,11 @@ class Vits(BaseTTS): outputs = {"model_outputs": o, "alignments": attn.squeeze(1), "z": z, "z_p": z_p, "m_p": m_p, "logs_p": logs_p} return outputs + @torch.no_grad() - def inference_voice_conversion(self, reference_wav, speaker_id=None, d_vector=None, reference_speaker_id=None, reference_d_vector=None): + def inference_voice_conversion( + self, reference_wav, speaker_id=None, d_vector=None, reference_speaker_id=None, reference_d_vector=None + ): """Inference for voice conversion Args: @@ -1006,7 +1009,13 @@ class Vits(BaseTTS): reference_d_vector (Tensor): d_vector embedding of the reference_wav speaker. 
Tensor of shape `[B, C]` """ # compute spectrograms - y = wav_to_spec(reference_wav, self.config.audio.fft_size, self.config.audio.hop_length, self.config.audio.win_length, center=False).transpose(1, 2) + y = wav_to_spec( + reference_wav, + self.config.audio.fft_size, + self.config.audio.hop_length, + self.config.audio.win_length, + center=False, + ).transpose(1, 2) y_lengths = torch.tensor([y.size(-1)]).to(y.device) speaker_cond_src = reference_speaker_id if reference_speaker_id is not None else reference_d_vector speaker_cond_tgt = speaker_id if speaker_id is not None else d_vector diff --git a/TTS/tts/utils/speakers.py b/TTS/tts/utils/speakers.py index 1a5da94a..0227412d 100644 --- a/TTS/tts/utils/speakers.py +++ b/TTS/tts/utils/speakers.py @@ -269,7 +269,9 @@ class SpeakerManager: """ self.speaker_encoder_config = load_config(config_path) self.speaker_encoder = setup_speaker_encoder_model(self.speaker_encoder_config) - self.speaker_encoder_criterion = self.speaker_encoder.load_checkpoint(self.speaker_encoder_config, model_path, eval=True, use_cuda=self.use_cuda) + self.speaker_encoder_criterion = self.speaker_encoder.load_checkpoint( + self.speaker_encoder_config, model_path, eval=True, use_cuda=self.use_cuda + ) self.speaker_encoder_ap = AudioProcessor(**self.speaker_encoder_config.audio) def compute_d_vector_from_clip(self, wav_file: Union[str, List[str]]) -> list: diff --git a/TTS/tts/utils/synthesis.py b/TTS/tts/utils/synthesis.py index 582fb4f1..f9e13251 100644 --- a/TTS/tts/utils/synthesis.py +++ b/TTS/tts/utils/synthesis.py @@ -206,6 +206,7 @@ def synthesis( } return return_dict + def transfer_voice( model, CONFIG, @@ -269,12 +270,7 @@ def transfer_voice( _func = model.module.inference_voice_conversion else: _func = model.inference_voice_conversion - model_outputs = _func( - reference_wav, - speaker_id, - d_vector, - reference_speaker_id, - reference_d_vector) + model_outputs = _func(reference_wav, speaker_id, d_vector, reference_speaker_id, reference_d_vector) # convert outputs to numpy # plot results diff --git a/TTS/utils/synthesizer.py b/TTS/utils/synthesizer.py index 687794b4..2ea23adb 100644 --- a/TTS/utils/synthesizer.py +++ b/TTS/utils/synthesizer.py @@ -119,7 +119,7 @@ class Synthesizer(object): if use_cuda: self.tts_model.cuda() - if self.encoder_checkpoint and hasattr(self.tts_model, "speaker_manager"): + if self.encoder_checkpoint and hasattr(self.tts_model, "speaker_manager"): self.tts_model.speaker_manager.init_speaker_encoder(self.encoder_checkpoint, self.encoder_config) def _set_speaker_encoder_paths_from_tts_config(self): @@ -199,8 +199,8 @@ class Synthesizer(object): if not text and not reference_wav: raise ValueError( - "You need to define either `text` (for sythesis) or a `reference_wav` (for voice conversion) to use the Coqui TTS API." - ) + "You need to define either `text` (for sythesis) or a `reference_wav` (for voice conversion) to use the Coqui TTS API." + ) if text: sens = self.split_into_sentences(text) @@ -214,7 +214,9 @@ class Synthesizer(object): if speaker_name and isinstance(speaker_name, str): if self.tts_config.use_d_vector_file: # get the average speaker embedding from the saved d_vectors. 
- speaker_embedding = self.tts_model.speaker_manager.get_mean_d_vector(speaker_name, num_samples=None, randomize=False) + speaker_embedding = self.tts_model.speaker_manager.get_mean_d_vector( + speaker_name, num_samples=None, randomize=False + ) speaker_embedding = np.array(speaker_embedding)[None, :] # [1 x embedding_dim] else: # get speaker idx from the speaker name @@ -315,25 +317,31 @@ class Synthesizer(object): if reference_speaker_name and isinstance(reference_speaker_name, str): if self.tts_config.use_d_vector_file: # get the speaker embedding from the saved d_vectors. - reference_speaker_embedding = self.tts_model.speaker_manager.get_d_vectors_by_speaker(reference_speaker_name)[0] - reference_speaker_embedding = np.array(reference_speaker_embedding)[None, :] # [1 x embedding_dim] + reference_speaker_embedding = self.tts_model.speaker_manager.get_d_vectors_by_speaker( + reference_speaker_name + )[0] + reference_speaker_embedding = np.array(reference_speaker_embedding)[ + None, : + ] # [1 x embedding_dim] else: # get speaker idx from the speaker name reference_speaker_id = self.tts_model.speaker_manager.speaker_ids[reference_speaker_name] else: - reference_speaker_embedding = self.tts_model.speaker_manager.compute_d_vector_from_clip(reference_wav) + reference_speaker_embedding = self.tts_model.speaker_manager.compute_d_vector_from_clip( + reference_wav + ) outputs = transfer_voice( - model=self.tts_model, - CONFIG=self.tts_config, - use_cuda=self.use_cuda, - reference_wav=reference_wav, - speaker_id=speaker_id, - d_vector=speaker_embedding, - use_griffin_lim=use_gl, - reference_speaker_id=reference_speaker_id, - reference_d_vector=reference_speaker_embedding - ) + model=self.tts_model, + CONFIG=self.tts_config, + use_cuda=self.use_cuda, + reference_wav=reference_wav, + speaker_id=speaker_id, + d_vector=speaker_embedding, + use_griffin_lim=use_gl, + reference_speaker_id=reference_speaker_id, + reference_d_vector=reference_speaker_embedding, + ) waveform = outputs if not use_gl: mel_postnet_spec = outputs[0].detach().cpu().numpy() diff --git a/recipes/ljspeech/hifigan/train_hifigan.py b/recipes/ljspeech/hifigan/train_hifigan.py index 6a739009..b4cbae63 100644 --- a/recipes/ljspeech/hifigan/train_hifigan.py +++ b/recipes/ljspeech/hifigan/train_hifigan.py @@ -41,11 +41,6 @@ model = GAN(config, ap) # init the trainer and 🚀 trainer = Trainer( - TrainerArgs(), - config, - output_path, - model=model, - train_samples=train_samples, - eval_samples=eval_samples + TrainerArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples ) trainer.fit() diff --git a/recipes/ljspeech/multiband_melgan/train_multiband_melgan.py b/recipes/ljspeech/multiband_melgan/train_multiband_melgan.py index d5ca9a76..225f5a30 100644 --- a/recipes/ljspeech/multiband_melgan/train_multiband_melgan.py +++ b/recipes/ljspeech/multiband_melgan/train_multiband_melgan.py @@ -41,11 +41,6 @@ model = GAN(config, ap) # init the trainer and 🚀 trainer = Trainer( - TrainerArgs(), - config, - output_path, - model=model, - train_samples=train_samples, - eval_samples=eval_samples + TrainerArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples ) trainer.fit() diff --git a/recipes/ljspeech/tacotron2-DDC/train_tacotron_ddc.py b/recipes/ljspeech/tacotron2-DDC/train_tacotron_ddc.py index a0ff8b02..04e6150e 100644 --- a/recipes/ljspeech/tacotron2-DDC/train_tacotron_ddc.py +++ b/recipes/ljspeech/tacotron2-DDC/train_tacotron_ddc.py @@ -84,11 +84,6 @@ model = 
Tacotron2(config, ap, tokenizer, speaker_manager=None) # init the trainer and 🚀 trainer = Trainer( - TrainerArgs(), - config, - output_path, - model=model, - train_samples=train_samples, - eval_samples=eval_samples + TrainerArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples ) trainer.fit() diff --git a/recipes/ljspeech/univnet/train.py b/recipes/ljspeech/univnet/train.py index 592b9a76..81d2b889 100644 --- a/recipes/ljspeech/univnet/train.py +++ b/recipes/ljspeech/univnet/train.py @@ -40,11 +40,6 @@ model = GAN(config, ap) # init the trainer and 🚀 trainer = Trainer( - TrainerArgs(), - config, - output_path, - model=model, - train_samples=train_samples, - eval_samples=eval_samples + TrainerArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples ) trainer.fit() diff --git a/recipes/multilingual/vits_tts/train_vits_tts.py b/recipes/multilingual/vits_tts/train_vits_tts.py index c4ed0dda..26eb46be 100644 --- a/recipes/multilingual/vits_tts/train_vits_tts.py +++ b/recipes/multilingual/vits_tts/train_vits_tts.py @@ -6,12 +6,11 @@ from trainer import Trainer, TrainerArgs from TTS.config.shared_configs import BaseAudioConfig from TTS.tts.configs.shared_configs import BaseDatasetConfig from TTS.tts.configs.vits_config import VitsConfig -from TTS.tts.models.vits import CharactersConfig from TTS.tts.datasets import load_tts_samples -from TTS.tts.models.vits import Vits, VitsArgs +from TTS.tts.models.vits import CharactersConfig, Vits, VitsArgs from TTS.tts.utils.languages import LanguageManager -from TTS.tts.utils.text.tokenizer import TTSTokenizer from TTS.tts.utils.speakers import SpeakerManager +from TTS.tts.utils.text.tokenizer import TTSTokenizer from TTS.utils.audio import AudioProcessor output_path = os.path.dirname(os.path.abspath(__file__)) @@ -131,11 +130,6 @@ model = Vits(config, ap, tokenizer, speaker_manager, language_manager) # init the trainer and 🚀 trainer = Trainer( - TrainerArgs(), - config, - output_path, - model=model, - train_samples=train_samples, - eval_samples=eval_samples + TrainerArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples ) trainer.fit() diff --git a/tests/data_tests/test_samplers.py b/tests/data_tests/test_samplers.py index c888c629..42f1bfd5 100644 --- a/tests/data_tests/test_samplers.py +++ b/tests/data_tests/test_samplers.py @@ -1,14 +1,13 @@ import functools - import unittest import torch from TTS.config.shared_configs import BaseDatasetConfig +from TTS.encoder.utils.samplers import PerfectBatchSampler from TTS.tts.datasets import load_tts_samples from TTS.tts.utils.languages import get_language_balancer_weights from TTS.tts.utils.speakers import get_speaker_balancer_weights -from TTS.encoder.utils.samplers import PerfectBatchSampler # Fixing random state to avoid random fails torch.manual_seed(0) @@ -60,7 +59,9 @@ class TestSamplers(unittest.TestCase): assert not is_balanced(en, pt), "Random sampler is supposed to be unbalanced" def test_language_weighted_random_sampler(self): # pylint: disable=no-self-use - weighted_sampler = torch.utils.data.sampler.WeightedRandomSampler(get_language_balancer_weights(train_samples), len(train_samples)) + weighted_sampler = torch.utils.data.sampler.WeightedRandomSampler( + get_language_balancer_weights(train_samples), len(train_samples) + ) ids = functools.reduce(lambda a, b: a + b, [list(weighted_sampler) for i in range(100)]) en, pt = 0, 0 for index in ids: @@ -73,7 +74,9 @@ class 
TestSamplers(unittest.TestCase): def test_speaker_weighted_random_sampler(self): # pylint: disable=no-self-use - weighted_sampler = torch.utils.data.sampler.WeightedRandomSampler(get_speaker_balancer_weights(train_samples), len(train_samples)) + weighted_sampler = torch.utils.data.sampler.WeightedRandomSampler( + get_speaker_balancer_weights(train_samples), len(train_samples) + ) ids = functools.reduce(lambda a, b: a + b, [list(weighted_sampler) for i in range(100)]) spk1, spk2 = 0, 0 for index in ids: @@ -92,11 +95,12 @@ class TestSamplers(unittest.TestCase): sampler = PerfectBatchSampler( train_samples, classes, - batch_size=2 * 3, # total batch size + batch_size=2 * 3, # total batch size num_classes_in_batch=2, label_key="speaker_name", shuffle=False, - drop_last=True) + drop_last=True, + ) batchs = functools.reduce(lambda a, b: a + b, [list(sampler) for i in range(100)]) for batch in batchs: spk1, spk2 = 0, 0 @@ -116,11 +120,12 @@ class TestSamplers(unittest.TestCase): sampler = PerfectBatchSampler( train_samples, classes, - batch_size=2 * 3, # total batch size + batch_size=2 * 3, # total batch size num_classes_in_batch=2, label_key="speaker_name", shuffle=True, - drop_last=False) + drop_last=False, + ) batchs = functools.reduce(lambda a, b: a + b, [list(sampler) for i in range(100)]) for batch in batchs: spk1, spk2 = 0, 0 From fd56fabb21db87059c27c0d772e6948ffc129a14 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Wed, 16 Mar 2022 12:38:27 +0100 Subject: [PATCH 16/38] Fix #1380 (#1409) --- TTS/tts/datasets/formatters.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/TTS/tts/datasets/formatters.py b/TTS/tts/datasets/formatters.py index 3e963d0c..c13fcdb8 100644 --- a/TTS/tts/datasets/formatters.py +++ b/TTS/tts/datasets/formatters.py @@ -246,7 +246,7 @@ def libri_tts(root_path, meta_files=None, ignored_speakers=None): continue items.append({"text": text, "audio_file": wav_file, "speaker_name": f"LTTS_{speaker_name}"}) for item in items: - assert os.path.exists(item[1]), f" [!] wav files don't exist - {item[1]}" + assert os.path.exists(item["audio_file"]), f" [!] 
wav files don't exist - {item['audio_file']}" return items From c7f9ec07c86031126e6eddcbbb45bd906d0425e1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Fri, 18 Mar 2022 16:47:50 +0100 Subject: [PATCH 17/38] Hinge Gruut version to 2.2.3 (#1419) --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index e3871874..c3599220 100644 --- a/requirements.txt +++ b/requirements.txt @@ -33,6 +33,6 @@ pypinyin mecab-python3==1.0.3 unidic-lite==1.0.8 # gruut+supported langs -gruut[cs,de,es,fr,it,nl,pt,ru,sv]~=2.0.0 +gruut[cs,de,es,fr,it,nl,pt,ru,sv]==2.2.3 # others webrtcvad # for VAD From 2e6e8f651d1a8330f8bf6e5b19307d838f0708e1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Fri, 18 Mar 2022 16:48:24 +0100 Subject: [PATCH 18/38] Update CheckSpectrograms notebook (#1418) --- .../dataset_analysis/CheckSpectrograms.ipynb | 222 ++++++++++-------- 1 file changed, 126 insertions(+), 96 deletions(-) diff --git a/notebooks/dataset_analysis/CheckSpectrograms.ipynb b/notebooks/dataset_analysis/CheckSpectrograms.ipynb index 74ca51ab..47e5c4cf 100644 --- a/notebooks/dataset_analysis/CheckSpectrograms.ipynb +++ b/notebooks/dataset_analysis/CheckSpectrograms.ipynb @@ -3,6 +3,10 @@ { "cell_type": "code", "execution_count": null, + "metadata": { + "Collapsed": "false" + }, + "outputs": [], "source": [ "%matplotlib inline\n", "\n", @@ -12,21 +16,51 @@ "\n", "import IPython.display as ipd\n", "import glob" - ], - "outputs": [], - "metadata": { - "Collapsed": "false" - } + ] }, { "cell_type": "code", "execution_count": null, + "metadata": { + "Collapsed": "false" + }, + "outputs": [], "source": [ - "config_path = \"/home/erogol/gdrive/Projects/TTS/recipes/ljspeech/align_tts/config_transformer2.json\"\n", - "data_path = \"/home/erogol/gdrive/Datasets/LJSpeech-1.1/\"\n", - "\n", - "file_paths = glob.glob(data_path + \"/**/*.wav\", recursive=True)\n", - "CONFIG = load_config(config_path)\n", + "from TTS.config.shared_configs import BaseAudioConfig\n", + "CONFIG = BaseAudioConfig()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## ✍️ Set these values " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "data_path = \"/root/wav48_silence_trimmed/\"\n", + "file_ext = \".flac\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Read audio files" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "file_paths = glob.glob(data_path + f\"/**/*{file_ext}\", recursive=True)\n", "\n", "# Change this to the index of the desired file listed below\n", "sample_file_index = 10\n", @@ -35,44 +69,45 @@ "\n", "print(\"File list, by index:\")\n", "dict(enumerate(file_paths))" - ], - "outputs": [], - "metadata": { - "Collapsed": "false" - } + ] }, { "cell_type": "markdown", + "metadata": { + "Collapsed": "false" + }, "source": [ - "### Setup Audio Processor\n", + "## ✍️ Set Audio Processor\n", "Play with the AP parameters until you find a good fit with the synthesis speech below.\n", "\n", "The default values are loaded from your config.json file, so you only need to\n", "uncomment and modify values below that you'd like to tune." 
- ], - "metadata": { - "Collapsed": "false" - } + ] }, { "cell_type": "code", "execution_count": null, + "metadata": { + "Collapsed": "false" + }, + "outputs": [], "source": [ "tune_params={\n", - "# 'audio_processor': 'audio',\n", - "# 'num_mels': 80, # In general, you don't need to change this. \n", - "# 'fft_size': 1024, # In general, you don't need to change this.\n", - "# 'sample_rate': 22050, # This must match the sample rate of the dataset.\n", - "# 'hop_length': 256, # In general, you don't need to change this.\n", - "# 'win_length': 1024, # In general, you don't need to change this.\n", - "# 'preemphasis': 0.98, # In general, 0 gives better voice recovery but makes training harder. If your model does not train, try 0.97 - 0.99.\n", - "# 'min_level_db': -100,\n", - "# 'ref_level_db': 0, # The base DB; increase until all background noise is removed in the spectrogram, then lower until you hear better speech below.\n", - "# 'power': 1.5, # Change this value and listen to the synthesized voice. 1.2 - 1.5 are resonable values.\n", - "# 'griffin_lim_iters': 60, # Quality does not improve for values > 60\n", - "# 'mel_fmin': 0.0, # Adjust this and check mel-spectrogram-based voice synthesis below.\n", - "# 'mel_fmax': 8000.0, # Adjust this and check mel-spectrogram-based voice synthesis below.\n", - "# 'do_trim_silence': True # If you dataset has some silience at the beginning or end, this trims it. Check the AP.load_wav() below,if it causes any difference for the loaded audio file.\n", + " 'num_mels': 80, # In general, you don't need to change this. \n", + " 'fft_size': 2400, # In general, you don't need to change this.\n", + " 'frame_length_ms': 50, \n", + " 'frame_shift_ms': 12.5,\n", + " 'sample_rate': 48000, # This must match the sample rate of the dataset.\n", + " 'hop_length': None, # In general, you don't need to change this.\n", + " 'win_length': 1024, # In general, you don't need to change this.\n", + " 'preemphasis': 0.98, # In general, 0 gives better voice recovery but makes training harder. If your model does not train, try 0.97 - 0.99.\n", + " 'min_level_db': -100,\n", + " 'ref_level_db': 0, # The base DB; increase until all background noise is removed in the spectrogram, then lower until you hear better speech below.\n", + " 'power': 1.5, # Change this value and listen to the synthesized voice. 1.2 - 1.5 are resonable values.\n", + " 'griffin_lim_iters': 60, # Quality does not improve for values > 60\n", + " 'mel_fmin': 0.0, # Adjust this and check mel-spectrogram-based voice synthesis below.\n", + " 'mel_fmax': 8000.0, # Adjust this and check mel-spectrogram-based voice synthesis below.\n", + " 'do_trim_silence': True # If you dataset has some silience at the beginning or end, this trims it. 
Check the AP.load_wav() below,if it causes any difference for the loaded audio file.\n", "}\n", "\n", "# These options have to be forced off in order to avoid errors about the \n", @@ -86,59 +121,57 @@ "}\n", "\n", "# Override select parts of loaded config with parameters above\n", - "tuned_config = CONFIG.audio.copy()\n", + "tuned_config = CONFIG.copy()\n", "tuned_config.update(reset)\n", "tuned_config.update(tune_params)\n", "\n", "AP = AudioProcessor(**tuned_config);" - ], - "outputs": [], - "metadata": { - "Collapsed": "false" - } + ] }, { "cell_type": "markdown", - "source": [ - "### Check audio loading " - ], "metadata": { "Collapsed": "false" - } + }, + "source": [ + "### Check audio loading " + ] }, { "cell_type": "code", "execution_count": null, + "metadata": { + "Collapsed": "false" + }, + "outputs": [], "source": [ "wav = AP.load_wav(SAMPLE_FILE_PATH)\n", "ipd.Audio(data=wav, rate=AP.sample_rate) " - ], - "outputs": [], - "metadata": { - "Collapsed": "false" - } + ] }, { "cell_type": "markdown", - "source": [ - "### Generate Mel-Spectrogram and Re-synthesis with GL" - ], "metadata": { "Collapsed": "false" - } + }, + "source": [ + "### Generate Mel-Spectrogram and Re-synthesis with GL" + ] }, { "cell_type": "code", "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "AP.power = 1.5" - ], - "outputs": [], - "metadata": {} + ] }, { "cell_type": "code", "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "mel = AP.melspectrogram(wav)\n", "print(\"Max:\", mel.max())\n", @@ -148,24 +181,24 @@ "\n", "wav_gen = AP.inv_melspectrogram(mel)\n", "ipd.Audio(wav_gen, rate=AP.sample_rate)" - ], - "outputs": [], - "metadata": { - "Collapsed": "false" - } + ] }, { "cell_type": "markdown", - "source": [ - "### Generate Linear-Spectrogram and Re-synthesis with GL" - ], "metadata": { "Collapsed": "false" - } + }, + "source": [ + "### Generate Linear-Spectrogram and Re-synthesis with GL" + ] }, { "cell_type": "code", "execution_count": null, + "metadata": { + "Collapsed": "false" + }, + "outputs": [], "source": [ "spec = AP.spectrogram(wav)\n", "print(\"Max:\", spec.max())\n", @@ -175,26 +208,26 @@ "\n", "wav_gen = AP.inv_spectrogram(spec)\n", "ipd.Audio(wav_gen, rate=AP.sample_rate)" - ], - "outputs": [], - "metadata": { - "Collapsed": "false" - } + ] }, { "cell_type": "markdown", + "metadata": { + "Collapsed": "false" + }, "source": [ "### Compare values for a certain parameter\n", "\n", "Optimize your parameters by comparing different values per parameter at a time." 
- ], - "metadata": { - "Collapsed": "false" - } + ] }, { "cell_type": "code", "execution_count": null, + "metadata": { + "Collapsed": "false" + }, + "outputs": [], "source": [ "from librosa import display\n", "from matplotlib import pylab as plt\n", @@ -234,39 +267,39 @@ " val = values[idx]\n", " print(\" > {} = {}\".format(attribute, val))\n", " IPython.display.display(IPython.display.Audio(wav_gen, rate=AP.sample_rate))" - ], - "outputs": [], - "metadata": { - "Collapsed": "false" - } + ] }, { "cell_type": "code", "execution_count": null, + "metadata": { + "Collapsed": "false" + }, + "outputs": [], "source": [ "compare_values(\"preemphasis\", [0, 0.5, 0.97, 0.98, 0.99])" - ], - "outputs": [], - "metadata": { - "Collapsed": "false" - } + ] }, { "cell_type": "code", "execution_count": null, - "source": [ - "compare_values(\"ref_level_db\", [2, 5, 10, 15, 20, 25, 30, 35, 40, 1000])" - ], - "outputs": [], "metadata": { "Collapsed": "false" - } + }, + "outputs": [], + "source": [ + "compare_values(\"ref_level_db\", [2, 5, 10, 15, 20, 25, 30, 35, 40, 1000])" + ] } ], "metadata": { + "interpreter": { + "hash": "27648abe09795c3a768a281b31f7524fcf66a207e733f8ecda3a4e1fd1059fb0" + }, "kernelspec": { - "name": "python3", - "display_name": "Python 3.8.5 64-bit ('torch': conda)" + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" }, "language_info": { "codemirror_mode": { @@ -278,12 +311,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.5" - }, - "interpreter": { - "hash": "27648abe09795c3a768a281b31f7524fcf66a207e733f8ecda3a4e1fd1059fb0" + "version": "3.9.5" } }, "nbformat": 4, "nbformat_minor": 4 -} \ No newline at end of file +} From ccdc2300dc91ced60f93808eae56aef15e92cd96 Mon Sep 17 00:00:00 2001 From: Edresson Casanova Date: Tue, 22 Mar 2022 08:54:41 -0300 Subject: [PATCH 19/38] Add eval_split and eval_split_size in the call of load_tts_samples for all recipes (#1424) --- recipes/ljspeech/align_tts/train_aligntts.py | 2 +- recipes/ljspeech/fast_pitch/train_fast_pitch.py | 2 +- recipes/ljspeech/fast_speech/train_fast_speech.py | 2 +- recipes/ljspeech/glow_tts/train_glowtts.py | 2 +- recipes/ljspeech/speedy_speech/train_speedy_speech.py | 2 +- recipes/ljspeech/tacotron2-DCA/train_tacotron_dca.py | 2 +- recipes/ljspeech/tacotron2-DDC/train_tacotron_ddc.py | 2 +- recipes/ljspeech/vits_tts/train_vits.py | 2 +- recipes/multilingual/vits_tts/train_vits_tts.py | 2 +- recipes/vctk/fast_pitch/train_fast_pitch.py | 2 +- recipes/vctk/fast_speech/train_fast_speech.py | 2 +- recipes/vctk/glow_tts/train_glow_tts.py | 2 +- recipes/vctk/speedy_speech/train_speedy_speech.py | 2 +- recipes/vctk/tacotron-DDC/train_tacotron-DDC.py | 2 +- recipes/vctk/tacotron2-DDC/train_tacotron2-ddc.py | 2 +- recipes/vctk/tacotron2/train_tacotron2.py | 2 +- recipes/vctk/vits/train_vits.py | 2 +- 17 files changed, 17 insertions(+), 17 deletions(-) diff --git a/recipes/ljspeech/align_tts/train_aligntts.py b/recipes/ljspeech/align_tts/train_aligntts.py index f1b29025..d27d0fa1 100644 --- a/recipes/ljspeech/align_tts/train_aligntts.py +++ b/recipes/ljspeech/align_tts/train_aligntts.py @@ -49,7 +49,7 @@ tokenizer, config = TTSTokenizer.init_from_config(config) # You can define your custom sample loader returning the list of samples. # Or define your custom formatter and pass it to the `load_tts_samples`. # Check `TTS.tts.datasets.load_tts_samples` for more details. 
-train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True) +train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True, eval_split_max_size=config.eval_split_max_size, eval_split_size=config.eval_split_size) # init model model = AlignTTS(config, ap, tokenizer) diff --git a/recipes/ljspeech/fast_pitch/train_fast_pitch.py b/recipes/ljspeech/fast_pitch/train_fast_pitch.py index a3fc35c9..1f10ef07 100644 --- a/recipes/ljspeech/fast_pitch/train_fast_pitch.py +++ b/recipes/ljspeech/fast_pitch/train_fast_pitch.py @@ -84,7 +84,7 @@ tokenizer, config = TTSTokenizer.init_from_config(config) # You can define your custom sample loader returning the list of samples. # Or define your custom formatter and pass it to the `load_tts_samples`. # Check `TTS.tts.datasets.load_tts_samples` for more details. -train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True) +train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True, eval_split_max_size=config.eval_split_max_size, eval_split_size=config.eval_split_size) # init the model model = ForwardTTS(config, ap, tokenizer, speaker_manager=None) diff --git a/recipes/ljspeech/fast_speech/train_fast_speech.py b/recipes/ljspeech/fast_speech/train_fast_speech.py index 560d3de2..e5a601a7 100644 --- a/recipes/ljspeech/fast_speech/train_fast_speech.py +++ b/recipes/ljspeech/fast_speech/train_fast_speech.py @@ -83,7 +83,7 @@ tokenizer, config = TTSTokenizer.init_from_config(config) # You can define your custom sample loader returning the list of samples. # Or define your custom formatter and pass it to the `load_tts_samples`. # Check `TTS.tts.datasets.load_tts_samples` for more details. -train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True) +train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True, eval_split_max_size=config.eval_split_max_size, eval_split_size=config.eval_split_size) # init the model model = ForwardTTS(config, ap, tokenizer) diff --git a/recipes/ljspeech/glow_tts/train_glowtts.py b/recipes/ljspeech/glow_tts/train_glowtts.py index c47cd00a..47d03fe3 100644 --- a/recipes/ljspeech/glow_tts/train_glowtts.py +++ b/recipes/ljspeech/glow_tts/train_glowtts.py @@ -60,7 +60,7 @@ tokenizer, config = TTSTokenizer.init_from_config(config) # You can define your custom sample loader returning the list of samples. # Or define your custom formatter and pass it to the `load_tts_samples`. # Check `TTS.tts.datasets.load_tts_samples` for more details. -train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True) +train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True, eval_split_max_size=config.eval_split_max_size, eval_split_size=config.eval_split_size) # INITIALIZE THE MODEL # Models take a config object and a speaker manager as input diff --git a/recipes/ljspeech/speedy_speech/train_speedy_speech.py b/recipes/ljspeech/speedy_speech/train_speedy_speech.py index 7ad132b2..a19e9053 100644 --- a/recipes/ljspeech/speedy_speech/train_speedy_speech.py +++ b/recipes/ljspeech/speedy_speech/train_speedy_speech.py @@ -67,7 +67,7 @@ tokenizer, config = TTSTokenizer.init_from_config(config) # You can define your custom sample loader returning the list of samples. # Or define your custom formatter and pass it to the `load_tts_samples`. # Check `TTS.tts.datasets.load_tts_samples` for more details. 
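The same one-line change recurs in every recipe hunk of this patch: `load_tts_samples()` now receives the evaluation-split limits from the recipe's config instead of relying on library defaults. For readability, the updated call looks roughly like the sketch below, where `dataset_config` and `config` are assumed to be the objects each recipe already defines (illustrative only, not part of the diff):

```python
# Sketch of the updated call used across these recipes (not part of the diff).
# `dataset_config` and `config` are assumed to be the dataset/model configs each recipe builds.
from TTS.tts.datasets import load_tts_samples

train_samples, eval_samples = load_tts_samples(
    dataset_config,
    eval_split=True,
    eval_split_max_size=config.eval_split_max_size,  # upper bound on the number of eval samples
    eval_split_size=config.eval_split_size,          # fraction (or absolute count) held out for eval
)
```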
-train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True) +train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True, eval_split_max_size=config.eval_split_max_size, eval_split_size=config.eval_split_size) # init model model = ForwardTTS(config, ap, tokenizer) diff --git a/recipes/ljspeech/tacotron2-DCA/train_tacotron_dca.py b/recipes/ljspeech/tacotron2-DCA/train_tacotron_dca.py index ea1b0874..19a9f315 100644 --- a/recipes/ljspeech/tacotron2-DCA/train_tacotron_dca.py +++ b/recipes/ljspeech/tacotron2-DCA/train_tacotron_dca.py @@ -77,7 +77,7 @@ tokenizer, config = TTSTokenizer.init_from_config(config) # You can define your custom sample loader returning the list of samples. # Or define your custom formatter and pass it to the `load_tts_samples`. # Check `TTS.tts.datasets.load_tts_samples` for more details. -train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True) +train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True, eval_split_max_size=config.eval_split_max_size, eval_split_size=config.eval_split_size) # INITIALIZE THE MODEL # Models take a config object and a speaker manager as input diff --git a/recipes/ljspeech/tacotron2-DDC/train_tacotron_ddc.py b/recipes/ljspeech/tacotron2-DDC/train_tacotron_ddc.py index 04e6150e..029698d8 100644 --- a/recipes/ljspeech/tacotron2-DDC/train_tacotron_ddc.py +++ b/recipes/ljspeech/tacotron2-DDC/train_tacotron_ddc.py @@ -74,7 +74,7 @@ tokenizer, config = TTSTokenizer.init_from_config(config) # You can define your custom sample loader returning the list of samples. # Or define your custom formatter and pass it to the `load_tts_samples`. # Check `TTS.tts.datasets.load_tts_samples` for more details. -train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True) +train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True, eval_split_max_size=config.eval_split_max_size, eval_split_size=config.eval_split_size) # INITIALIZE THE MODEL # Models take a config object and a speaker manager as input diff --git a/recipes/ljspeech/vits_tts/train_vits.py b/recipes/ljspeech/vits_tts/train_vits.py index cfb3351d..e38dc200 100644 --- a/recipes/ljspeech/vits_tts/train_vits.py +++ b/recipes/ljspeech/vits_tts/train_vits.py @@ -69,7 +69,7 @@ tokenizer, config = TTSTokenizer.init_from_config(config) # You can define your custom sample loader returning the list of samples. # Or define your custom formatter and pass it to the `load_tts_samples`. # Check `TTS.tts.datasets.load_tts_samples` for more details. 
-train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True) +train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True, eval_split_max_size=config.eval_split_max_size, eval_split_size=config.eval_split_size) # init model model = Vits(config, ap, tokenizer, speaker_manager=None) diff --git a/recipes/multilingual/vits_tts/train_vits_tts.py b/recipes/multilingual/vits_tts/train_vits_tts.py index 26eb46be..9e0cb4c8 100644 --- a/recipes/multilingual/vits_tts/train_vits_tts.py +++ b/recipes/multilingual/vits_tts/train_vits_tts.py @@ -109,7 +109,7 @@ config.from_dict(config.to_dict()) ap = AudioProcessor(**config.audio.to_dict()) # load training samples -train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True) +train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True, eval_split_max_size=config.eval_split_max_size, eval_split_size=config.eval_split_size) # init speaker manager for multi-speaker training # it maps speaker-id to speaker-name in the model and data-loader diff --git a/recipes/vctk/fast_pitch/train_fast_pitch.py b/recipes/vctk/fast_pitch/train_fast_pitch.py index 986202c5..d066a539 100644 --- a/recipes/vctk/fast_pitch/train_fast_pitch.py +++ b/recipes/vctk/fast_pitch/train_fast_pitch.py @@ -71,7 +71,7 @@ tokenizer, config = TTSTokenizer.init_from_config(config) # You can define your custom sample loader returning the list of samples. # Or define your custom formatter and pass it to the `load_tts_samples`. # Check `TTS.tts.datasets.load_tts_samples` for more details. -train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True) +train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True, eval_split_max_size=config.eval_split_max_size, eval_split_size=config.eval_split_size) # init speaker manager for multi-speaker training # it maps speaker-id to speaker-name in the model and data-loader diff --git a/recipes/vctk/fast_speech/train_fast_speech.py b/recipes/vctk/fast_speech/train_fast_speech.py index fe785a41..dbe23351 100644 --- a/recipes/vctk/fast_speech/train_fast_speech.py +++ b/recipes/vctk/fast_speech/train_fast_speech.py @@ -69,7 +69,7 @@ tokenizer, config = TTSTokenizer.init_from_config(config) # You can define your custom sample loader returning the list of samples. # Or define your custom formatter and pass it to the `load_tts_samples`. # Check `TTS.tts.datasets.load_tts_samples` for more details. -train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True) +train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True, eval_split_max_size=config.eval_split_max_size, eval_split_size=config.eval_split_size) # init speaker manager for multi-speaker training # it maps speaker-id to speaker-name in the model and data-loader diff --git a/recipes/vctk/glow_tts/train_glow_tts.py b/recipes/vctk/glow_tts/train_glow_tts.py index ebdbfb37..8a891e5d 100644 --- a/recipes/vctk/glow_tts/train_glow_tts.py +++ b/recipes/vctk/glow_tts/train_glow_tts.py @@ -69,7 +69,7 @@ tokenizer, config = TTSTokenizer.init_from_config(config) # You can define your custom sample loader returning the list of samples. # Or define your custom formatter and pass it to the `load_tts_samples`. # Check `TTS.tts.datasets.load_tts_samples` for more details. 
-train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True) +train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True, eval_split_max_size=config.eval_split_max_size, eval_split_size=config.eval_split_size) # init speaker manager for multi-speaker training # it maps speaker-id to speaker-name in the model and data-loader diff --git a/recipes/vctk/speedy_speech/train_speedy_speech.py b/recipes/vctk/speedy_speech/train_speedy_speech.py index 80d21ca2..d9353af2 100644 --- a/recipes/vctk/speedy_speech/train_speedy_speech.py +++ b/recipes/vctk/speedy_speech/train_speedy_speech.py @@ -69,7 +69,7 @@ tokenizer, config = TTSTokenizer.init_from_config(config) # You can define your custom sample loader returning the list of samples. # Or define your custom formatter and pass it to the `load_tts_samples`. # Check `TTS.tts.datasets.load_tts_samples` for more details. -train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True) +train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True, eval_split_max_size=config.eval_split_max_size, eval_split_size=config.eval_split_size) # init speaker manager for multi-speaker training # it maps speaker-id to speaker-name in the model and data-loader diff --git a/recipes/vctk/tacotron-DDC/train_tacotron-DDC.py b/recipes/vctk/tacotron-DDC/train_tacotron-DDC.py index bed21ad9..14007239 100644 --- a/recipes/vctk/tacotron-DDC/train_tacotron-DDC.py +++ b/recipes/vctk/tacotron-DDC/train_tacotron-DDC.py @@ -72,7 +72,7 @@ tokenizer, config = TTSTokenizer.init_from_config(config) # You can define your custom sample loader returning the list of samples. # Or define your custom formatter and pass it to the `load_tts_samples`. # Check `TTS.tts.datasets.load_tts_samples` for more details. -train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True) +train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True, eval_split_max_size=config.eval_split_max_size, eval_split_size=config.eval_split_size) # init speaker manager for multi-speaker training # it mainly handles speaker-id to speaker-name for the model and the data-loader diff --git a/recipes/vctk/tacotron2-DDC/train_tacotron2-ddc.py b/recipes/vctk/tacotron2-DDC/train_tacotron2-ddc.py index caa745b3..ab2e1bc9 100644 --- a/recipes/vctk/tacotron2-DDC/train_tacotron2-ddc.py +++ b/recipes/vctk/tacotron2-DDC/train_tacotron2-ddc.py @@ -78,7 +78,7 @@ tokenizer, config = TTSTokenizer.init_from_config(config) # You can define your custom sample loader returning the list of samples. # Or define your custom formatter and pass it to the `load_tts_samples`. # Check `TTS.tts.datasets.load_tts_samples` for more details. -train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True) +train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True, eval_split_max_size=config.eval_split_max_size, eval_split_size=config.eval_split_size) # init speaker manager for multi-speaker training # it mainly handles speaker-id to speaker-name for the model and the data-loader diff --git a/recipes/vctk/tacotron2/train_tacotron2.py b/recipes/vctk/tacotron2/train_tacotron2.py index 43f5d4e6..48934e2a 100644 --- a/recipes/vctk/tacotron2/train_tacotron2.py +++ b/recipes/vctk/tacotron2/train_tacotron2.py @@ -78,7 +78,7 @@ tokenizer, config = TTSTokenizer.init_from_config(config) # You can define your custom sample loader returning the list of samples. 
# Or define your custom formatter and pass it to the `load_tts_samples`. # Check `TTS.tts.datasets.load_tts_samples` for more details. -train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True) +train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True, eval_split_max_size=config.eval_split_max_size, eval_split_size=config.eval_split_size) # init speaker manager for multi-speaker training # it mainly handles speaker-id to speaker-name for the model and the data-loader diff --git a/recipes/vctk/vits/train_vits.py b/recipes/vctk/vits/train_vits.py index 84e8a058..443dbbd1 100644 --- a/recipes/vctk/vits/train_vits.py +++ b/recipes/vctk/vits/train_vits.py @@ -79,7 +79,7 @@ tokenizer, config = TTSTokenizer.init_from_config(config) # You can define your custom sample loader returning the list of samples. # Or define your custom formatter and pass it to the `load_tts_samples`. # Check `TTS.tts.datasets.load_tts_samples` for more details. -train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True) +train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True, eval_split_max_size=config.eval_split_max_size, eval_split_size=config.eval_split_size) # init speaker manager for multi-speaker training # it maps speaker-id to speaker-name in the model and data-loader From 72d85e53c98b908345bbff70f7cfba2174e883ce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Tue, 22 Mar 2022 17:55:00 +0100 Subject: [PATCH 20/38] Update model file extension (#1422) * Update model file ext to ```.pth``` * Update docs * Rename more * Find model files --- .gitignore | 1 + README.md | 8 +-- TTS/bin/compute_attention_masks.py | 2 +- TTS/bin/compute_embeddings.py | 2 +- TTS/bin/distribute.py | 55 ------------------- TTS/bin/eval_encoder.py | 2 +- TTS/bin/synthesize.py | 8 +-- TTS/encoder/README.md | 2 +- TTS/encoder/utils/generic_utils.py | 4 +- TTS/encoder/utils/io.py | 4 +- TTS/server/README.md | 2 +- TTS/server/conf.json | 2 +- TTS/utils/audio.py | 2 +- TTS/utils/generic_utils.py | 2 +- TTS/utils/io.py | 8 +-- TTS/utils/manage.py | 31 ++++++++++- TTS/vocoder/README.md | 2 +- docs/source/finetuning.md | 6 +- docs/source/inference.md | 6 +- docs/source/training_a_model.md | 4 +- docs/source/tutorial_for_nervous_beginners.md | 4 +- notebooks/ExtractTTSpectrogram.ipynb | 2 +- notebooks/PlotUmapLibriTTS.ipynb | 2 +- notebooks/TestAttention.ipynb | 2 +- .../dataset_analysis/AnalyzeDataset.ipynb | 2 +- .../test_extract_tts_spectrograms.py | 6 +- tests/aux_tests/test_speaker_manager.py | 2 +- tests/inference_tests/test_synthesizer.py | 2 +- tests/inputs/server_config.json | 2 +- 29 files changed, 74 insertions(+), 103 deletions(-) delete mode 100644 TTS/bin/distribute.py diff --git a/.gitignore b/.gitignore index f8d6e644..2a3cbad4 100644 --- a/.gitignore +++ b/.gitignore @@ -115,6 +115,7 @@ venv.bak/ *.swo # pytorch models +*.pth *.pth.tar result/ diff --git a/README.md b/README.md index 80fa5dea..97a7cc66 100644 --- a/README.md +++ b/README.md @@ -159,13 +159,13 @@ If you are on Windows, 👑@GuyPaddock wrote installation instructions [here](ht - Run your own TTS model (Using Griffin-Lim Vocoder): ``` - $ tts --text "Text for TTS" --model_path path/to/model.pth.tar --config_path path/to/config.json --out_path output/path/speech.wav + $ tts --text "Text for TTS" --model_path path/to/model.pth --config_path path/to/config.json --out_path output/path/speech.wav ``` - Run your own TTS and Vocoder models: ``` - $ tts --text "Text for TTS" 
--model_path path/to/config.json --config_path path/to/model.pth.tar --out_path output/path/speech.wav - --vocoder_path path/to/vocoder.pth.tar --vocoder_config_path path/to/vocoder_config.json + $ tts --text "Text for TTS" --model_path path/to/config.json --config_path path/to/model.pth --out_path output/path/speech.wav + --vocoder_path path/to/vocoder.pth --vocoder_config_path path/to/vocoder_config.json ``` ### Multi-speaker Models @@ -185,7 +185,7 @@ If you are on Windows, 👑@GuyPaddock wrote installation instructions [here](ht - Run your own multi-speaker TTS model: ``` - $ tts --text "Text for TTS" --out_path output/path/speech.wav --model_path path/to/config.json --config_path path/to/model.pth.tar --speakers_file_path path/to/speaker.json --speaker_idx + $ tts --text "Text for TTS" --out_path output/path/speech.wav --model_path path/to/config.json --config_path path/to/model.pth --speakers_file_path path/to/speaker.json --speaker_idx ``` ## Directory Structure diff --git a/TTS/bin/compute_attention_masks.py b/TTS/bin/compute_attention_masks.py index e58259a6..9ab520be 100644 --- a/TTS/bin/compute_attention_masks.py +++ b/TTS/bin/compute_attention_masks.py @@ -25,7 +25,7 @@ These masks can be used for different purposes including training a TTS model wi """ Example run: CUDA_VISIBLE_DEVICE="0" python TTS/bin/compute_attention_masks.py - --model_path /data/rw/home/Models/ljspeech-dcattn-December-14-2020_11+10AM-9d0e8c7/checkpoint_200000.pth.tar + --model_path /data/rw/home/Models/ljspeech-dcattn-December-14-2020_11+10AM-9d0e8c7/checkpoint_200000.pth --config_path /data/rw/home/Models/ljspeech-dcattn-December-14-2020_11+10AM-9d0e8c7/config.json --dataset_metafile metadata.csv --data_path /root/LJSpeech-1.1/ diff --git a/TTS/bin/compute_embeddings.py b/TTS/bin/compute_embeddings.py index 68571fb4..d7a2c5f6 100644 --- a/TTS/bin/compute_embeddings.py +++ b/TTS/bin/compute_embeddings.py @@ -12,7 +12,7 @@ parser = argparse.ArgumentParser( description="""Compute embedding vectors for each wav file in a dataset.\n\n""" """ Example runs: - python TTS/bin/compute_embeddings.py speaker_encoder_model.pth.tar speaker_encoder_config.json dataset_config.json embeddings_output_path/ + python TTS/bin/compute_embeddings.py speaker_encoder_model.pth speaker_encoder_config.json dataset_config.json embeddings_output_path/ """, formatter_class=RawTextHelpFormatter, ) diff --git a/TTS/bin/distribute.py b/TTS/bin/distribute.py deleted file mode 100644 index b5552e32..00000000 --- a/TTS/bin/distribute.py +++ /dev/null @@ -1,55 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- - -import os -import pathlib -import subprocess -import time - -import torch -from trainer import TrainerArgs - - -def main(): - """ - Call train.py as a new process and pass command arguments - """ - parser = TrainerArgs().init_argparse(arg_prefix="") - parser.add_argument("--script", type=str, help="Target training script to distibute.") - args, unargs = parser.parse_known_args() - - num_gpus = torch.cuda.device_count() - group_id = time.strftime("%Y_%m_%d-%H%M%S") - - # set arguments for train.py - folder_path = pathlib.Path(__file__).parent.absolute() - if os.path.exists(os.path.join(folder_path, args.script)): - command = [os.path.join(folder_path, args.script)] - else: - command = [args.script] - command.append("--continue_path={}".format(args.continue_path)) - command.append("--restore_path={}".format(args.restore_path)) - command.append("--config_path={}".format(args.config_path)) - 
command.append("--group_id=group_{}".format(group_id)) - command.append("--use_ddp=true") - command += unargs - command.append("") - - # run a processes per GPU - processes = [] - for i in range(num_gpus): - my_env = os.environ.copy() - my_env["PYTHON_EGG_CACHE"] = "/tmp/tmp{}".format(i) - command[-1] = "--rank={}".format(i) - # prevent stdout for processes with rank != 0 - stdout = None - p = subprocess.Popen(["python3"] + command, stdout=stdout, env=my_env) # pylint: disable=consider-using-with - processes.append(p) - print(command) - - for p in processes: - p.wait() - - -if __name__ == "__main__": - main() diff --git a/TTS/bin/eval_encoder.py b/TTS/bin/eval_encoder.py index de9e5865..089f3645 100644 --- a/TTS/bin/eval_encoder.py +++ b/TTS/bin/eval_encoder.py @@ -56,7 +56,7 @@ if __name__ == "__main__": description="""Compute the accuracy of the encoder.\n\n""" """ Example runs: - python TTS/bin/eval_encoder.py emotion_encoder_model.pth.tar emotion_encoder_config.json dataset_config.json + python TTS/bin/eval_encoder.py emotion_encoder_model.pth emotion_encoder_config.json dataset_config.json """, formatter_class=RawTextHelpFormatter, ) diff --git a/TTS/bin/synthesize.py b/TTS/bin/synthesize.py index 8b3f53db..eb166bc8 100755 --- a/TTS/bin/synthesize.py +++ b/TTS/bin/synthesize.py @@ -60,13 +60,13 @@ If you don't specify any models, then it uses LJSpeech based English model. - Run your own TTS model (Using Griffin-Lim Vocoder): ``` - $ tts --text "Text for TTS" --model_path path/to/model.pth.tar --config_path path/to/config.json --out_path output/path/speech.wav + $ tts --text "Text for TTS" --model_path path/to/model.pth --config_path path/to/config.json --out_path output/path/speech.wav ``` - Run your own TTS and Vocoder models: ``` - $ tts --text "Text for TTS" --model_path path/to/config.json --config_path path/to/model.pth.tar --out_path output/path/speech.wav - --vocoder_path path/to/vocoder.pth.tar --vocoder_config_path path/to/vocoder_config.json + $ tts --text "Text for TTS" --model_path path/to/config.json --config_path path/to/model.pth --out_path output/path/speech.wav + --vocoder_path path/to/vocoder.pth --vocoder_config_path path/to/vocoder_config.json ``` ### Multi-speaker Models @@ -86,7 +86,7 @@ If you don't specify any models, then it uses LJSpeech based English model. - Run your own multi-speaker TTS model: ``` - $ tts --text "Text for TTS" --out_path output/path/speech.wav --model_path path/to/config.json --config_path path/to/model.pth.tar --speakers_file_path path/to/speaker.json --speaker_idx + $ tts --text "Text for TTS" --out_path output/path/speech.wav --model_path path/to/config.json --config_path path/to/model.pth --speakers_file_path path/to/speaker.json --speaker_idx ``` """ # We remove Markdown code formatting programmatically here to allow us to copy-and-paste from main README to keep diff --git a/TTS/encoder/README.md b/TTS/encoder/README.md index b6f541f8..b38b2005 100644 --- a/TTS/encoder/README.md +++ b/TTS/encoder/README.md @@ -14,5 +14,5 @@ To run the code, you need to follow the same flow as in TTS. - Define 'config.json' for your needs. Note that, audio parameters should match your TTS model. - Example training call ```python speaker_encoder/train.py --config_path speaker_encoder/config.json --data_path ~/Data/Libri-TTS/train-clean-360``` -- Generate embedding vectors ```python speaker_encoder/compute_embeddings.py --use_cuda true /model/path/best_model.pth.tar model/config/path/config.json dataset/path/ output_path``` . 
This code parses all .wav files at the given dataset path and generates the same folder structure under the output path with the generated embedding files. +- Generate embedding vectors ```python speaker_encoder/compute_embeddings.py --use_cuda true /model/path/best_model.pth model/config/path/config.json dataset/path/ output_path``` . This code parses all .wav files at the given dataset path and generates the same folder structure under the output path with the generated embedding files. - Watch training on Tensorboard as in TTS diff --git a/TTS/encoder/utils/generic_utils.py b/TTS/encoder/utils/generic_utils.py index 17f1c3d9..19c00582 100644 --- a/TTS/encoder/utils/generic_utils.py +++ b/TTS/encoder/utils/generic_utils.py @@ -147,7 +147,7 @@ def setup_speaker_encoder_model(config: "Coqpit"): def save_checkpoint(model, optimizer, criterion, model_loss, out_path, current_step, epoch): - checkpoint_path = "checkpoint_{}.pth.tar".format(current_step) + checkpoint_path = "checkpoint_{}.pth".format(current_step) checkpoint_path = os.path.join(out_path, checkpoint_path) print(" | | > Checkpoint saving : {}".format(checkpoint_path)) @@ -177,7 +177,7 @@ def save_best_model(model, optimizer, criterion, model_loss, best_loss, out_path "date": datetime.date.today().strftime("%B %d, %Y"), } best_loss = model_loss - bestmodel_path = "best_model.pth.tar" + bestmodel_path = "best_model.pth" bestmodel_path = os.path.join(out_path, bestmodel_path) print("\n > BEST MODEL ({0:.5f}) : {1:}".format(model_loss, bestmodel_path)) save_fsspec(state, bestmodel_path) diff --git a/TTS/encoder/utils/io.py b/TTS/encoder/utils/io.py index 7a3aadc9..d1dad3e2 100644 --- a/TTS/encoder/utils/io.py +++ b/TTS/encoder/utils/io.py @@ -5,7 +5,7 @@ from TTS.utils.io import save_fsspec def save_checkpoint(model, optimizer, model_loss, out_path, current_step): - checkpoint_path = "checkpoint_{}.pth.tar".format(current_step) + checkpoint_path = "checkpoint_{}.pth".format(current_step) checkpoint_path = os.path.join(out_path, checkpoint_path) print(" | | > Checkpoint saving : {}".format(checkpoint_path)) @@ -31,7 +31,7 @@ def save_best_model(model, optimizer, model_loss, best_loss, out_path, current_s "date": datetime.date.today().strftime("%B %d, %Y"), } best_loss = model_loss - bestmodel_path = "best_model.pth.tar" + bestmodel_path = "best_model.pth" bestmodel_path = os.path.join(out_path, bestmodel_path) print("\n > BEST MODEL ({0:.5f}) : {1:}".format(model_loss, bestmodel_path)) save_fsspec(state, bestmodel_path) diff --git a/TTS/server/README.md b/TTS/server/README.md index 89ee21eb..5458e398 100644 --- a/TTS/server/README.md +++ b/TTS/server/README.md @@ -21,4 +21,4 @@ Run the server with the official models on a GPU. ```CUDA_VISIBLE_DEVICES="0" python TTS/server/server.py --model_name tts_models/en/ljspeech/tacotron2-DCA --vocoder_name vocoder_models/en/ljspeech/multiband-melgan --use_cuda True``` Run the server with a custom models. 
-```python TTS/server/server.py --tts_checkpoint /path/to/tts/model.pth.tar --tts_config /path/to/tts/config.json --vocoder_checkpoint /path/to/vocoder/model.pth.tar --vocoder_config /path/to/vocoder/config.json``` +```python TTS/server/server.py --tts_checkpoint /path/to/tts/model.pth --tts_config /path/to/tts/config.json --vocoder_checkpoint /path/to/vocoder/model.pth --vocoder_config /path/to/vocoder/config.json``` diff --git a/TTS/server/conf.json b/TTS/server/conf.json index 32e475cf..49b6c09c 100644 --- a/TTS/server/conf.json +++ b/TTS/server/conf.json @@ -1,6 +1,6 @@ { "tts_path":"/media/erogol/data_ssd/Models/libri_tts/5049/", // tts model root folder - "tts_file":"best_model.pth.tar", // tts checkpoint file + "tts_file":"best_model.pth", // tts checkpoint file "tts_config":"config.json", // tts config.json file "tts_speakers": null, // json file listing speaker ids. null if no speaker embedding. "vocoder_config":null, diff --git a/TTS/utils/audio.py b/TTS/utils/audio.py index d0777c11..3ed0a76a 100644 --- a/TTS/utils/audio.py +++ b/TTS/utils/audio.py @@ -371,7 +371,7 @@ class AudioProcessor(object): self.hop_length = hop_length self.win_length = win_length assert min_level_db != 0.0, " [!] min_level_db is 0" - assert self.win_length <= self.fft_size, " [!] win_length cannot be larger than fft_size" + assert self.win_length <= self.fft_size, f" [!] win_length cannot be larger than fft_size - {self.win_length} vs {self.fft_size}" members = vars(self) if verbose: print(" > Setting up Audio Processor...") diff --git a/TTS/utils/generic_utils.py b/TTS/utils/generic_utils.py index 69609bcb..b685210c 100644 --- a/TTS/utils/generic_utils.py +++ b/TTS/utils/generic_utils.py @@ -67,7 +67,7 @@ def get_experiment_folder_path(root_path, model_name): def remove_experiment_folder(experiment_path): """Check folder if there is a checkpoint, otherwise remove the folder""" fs = fsspec.get_mapper(experiment_path).fs - checkpoint_files = fs.glob(experiment_path + "/*.pth.tar") + checkpoint_files = fs.glob(experiment_path + "/*.pth") if not checkpoint_files: if fs.exists(experiment_path): fs.rm(experiment_path, recursive=True) diff --git a/TTS/utils/io.py b/TTS/utils/io.py index 54818ce9..304df5ed 100644 --- a/TTS/utils/io.py +++ b/TTS/utils/io.py @@ -140,7 +140,7 @@ def save_checkpoint( output_folder, **kwargs, ): - file_name = "checkpoint_{}.pth.tar".format(current_step) + file_name = "checkpoint_{}.pth".format(current_step) checkpoint_path = os.path.join(output_folder, file_name) print("\n > CHECKPOINT : {}".format(checkpoint_path)) save_model( @@ -170,7 +170,7 @@ def save_best_model( **kwargs, ): if current_loss < best_loss: - best_model_name = f"best_model_{current_step}.pth.tar" + best_model_name = f"best_model_{current_step}.pth" checkpoint_path = os.path.join(out_path, best_model_name) print(" > BEST MODEL : {}".format(checkpoint_path)) save_model( @@ -187,12 +187,12 @@ def save_best_model( fs = fsspec.get_mapper(out_path).fs # only delete previous if current is saved successfully if not keep_all_best or (current_step < keep_after): - model_names = fs.glob(os.path.join(out_path, "best_model*.pth.tar")) + model_names = fs.glob(os.path.join(out_path, "best_model*.pth")) for model_name in model_names: if os.path.basename(model_name) != best_model_name: fs.rm(model_name) # create a shortcut which always points to the currently best model - shortcut_name = "best_model.pth.tar" + shortcut_name = "best_model.pth" shortcut_path = os.path.join(out_path, shortcut_name) fs.copy(checkpoint_path, 
shortcut_path) best_loss = current_loss diff --git a/TTS/utils/manage.py b/TTS/utils/manage.py index 01d54ad6..dd397687 100644 --- a/TTS/utils/manage.py +++ b/TTS/utils/manage.py @@ -3,6 +3,7 @@ import json import os import zipfile from pathlib import Path +from typing import Tuple from shutil import copyfile, rmtree import requests @@ -114,7 +115,7 @@ class ModelManager(object): e.g. 'tts_model/en/ljspeech/tacotron' Every model must have the following files: - - *.pth.tar : pytorch model checkpoint file. + - *.pth : pytorch model checkpoint file. - config.json : model config file. - scale_stats.npy (if exist): scale values for preprocessing. @@ -127,7 +128,7 @@ class ModelManager(object): model_item = self.models_dict[model_type][lang][dataset][model] # set the model specific output path output_path = os.path.join(self.output_prefix, model_full_name) - output_model_path = os.path.join(output_path, "model_file.pth.tar") + output_model_path = os.path.join(output_path, "model_file.pth") output_config_path = os.path.join(output_path, "config.json") if os.path.exists(output_path): @@ -139,8 +140,32 @@ class ModelManager(object): self._download_zip_file(model_item["github_rls_url"], output_path) # update paths in the config.json self._update_paths(output_path, output_config_path) + # find downloaded files + output_model_path, output_config_path = self._find_files(output_path) return output_model_path, output_config_path, model_item + def _find_files(self, output_path:str) -> Tuple[str, str]: + """Find the model and config files in the output path + + Args: + output_path (str): path to the model files + + Returns: + Tuple[str, str]: path to the model file and config file + """ + model_file = None + config_file = None + for file_name in os.listdir(output_path): + if file_name in ["model_file.pth", "model_file.pth.tar", "model.pth"]: + model_file = os.path.join(output_path, file_name) + elif file_name == "config.json": + config_file = os.path.join(output_path, file_name) + if model_file is None: + raise ValueError(" [!] Model file not found in the output path") + if config_file is None: + raise ValueError(" [!] Config file not found in the output path") + return model_file, config_file + def _update_paths(self, output_path: str, config_path: str) -> None: """Update paths for certain files in config.json after download. @@ -152,7 +177,7 @@ class ModelManager(object): output_d_vector_file_path = os.path.join(output_path, "speakers.json") output_speaker_ids_file_path = os.path.join(output_path, "speaker_ids.json") speaker_encoder_config_path = os.path.join(output_path, "config_se.json") - speaker_encoder_model_path = os.path.join(output_path, "model_se.pth.tar") + speaker_encoder_model_path = os.path.join(output_path, "model_se.pth") # update the scale_path.npy file path in the model config.json self._update_path("audio.stats_path", output_stats_path, config_path) diff --git a/TTS/vocoder/README.md b/TTS/vocoder/README.md index e0ae8f21..b9fb17c8 100644 --- a/TTS/vocoder/README.md +++ b/TTS/vocoder/README.md @@ -29,7 +29,7 @@ You can continue a previous training run by the following command. You can fine-tune a pre-trained model by the following command. -```CUDA_VISIBLE_DEVICES='0' python tts/bin/train_vocoder.py --restore_path path/to/your/model.pth.tar``` +```CUDA_VISIBLE_DEVICES='0' python tts/bin/train_vocoder.py --restore_path path/to/your/model.pth``` Restoring a model starts a new training in a different folder. It only restores model weights with the given checkpoint file. 
However, continuing a training starts from the same directory where the previous training run left off. diff --git a/docs/source/finetuning.md b/docs/source/finetuning.md index 7d7ef1cb..fd97daa5 100644 --- a/docs/source/finetuning.md +++ b/docs/source/finetuning.md @@ -93,13 +93,13 @@ them and fine-tune it for your own dataset. This will help you in two main ways: ```bash CUDA_VISIBLE_DEVICES="0" python recipes/ljspeech/glow_tts/train_glowtts.py \ - --restore_path /home/ubuntu/.local/share/tts/tts_models--en--ljspeech--glow-tts/model_file.pth.tar + --restore_path /home/ubuntu/.local/share/tts/tts_models--en--ljspeech--glow-tts/model_file.pth ``` ```bash CUDA_VISIBLE_DEVICES="0" python TTS/bin/train_tts.py \ --config_path /home/ubuntu/.local/share/tts/tts_models--en--ljspeech--glow-tts/config.json \ - --restore_path /home/ubuntu/.local/share/tts/tts_models--en--ljspeech--glow-tts/model_file.pth.tar + --restore_path /home/ubuntu/.local/share/tts/tts_models--en--ljspeech--glow-tts/model_file.pth ``` As stated above, you can also use command-line arguments to change the model configuration. @@ -107,7 +107,7 @@ them and fine-tune it for your own dataset. This will help you in two main ways: ```bash CUDA_VISIBLE_DEVICES="0" python recipes/ljspeech/glow_tts/train_glowtts.py \ - --restore_path /home/ubuntu/.local/share/tts/tts_models--en--ljspeech--glow-tts/model_file.pth.tar + --restore_path /home/ubuntu/.local/share/tts/tts_models--en--ljspeech--glow-tts/model_file.pth --coqpit.run_name "glow-tts-finetune" \ --coqpit.lr 0.00001 ``` diff --git a/docs/source/inference.md b/docs/source/inference.md index 544473bf..1057d04d 100644 --- a/docs/source/inference.md +++ b/docs/source/inference.md @@ -44,7 +44,7 @@ Run your own TTS model (Using Griffin-Lim Vocoder) ```bash tts --text "Text for TTS" \ - --model_path path/to/model.pth.tar \ + --model_path path/to/model.pth \ --config_path path/to/config.json \ --out_path folder/to/save/output.wav ``` @@ -54,9 +54,9 @@ Run your own TTS and Vocoder models ```bash tts --text "Text for TTS" \ --config_path path/to/config.json \ - --model_path path/to/model.pth.tar \ + --model_path path/to/model.pth \ --out_path folder/to/save/output.wav \ - --vocoder_path path/to/vocoder.pth.tar \ + --vocoder_path path/to/vocoder.pth \ --vocoder_config_path path/to/vocoder_config.json ``` diff --git a/docs/source/training_a_model.md b/docs/source/training_a_model.md index a28710d0..22090f6e 100644 --- a/docs/source/training_a_model.md +++ b/docs/source/training_a_model.md @@ -33,7 +33,7 @@ If you like to run a multi-gpu training using DDP back-end, ```bash - $ CUDA_VISIBLE_DEVICES="0, 1, 2" python TTS/bin/distribute.py --script /train_glowtts.py + $ CUDA_VISIBLE_DEVICES="0, 1, 2" python -m trainer.distribute --script /train_glowtts.py ``` The example above runs a multi-gpu training using GPUs `0, 1, 2`. @@ -122,7 +122,7 @@ ```bash $ tts --text "Text for TTS" \ - --model_path path/to/checkpoint_x.pth.tar \ + --model_path path/to/checkpoint_x.pth \ --config_path path/to/config.json \ --out_path folder/to/save/output.wav ``` diff --git a/docs/source/tutorial_for_nervous_beginners.md b/docs/source/tutorial_for_nervous_beginners.md index fa09cb7d..d2d3c4bb 100644 --- a/docs/source/tutorial_for_nervous_beginners.md +++ b/docs/source/tutorial_for_nervous_beginners.md @@ -50,13 +50,13 @@ A breakdown of a simple script that trains a GlowTTS model on the LJspeech datas - Fine-tune a model. 
```bash - CUDA_VISIBLE_DEVICES=0 python train.py --restore_path path/to/model/checkpoint.pth.tar + CUDA_VISIBLE_DEVICES=0 python train.py --restore_path path/to/model/checkpoint.pth ``` - Run multi-gpu training. ```bash - CUDA_VISIBLE_DEVICES=0,1,2 python TTS/bin/distribute.py --script train.py + CUDA_VISIBLE_DEVICES=0,1,2 python -m trainer.distribute --script train.py ``` ### CLI Way diff --git a/notebooks/ExtractTTSpectrogram.ipynb b/notebooks/ExtractTTSpectrogram.ipynb index 50b60ff0..a257b6bf 100644 --- a/notebooks/ExtractTTSpectrogram.ipynb +++ b/notebooks/ExtractTTSpectrogram.ipynb @@ -66,7 +66,7 @@ "DATASET = \"ljspeech\"\n", "METADATA_FILE = \"metadata.csv\"\n", "CONFIG_PATH = \"/home/ubuntu/.local/share/tts/tts_models--en--ljspeech--tacotron2-DDC_ph/config.json\"\n", - "MODEL_FILE = \"/home/ubuntu/.local/share/tts/tts_models--en--ljspeech--tacotron2-DDC_ph/model_file.pth.tar\"\n", + "MODEL_FILE = \"/home/ubuntu/.local/share/tts/tts_models--en--ljspeech--tacotron2-DDC_ph/model_file.pth\"\n", "BATCH_SIZE = 32\n", "\n", "QUANTIZED_WAV = False\n", diff --git a/notebooks/PlotUmapLibriTTS.ipynb b/notebooks/PlotUmapLibriTTS.ipynb index c809a5c4..1e29790b 100644 --- a/notebooks/PlotUmapLibriTTS.ipynb +++ b/notebooks/PlotUmapLibriTTS.ipynb @@ -66,7 +66,7 @@ "outputs": [], "source": [ "MODEL_RUN_PATH = \"/media/erogol/data_ssd/Models/libri_tts/speaker_encoder/libritts_360-half-October-31-2019_04+54PM-19d2f5f/\"\n", - "MODEL_PATH = MODEL_RUN_PATH + \"best_model.pth.tar\"\n", + "MODEL_PATH = MODEL_RUN_PATH + \"best_model.pth\"\n", "CONFIG_PATH = MODEL_RUN_PATH + \"config.json\"\n", "\n", "# My single speaker locations\n", diff --git a/notebooks/TestAttention.ipynb b/notebooks/TestAttention.ipynb index 5d8eed85..b257ff70 100644 --- a/notebooks/TestAttention.ipynb +++ b/notebooks/TestAttention.ipynb @@ -73,7 +73,7 @@ "\n", "# Set constants\n", "ROOT_PATH = '/home/erogol/Models/LJSpeech/ljspeech-May-20-2020_12+29PM-1835628/'\n", - "MODEL_PATH = ROOT_PATH + '/best_model.pth.tar'\n", + "MODEL_PATH = ROOT_PATH + '/best_model.pth'\n", "CONFIG_PATH = ROOT_PATH + '/config.json'\n", "OUT_FOLDER = './hard_sentences/'\n", "CONFIG = load_config(CONFIG_PATH)\n", diff --git a/notebooks/dataset_analysis/AnalyzeDataset.ipynb b/notebooks/dataset_analysis/AnalyzeDataset.ipynb index e08f3ab3..51963847 100644 --- a/notebooks/dataset_analysis/AnalyzeDataset.ipynb +++ b/notebooks/dataset_analysis/AnalyzeDataset.ipynb @@ -416,7 +416,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.1" + "version": "3.9.5" } }, "nbformat": 4, diff --git a/tests/aux_tests/test_extract_tts_spectrograms.py b/tests/aux_tests/test_extract_tts_spectrograms.py index 8c795d58..ef751846 100644 --- a/tests/aux_tests/test_extract_tts_spectrograms.py +++ b/tests/aux_tests/test_extract_tts_spectrograms.py @@ -15,7 +15,7 @@ class TestExtractTTSSpectrograms(unittest.TestCase): def test_GlowTTS(): # set paths config_path = os.path.join(get_tests_input_path(), "test_glow_tts.json") - checkpoint_path = os.path.join(get_tests_output_path(), "checkpoint_test.pth.tar") + checkpoint_path = os.path.join(get_tests_output_path(), "checkpoint_test.pth") output_path = os.path.join(get_tests_output_path(), "output_extract_tts_spectrograms/") # load config c = load_config(config_path) @@ -33,7 +33,7 @@ class TestExtractTTSSpectrograms(unittest.TestCase): def test_Tacotron2(): # set paths config_path = os.path.join(get_tests_input_path(), "test_tacotron2_config.json") - checkpoint_path = 
os.path.join(get_tests_output_path(), "checkpoint_test.pth.tar") + checkpoint_path = os.path.join(get_tests_output_path(), "checkpoint_test.pth") output_path = os.path.join(get_tests_output_path(), "output_extract_tts_spectrograms/") # load config c = load_config(config_path) @@ -51,7 +51,7 @@ class TestExtractTTSSpectrograms(unittest.TestCase): def test_Tacotron(): # set paths config_path = os.path.join(get_tests_input_path(), "test_tacotron_config.json") - checkpoint_path = os.path.join(get_tests_output_path(), "checkpoint_test.pth.tar") + checkpoint_path = os.path.join(get_tests_output_path(), "checkpoint_test.pth") output_path = os.path.join(get_tests_output_path(), "output_extract_tts_spectrograms/") # load config c = load_config(config_path) diff --git a/tests/aux_tests/test_speaker_manager.py b/tests/aux_tests/test_speaker_manager.py index 5fafb56a..57ff6c50 100644 --- a/tests/aux_tests/test_speaker_manager.py +++ b/tests/aux_tests/test_speaker_manager.py @@ -12,7 +12,7 @@ from TTS.tts.utils.speakers import SpeakerManager from TTS.utils.audio import AudioProcessor encoder_config_path = os.path.join(get_tests_input_path(), "test_speaker_encoder_config.json") -encoder_model_path = os.path.join(get_tests_input_path(), "checkpoint_0.pth.tar") +encoder_model_path = os.path.join(get_tests_input_path(), "checkpoint_0.pth") sample_wav_path = os.path.join(get_tests_input_path(), "../data/ljspeech/wavs/LJ001-0001.wav") sample_wav_path2 = os.path.join(get_tests_input_path(), "../data/ljspeech/wavs/LJ001-0002.wav") d_vectors_file_path = os.path.join(get_tests_input_path(), "../data/dummy_speakers.json") diff --git a/tests/inference_tests/test_synthesizer.py b/tests/inference_tests/test_synthesizer.py index d643cb81..b5350b0f 100644 --- a/tests/inference_tests/test_synthesizer.py +++ b/tests/inference_tests/test_synthesizer.py @@ -20,7 +20,7 @@ class SynthesizerTest(unittest.TestCase): def test_in_out(self): self._create_random_model() tts_root_path = get_tests_output_path() - tts_checkpoint = os.path.join(tts_root_path, "checkpoint_10.pth.tar") + tts_checkpoint = os.path.join(tts_root_path, "checkpoint_10.pth") tts_config = os.path.join(tts_root_path, "dummy_model_config.json") synthesizer = Synthesizer(tts_checkpoint, tts_config, None, None) synthesizer.tts("Better this test works!!") diff --git a/tests/inputs/server_config.json b/tests/inputs/server_config.json index 0cb9b948..f0a92283 100644 --- a/tests/inputs/server_config.json +++ b/tests/inputs/server_config.json @@ -1,5 +1,5 @@ { - "tts_checkpoint":"checkpoint_10.pth.tar", // tts checkpoint file + "tts_checkpoint":"checkpoint_10.pth", // tts checkpoint file "tts_config":"dummy_model_config.json", // tts config.json file "tts_speakers": null, // json file listing speaker ids. null if no speaker embedding. "wavernn_lib_path": null, // Rootpath to wavernn project folder to be imported. If this is null, model uses GL for speech synthesis. 
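
The patch above retires the legacy `.pth.tar` checkpoint suffix in favour of plain `.pth` across the code, docs, notebooks, server config, and tests. As a rough, hypothetical illustration only (not part of any patch in this series), run folders produced before this change could be migrated with a small rename pass; the folder path below is a placeholder:

```python
# Hypothetical migration helper, not part of the patch series: rename legacy checkpoints in place.
import glob
import os

run_dir = "path/to/old_run_folder"  # placeholder
for old_path in glob.glob(os.path.join(run_dir, "**", "*.pth.tar"), recursive=True):
    new_path = old_path[: -len(".tar")]  # e.g. checkpoint_1000.pth.tar -> checkpoint_1000.pth
    if not os.path.exists(new_path):
        os.rename(old_path, new_path)
        print(f"renamed {os.path.basename(old_path)} -> {os.path.basename(new_path)}")
```

Note that the `_find_files` helper added above still accepts both `model_file.pth` and `model_file.pth.tar`, so model folders downloaded by `ModelManager` do not strictly need renaming.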
From 1c3623af337a61467d3a139a500db247cc8dc755 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Wed, 23 Mar 2022 12:57:14 +0100 Subject: [PATCH 21/38] Fix model manager (#1436) * Fix manager * Make style --- TTS/tts/utils/text/phonemizers/__init__.py | 1 + TTS/utils/audio.py | 4 ++- TTS/utils/manage.py | 32 +++++++++++++------ recipes/ljspeech/align_tts/train_aligntts.py | 7 +++- .../ljspeech/fast_pitch/train_fast_pitch.py | 7 +++- .../ljspeech/fast_speech/train_fast_speech.py | 7 +++- recipes/ljspeech/glow_tts/train_glowtts.py | 7 +++- .../speedy_speech/train_speedy_speech.py | 7 +++- .../tacotron2-DCA/train_tacotron_dca.py | 7 +++- .../tacotron2-DDC/train_tacotron_ddc.py | 7 +++- recipes/ljspeech/vits_tts/train_vits.py | 7 +++- .../multilingual/vits_tts/train_vits_tts.py | 7 +++- recipes/vctk/fast_pitch/train_fast_pitch.py | 7 +++- recipes/vctk/fast_speech/train_fast_speech.py | 7 +++- recipes/vctk/glow_tts/train_glow_tts.py | 7 +++- .../vctk/speedy_speech/train_speedy_speech.py | 7 +++- .../vctk/tacotron-DDC/train_tacotron-DDC.py | 7 +++- .../vctk/tacotron2-DDC/train_tacotron2-ddc.py | 7 +++- recipes/vctk/tacotron2/train_tacotron2.py | 7 +++- recipes/vctk/vits/train_vits.py | 7 +++- 20 files changed, 129 insertions(+), 27 deletions(-) diff --git a/TTS/tts/utils/text/phonemizers/__init__.py b/TTS/tts/utils/text/phonemizers/__init__.py index 90a526a7..374d0c8a 100644 --- a/TTS/tts/utils/text/phonemizers/__init__.py +++ b/TTS/tts/utils/text/phonemizers/__init__.py @@ -27,6 +27,7 @@ DEF_LANG_TO_PHONEMIZER["en"] = DEF_LANG_TO_PHONEMIZER["en-us"] DEF_LANG_TO_PHONEMIZER["ja-jp"] = JA_JP_Phonemizer.name() DEF_LANG_TO_PHONEMIZER["zh-cn"] = ZH_CN_Phonemizer.name() + def get_phonemizer_by_name(name: str, **kwargs) -> BasePhonemizer: """Initiate a phonemizer by name diff --git a/TTS/utils/audio.py b/TTS/utils/audio.py index 3ed0a76a..4d435162 100644 --- a/TTS/utils/audio.py +++ b/TTS/utils/audio.py @@ -371,7 +371,9 @@ class AudioProcessor(object): self.hop_length = hop_length self.win_length = win_length assert min_level_db != 0.0, " [!] min_level_db is 0" - assert self.win_length <= self.fft_size, f" [!] win_length cannot be larger than fft_size - {self.win_length} vs {self.fft_size}" + assert ( + self.win_length <= self.fft_size + ), f" [!] 
win_length cannot be larger than fft_size - {self.win_length} vs {self.fft_size}" members = vars(self) if verbose: print(" > Setting up Audio Processor...") diff --git a/TTS/utils/manage.py b/TTS/utils/manage.py index dd397687..674d5a47 100644 --- a/TTS/utils/manage.py +++ b/TTS/utils/manage.py @@ -3,8 +3,8 @@ import json import os import zipfile from pathlib import Path -from typing import Tuple from shutil import copyfile, rmtree +from typing import Tuple import requests @@ -128,9 +128,6 @@ class ModelManager(object): model_item = self.models_dict[model_type][lang][dataset][model] # set the model specific output path output_path = os.path.join(self.output_prefix, model_full_name) - output_model_path = os.path.join(output_path, "model_file.pth") - output_config_path = os.path.join(output_path, "config.json") - if os.path.exists(output_path): print(f" > {model_name} is already downloaded.") else: @@ -138,13 +135,14 @@ class ModelManager(object): print(f" > Downloading model to {output_path}") # download from github release self._download_zip_file(model_item["github_rls_url"], output_path) - # update paths in the config.json - self._update_paths(output_path, output_config_path) # find downloaded files output_model_path, output_config_path = self._find_files(output_path) + # update paths in the config.json + self._update_paths(output_path, output_config_path) return output_model_path, output_config_path, model_item - def _find_files(self, output_path:str) -> Tuple[str, str]: + @staticmethod + def _find_files(output_path: str) -> Tuple[str, str]: """Find the model and config files in the output path Args: @@ -166,6 +164,22 @@ class ModelManager(object): raise ValueError(" [!] Config file not found in the output path") return model_file, config_file + @staticmethod + def _find_speaker_encoder(output_path: str) -> str: + """Find the speaker encoder file in the output path + + Args: + output_path (str): path to the model files + + Returns: + str: path to the speaker encoder file + """ + speaker_encoder_file = None + for file_name in os.listdir(output_path): + if file_name in ["model_se.pth", "model_se.pth.tar"]: + speaker_encoder_file = os.path.join(output_path, file_name) + return speaker_encoder_file + def _update_paths(self, output_path: str, config_path: str) -> None: """Update paths for certain files in config.json after download. 
@@ -177,7 +191,7 @@ class ModelManager(object): output_d_vector_file_path = os.path.join(output_path, "speakers.json") output_speaker_ids_file_path = os.path.join(output_path, "speaker_ids.json") speaker_encoder_config_path = os.path.join(output_path, "config_se.json") - speaker_encoder_model_path = os.path.join(output_path, "model_se.pth") + speaker_encoder_model_path = self._find_speaker_encoder(output_path) # update the scale_path.npy file path in the model config.json self._update_path("audio.stats_path", output_stats_path, config_path) @@ -199,7 +213,7 @@ class ModelManager(object): @staticmethod def _update_path(field_name, new_path, config_path): """Update the path in the model config.json for the current environment after download""" - if os.path.exists(new_path): + if new_path and os.path.exists(new_path): config = load_config(config_path) field_names = field_name.split(".") if len(field_names) > 1: diff --git a/recipes/ljspeech/align_tts/train_aligntts.py b/recipes/ljspeech/align_tts/train_aligntts.py index d27d0fa1..591b1509 100644 --- a/recipes/ljspeech/align_tts/train_aligntts.py +++ b/recipes/ljspeech/align_tts/train_aligntts.py @@ -49,7 +49,12 @@ tokenizer, config = TTSTokenizer.init_from_config(config) # You can define your custom sample loader returning the list of samples. # Or define your custom formatter and pass it to the `load_tts_samples`. # Check `TTS.tts.datasets.load_tts_samples` for more details. -train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True, eval_split_max_size=config.eval_split_max_size, eval_split_size=config.eval_split_size) +train_samples, eval_samples = load_tts_samples( + dataset_config, + eval_split=True, + eval_split_max_size=config.eval_split_max_size, + eval_split_size=config.eval_split_size, +) # init model model = AlignTTS(config, ap, tokenizer) diff --git a/recipes/ljspeech/fast_pitch/train_fast_pitch.py b/recipes/ljspeech/fast_pitch/train_fast_pitch.py index 1f10ef07..a84658f3 100644 --- a/recipes/ljspeech/fast_pitch/train_fast_pitch.py +++ b/recipes/ljspeech/fast_pitch/train_fast_pitch.py @@ -84,7 +84,12 @@ tokenizer, config = TTSTokenizer.init_from_config(config) # You can define your custom sample loader returning the list of samples. # Or define your custom formatter and pass it to the `load_tts_samples`. # Check `TTS.tts.datasets.load_tts_samples` for more details. -train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True, eval_split_max_size=config.eval_split_max_size, eval_split_size=config.eval_split_size) +train_samples, eval_samples = load_tts_samples( + dataset_config, + eval_split=True, + eval_split_max_size=config.eval_split_max_size, + eval_split_size=config.eval_split_size, +) # init the model model = ForwardTTS(config, ap, tokenizer, speaker_manager=None) diff --git a/recipes/ljspeech/fast_speech/train_fast_speech.py b/recipes/ljspeech/fast_speech/train_fast_speech.py index e5a601a7..0245dd93 100644 --- a/recipes/ljspeech/fast_speech/train_fast_speech.py +++ b/recipes/ljspeech/fast_speech/train_fast_speech.py @@ -83,7 +83,12 @@ tokenizer, config = TTSTokenizer.init_from_config(config) # You can define your custom sample loader returning the list of samples. # Or define your custom formatter and pass it to the `load_tts_samples`. # Check `TTS.tts.datasets.load_tts_samples` for more details. 
-train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True, eval_split_max_size=config.eval_split_max_size, eval_split_size=config.eval_split_size) +train_samples, eval_samples = load_tts_samples( + dataset_config, + eval_split=True, + eval_split_max_size=config.eval_split_max_size, + eval_split_size=config.eval_split_size, +) # init the model model = ForwardTTS(config, ap, tokenizer) diff --git a/recipes/ljspeech/glow_tts/train_glowtts.py b/recipes/ljspeech/glow_tts/train_glowtts.py index 47d03fe3..a0b4ac48 100644 --- a/recipes/ljspeech/glow_tts/train_glowtts.py +++ b/recipes/ljspeech/glow_tts/train_glowtts.py @@ -60,7 +60,12 @@ tokenizer, config = TTSTokenizer.init_from_config(config) # You can define your custom sample loader returning the list of samples. # Or define your custom formatter and pass it to the `load_tts_samples`. # Check `TTS.tts.datasets.load_tts_samples` for more details. -train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True, eval_split_max_size=config.eval_split_max_size, eval_split_size=config.eval_split_size) +train_samples, eval_samples = load_tts_samples( + dataset_config, + eval_split=True, + eval_split_max_size=config.eval_split_max_size, + eval_split_size=config.eval_split_size, +) # INITIALIZE THE MODEL # Models take a config object and a speaker manager as input diff --git a/recipes/ljspeech/speedy_speech/train_speedy_speech.py b/recipes/ljspeech/speedy_speech/train_speedy_speech.py index a19e9053..1ab3db1c 100644 --- a/recipes/ljspeech/speedy_speech/train_speedy_speech.py +++ b/recipes/ljspeech/speedy_speech/train_speedy_speech.py @@ -67,7 +67,12 @@ tokenizer, config = TTSTokenizer.init_from_config(config) # You can define your custom sample loader returning the list of samples. # Or define your custom formatter and pass it to the `load_tts_samples`. # Check `TTS.tts.datasets.load_tts_samples` for more details. -train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True, eval_split_max_size=config.eval_split_max_size, eval_split_size=config.eval_split_size) +train_samples, eval_samples = load_tts_samples( + dataset_config, + eval_split=True, + eval_split_max_size=config.eval_split_max_size, + eval_split_size=config.eval_split_size, +) # init model model = ForwardTTS(config, ap, tokenizer) diff --git a/recipes/ljspeech/tacotron2-DCA/train_tacotron_dca.py b/recipes/ljspeech/tacotron2-DCA/train_tacotron_dca.py index 19a9f315..a9f253ea 100644 --- a/recipes/ljspeech/tacotron2-DCA/train_tacotron_dca.py +++ b/recipes/ljspeech/tacotron2-DCA/train_tacotron_dca.py @@ -77,7 +77,12 @@ tokenizer, config = TTSTokenizer.init_from_config(config) # You can define your custom sample loader returning the list of samples. # Or define your custom formatter and pass it to the `load_tts_samples`. # Check `TTS.tts.datasets.load_tts_samples` for more details. 
-train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True, eval_split_max_size=config.eval_split_max_size, eval_split_size=config.eval_split_size) +train_samples, eval_samples = load_tts_samples( + dataset_config, + eval_split=True, + eval_split_max_size=config.eval_split_max_size, + eval_split_size=config.eval_split_size, +) # INITIALIZE THE MODEL # Models take a config object and a speaker manager as input diff --git a/recipes/ljspeech/tacotron2-DDC/train_tacotron_ddc.py b/recipes/ljspeech/tacotron2-DDC/train_tacotron_ddc.py index 029698d8..99089db8 100644 --- a/recipes/ljspeech/tacotron2-DDC/train_tacotron_ddc.py +++ b/recipes/ljspeech/tacotron2-DDC/train_tacotron_ddc.py @@ -74,7 +74,12 @@ tokenizer, config = TTSTokenizer.init_from_config(config) # You can define your custom sample loader returning the list of samples. # Or define your custom formatter and pass it to the `load_tts_samples`. # Check `TTS.tts.datasets.load_tts_samples` for more details. -train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True, eval_split_max_size=config.eval_split_max_size, eval_split_size=config.eval_split_size) +train_samples, eval_samples = load_tts_samples( + dataset_config, + eval_split=True, + eval_split_max_size=config.eval_split_max_size, + eval_split_size=config.eval_split_size, +) # INITIALIZE THE MODEL # Models take a config object and a speaker manager as input diff --git a/recipes/ljspeech/vits_tts/train_vits.py b/recipes/ljspeech/vits_tts/train_vits.py index e38dc200..c070b3f1 100644 --- a/recipes/ljspeech/vits_tts/train_vits.py +++ b/recipes/ljspeech/vits_tts/train_vits.py @@ -69,7 +69,12 @@ tokenizer, config = TTSTokenizer.init_from_config(config) # You can define your custom sample loader returning the list of samples. # Or define your custom formatter and pass it to the `load_tts_samples`. # Check `TTS.tts.datasets.load_tts_samples` for more details. 
-train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True, eval_split_max_size=config.eval_split_max_size, eval_split_size=config.eval_split_size) +train_samples, eval_samples = load_tts_samples( + dataset_config, + eval_split=True, + eval_split_max_size=config.eval_split_max_size, + eval_split_size=config.eval_split_size, +) # init model model = Vits(config, ap, tokenizer, speaker_manager=None) diff --git a/recipes/multilingual/vits_tts/train_vits_tts.py b/recipes/multilingual/vits_tts/train_vits_tts.py index 9e0cb4c8..94692f00 100644 --- a/recipes/multilingual/vits_tts/train_vits_tts.py +++ b/recipes/multilingual/vits_tts/train_vits_tts.py @@ -109,7 +109,12 @@ config.from_dict(config.to_dict()) ap = AudioProcessor(**config.audio.to_dict()) # load training samples -train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True, eval_split_max_size=config.eval_split_max_size, eval_split_size=config.eval_split_size) +train_samples, eval_samples = load_tts_samples( + dataset_config, + eval_split=True, + eval_split_max_size=config.eval_split_max_size, + eval_split_size=config.eval_split_size, +) # init speaker manager for multi-speaker training # it maps speaker-id to speaker-name in the model and data-loader diff --git a/recipes/vctk/fast_pitch/train_fast_pitch.py b/recipes/vctk/fast_pitch/train_fast_pitch.py index d066a539..05cdc72a 100644 --- a/recipes/vctk/fast_pitch/train_fast_pitch.py +++ b/recipes/vctk/fast_pitch/train_fast_pitch.py @@ -71,7 +71,12 @@ tokenizer, config = TTSTokenizer.init_from_config(config) # You can define your custom sample loader returning the list of samples. # Or define your custom formatter and pass it to the `load_tts_samples`. # Check `TTS.tts.datasets.load_tts_samples` for more details. -train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True, eval_split_max_size=config.eval_split_max_size, eval_split_size=config.eval_split_size) +train_samples, eval_samples = load_tts_samples( + dataset_config, + eval_split=True, + eval_split_max_size=config.eval_split_max_size, + eval_split_size=config.eval_split_size, +) # init speaker manager for multi-speaker training # it maps speaker-id to speaker-name in the model and data-loader diff --git a/recipes/vctk/fast_speech/train_fast_speech.py b/recipes/vctk/fast_speech/train_fast_speech.py index dbe23351..a294272a 100644 --- a/recipes/vctk/fast_speech/train_fast_speech.py +++ b/recipes/vctk/fast_speech/train_fast_speech.py @@ -69,7 +69,12 @@ tokenizer, config = TTSTokenizer.init_from_config(config) # You can define your custom sample loader returning the list of samples. # Or define your custom formatter and pass it to the `load_tts_samples`. # Check `TTS.tts.datasets.load_tts_samples` for more details. 
-train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True, eval_split_max_size=config.eval_split_max_size, eval_split_size=config.eval_split_size) +train_samples, eval_samples = load_tts_samples( + dataset_config, + eval_split=True, + eval_split_max_size=config.eval_split_max_size, + eval_split_size=config.eval_split_size, +) # init speaker manager for multi-speaker training # it maps speaker-id to speaker-name in the model and data-loader diff --git a/recipes/vctk/glow_tts/train_glow_tts.py b/recipes/vctk/glow_tts/train_glow_tts.py index 8a891e5d..0bf686b1 100644 --- a/recipes/vctk/glow_tts/train_glow_tts.py +++ b/recipes/vctk/glow_tts/train_glow_tts.py @@ -69,7 +69,12 @@ tokenizer, config = TTSTokenizer.init_from_config(config) # You can define your custom sample loader returning the list of samples. # Or define your custom formatter and pass it to the `load_tts_samples`. # Check `TTS.tts.datasets.load_tts_samples` for more details. -train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True, eval_split_max_size=config.eval_split_max_size, eval_split_size=config.eval_split_size) +train_samples, eval_samples = load_tts_samples( + dataset_config, + eval_split=True, + eval_split_max_size=config.eval_split_max_size, + eval_split_size=config.eval_split_size, +) # init speaker manager for multi-speaker training # it maps speaker-id to speaker-name in the model and data-loader diff --git a/recipes/vctk/speedy_speech/train_speedy_speech.py b/recipes/vctk/speedy_speech/train_speedy_speech.py index d9353af2..4208a9b6 100644 --- a/recipes/vctk/speedy_speech/train_speedy_speech.py +++ b/recipes/vctk/speedy_speech/train_speedy_speech.py @@ -69,7 +69,12 @@ tokenizer, config = TTSTokenizer.init_from_config(config) # You can define your custom sample loader returning the list of samples. # Or define your custom formatter and pass it to the `load_tts_samples`. # Check `TTS.tts.datasets.load_tts_samples` for more details. -train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True, eval_split_max_size=config.eval_split_max_size, eval_split_size=config.eval_split_size) +train_samples, eval_samples = load_tts_samples( + dataset_config, + eval_split=True, + eval_split_max_size=config.eval_split_max_size, + eval_split_size=config.eval_split_size, +) # init speaker manager for multi-speaker training # it maps speaker-id to speaker-name in the model and data-loader diff --git a/recipes/vctk/tacotron-DDC/train_tacotron-DDC.py b/recipes/vctk/tacotron-DDC/train_tacotron-DDC.py index 14007239..d67038a4 100644 --- a/recipes/vctk/tacotron-DDC/train_tacotron-DDC.py +++ b/recipes/vctk/tacotron-DDC/train_tacotron-DDC.py @@ -72,7 +72,12 @@ tokenizer, config = TTSTokenizer.init_from_config(config) # You can define your custom sample loader returning the list of samples. # Or define your custom formatter and pass it to the `load_tts_samples`. # Check `TTS.tts.datasets.load_tts_samples` for more details. 
-train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True, eval_split_max_size=config.eval_split_max_size, eval_split_size=config.eval_split_size) +train_samples, eval_samples = load_tts_samples( + dataset_config, + eval_split=True, + eval_split_max_size=config.eval_split_max_size, + eval_split_size=config.eval_split_size, +) # init speaker manager for multi-speaker training # it mainly handles speaker-id to speaker-name for the model and the data-loader diff --git a/recipes/vctk/tacotron2-DDC/train_tacotron2-ddc.py b/recipes/vctk/tacotron2-DDC/train_tacotron2-ddc.py index ab2e1bc9..b860df85 100644 --- a/recipes/vctk/tacotron2-DDC/train_tacotron2-ddc.py +++ b/recipes/vctk/tacotron2-DDC/train_tacotron2-ddc.py @@ -78,7 +78,12 @@ tokenizer, config = TTSTokenizer.init_from_config(config) # You can define your custom sample loader returning the list of samples. # Or define your custom formatter and pass it to the `load_tts_samples`. # Check `TTS.tts.datasets.load_tts_samples` for more details. -train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True, eval_split_max_size=config.eval_split_max_size, eval_split_size=config.eval_split_size) +train_samples, eval_samples = load_tts_samples( + dataset_config, + eval_split=True, + eval_split_max_size=config.eval_split_max_size, + eval_split_size=config.eval_split_size, +) # init speaker manager for multi-speaker training # it mainly handles speaker-id to speaker-name for the model and the data-loader diff --git a/recipes/vctk/tacotron2/train_tacotron2.py b/recipes/vctk/tacotron2/train_tacotron2.py index 48934e2a..d27dd78c 100644 --- a/recipes/vctk/tacotron2/train_tacotron2.py +++ b/recipes/vctk/tacotron2/train_tacotron2.py @@ -78,7 +78,12 @@ tokenizer, config = TTSTokenizer.init_from_config(config) # You can define your custom sample loader returning the list of samples. # Or define your custom formatter and pass it to the `load_tts_samples`. # Check `TTS.tts.datasets.load_tts_samples` for more details. -train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True, eval_split_max_size=config.eval_split_max_size, eval_split_size=config.eval_split_size) +train_samples, eval_samples = load_tts_samples( + dataset_config, + eval_split=True, + eval_split_max_size=config.eval_split_max_size, + eval_split_size=config.eval_split_size, +) # init speaker manager for multi-speaker training # it mainly handles speaker-id to speaker-name for the model and the data-loader diff --git a/recipes/vctk/vits/train_vits.py b/recipes/vctk/vits/train_vits.py index 443dbbd1..61d60ca1 100644 --- a/recipes/vctk/vits/train_vits.py +++ b/recipes/vctk/vits/train_vits.py @@ -79,7 +79,12 @@ tokenizer, config = TTSTokenizer.init_from_config(config) # You can define your custom sample loader returning the list of samples. # Or define your custom formatter and pass it to the `load_tts_samples`. # Check `TTS.tts.datasets.load_tts_samples` for more details. 
-train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True, eval_split_max_size=config.eval_split_max_size, eval_split_size=config.eval_split_size) +train_samples, eval_samples = load_tts_samples( + dataset_config, + eval_split=True, + eval_split_max_size=config.eval_split_max_size, + eval_split_size=config.eval_split_size, +) # init speaker manager for multi-speaker training # it maps speaker-id to speaker-name in the model and data-loader From 3c7c14607b0678dc45871d2ec6e5442595983429 Mon Sep 17 00:00:00 2001 From: WeberJulian Date: Wed, 23 Mar 2022 17:23:36 +0100 Subject: [PATCH 22/38] Add formatting tests (#1437) * Add style checks to `make lint` * Bump target-version in black config --- Makefile | 2 ++ pyproject.toml | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index d04cd976..69f34c79 100644 --- a/Makefile +++ b/Makefile @@ -44,6 +44,8 @@ style: ## update code style. lint: ## run pylint linter. pylint ${target_dirs} + black ${target_dirs} --check + isort ${target_dirs} --check-only system-deps: ## install linux system deps sudo apt-get install -y libsndfile1-dev diff --git a/pyproject.toml b/pyproject.toml index 0941a906..b775f12a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,7 +6,7 @@ max-line-length=120 [tool.black] line-length = 120 -target-version = ['py38'] +target-version = ['py39'] exclude = ''' ( From 3af01cfe3b5b59281790f158494f3c11f9e7255c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Wed, 23 Mar 2022 17:24:20 +0100 Subject: [PATCH 23/38] =?UTF-8?q?Update=20base=20model=20wrt=20?= =?UTF-8?q?=F0=9F=91=9F=20(#1406)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- TTS/model.py | 142 +++++---------------------------------------------- 1 file changed, 14 insertions(+), 128 deletions(-) diff --git a/TTS/model.py b/TTS/model.py index 39cbeabc..a53b916a 100644 --- a/TTS/model.py +++ b/TTS/model.py @@ -1,46 +1,34 @@ -from abc import ABC, abstractmethod -from typing import Dict, List, Tuple +from abc import abstractmethod +from typing import Dict import torch from coqpit import Coqpit -from torch import nn +from trainer import TrainerModel # pylint: skip-file -class BaseTrainerModel(ABC, nn.Module): - """Abstract 🐸TTS class. Every new 🐸TTS model must inherit this.""" +class BaseTrainerModel(TrainerModel): + """BaseTrainerModel model expanding TrainerModel with required functions by 🐸TTS. + + Every new 🐸TTS model must inherit it. + """ @staticmethod @abstractmethod def init_from_config(config: Coqpit): - """Init the model from given config. + """Init the model and all its attributes from the given config. Override this depending on your model. """ ... - @abstractmethod - def forward(self, input: torch.Tensor, *args, aux_input={}, **kwargs) -> Dict: - """Forward ... for the model mainly used in training. - - You can be flexible here and use different number of arguments and argument names since it is intended to be - used by `train_step()` without exposing it out of the model. - - Args: - input (torch.Tensor): Input tensor. - aux_input (Dict): Auxiliary model inputs like embeddings, durations or any other sorts of inputs. - - Returns: - Dict: Model outputs. Main model output must be named as "model_outputs". - """ - outputs_dict = {"model_outputs": None} - ... - return outputs_dict - @abstractmethod def inference(self, input: torch.Tensor, aux_input={}) -> Dict: - """Forward ... for inference. + """Forward pass for inference. 
+ + It must return a dictionary with the main model output and all the auxiliary outputs. The key ```model_outputs``` + is considered to be the main output and you can add any other auxiliary outputs as you want. We don't use `*kwargs` since it is problematic with the TorchScript API. @@ -55,78 +43,9 @@ class BaseTrainerModel(ABC, nn.Module): ... return outputs_dict - def format_batch(self, batch: Dict) -> Dict: - """Format batch returned by the data loader before sending it to the model. - - If not implemented, model uses the batch as is. - Can be used for data augmentation, feature ectraction, etc. - """ - return batch - - def format_batch_on_device(self, batch: Dict) -> Dict: - """Format batch on device before sending it to the model. - - If not implemented, model uses the batch as is. - Can be used for data augmentation, feature ectraction, etc. - """ - return batch - - @abstractmethod - def train_step(self, batch: Dict, criterion: nn.Module) -> Tuple[Dict, Dict]: - """Perform a single training step. Run the model forward ... and compute losses. - - Args: - batch (Dict): Input tensors. - criterion (nn.Module): Loss layer designed for the model. - - Returns: - Tuple[Dict, Dict]: Model ouputs and computed losses. - """ - outputs_dict = {} - loss_dict = {} # this returns from the criterion - ... - return outputs_dict, loss_dict - - def train_log(self, batch: Dict, outputs: Dict, logger: "Logger", assets: Dict, steps: int) -> None: - """Create visualizations and waveform examples for training. - - For example, here you can plot spectrograms and generate sample sample waveforms from these spectrograms to - be projected onto Tensorboard. - - Args: - ap (AudioProcessor): audio processor used at training. - batch (Dict): Model inputs used at the previous training step. - outputs (Dict): Model outputs generated at the previoud training step. - - Returns: - Tuple[Dict, np.ndarray]: training plots and output waveform. - """ - ... - - @abstractmethod - def eval_step(self, batch: Dict, criterion: nn.Module) -> Tuple[Dict, Dict]: - """Perform a single evaluation step. Run the model forward ... and compute losses. In most cases, you can - call `train_step()` with no changes. - - Args: - batch (Dict): Input tensors. - criterion (nn.Module): Loss layer designed for the model. - - Returns: - Tuple[Dict, Dict]: Model ouputs and computed losses. - """ - outputs_dict = {} - loss_dict = {} # this returns from the criterion - ... - return outputs_dict, loss_dict - - def eval_log(self, batch: Dict, outputs: Dict, logger: "Logger", assets: Dict, steps: int) -> None: - """The same as `train_log()`""" - ... - @abstractmethod def load_checkpoint(self, config: Coqpit, checkpoint_path: str, eval: bool = False, strict: bool = True) -> None: - """Load a checkpoint and get ready for training or inference. + """Load a model checkpoint gile and get ready for training or inference. Args: config (Coqpit): Model configuration. @@ -135,36 +54,3 @@ class BaseTrainerModel(ABC, nn.Module): strcit (bool, optional): Match all checkpoint keys to model's keys. Defaults to True. """ ... - - @staticmethod - @abstractmethod - def init_from_config(config: Coqpit, samples: List[Dict] = None, verbose=False) -> "BaseTrainerModel": - """Init the model from given config. - - Override this depending on your model. - """ - ... - - @abstractmethod - def get_data_loader( - self, config: Coqpit, assets: Dict, is_eval: True, data_items: List, verbose: bool, num_gpus: int - ): - ... 
- - # def get_optimizer(self) -> Union["Optimizer", List["Optimizer"]]: - # """Setup an return optimizer or optimizers.""" - # ... - - # def get_lr(self) -> Union[float, List[float]]: - # """Return learning rate(s). - - # Returns: - # Union[float, List[float]]: Model's initial learning rates. - # """ - # ... - - # def get_scheduler(self, optimizer: torch.optim.Optimizer): - # ... - - # def get_criterion(self): - # ... From ea53d6feb3169962bccbbc01b867f8a3bf645e9b Mon Sep 17 00:00:00 2001 From: Edresson Casanova Date: Tue, 22 Mar 2022 13:53:40 -0300 Subject: [PATCH 24/38] Replace webrtcvad by silero-vad --- TTS/bin/remove_silence_using_vad.py | 75 +++++------- TTS/utils/vad.py | 181 +++++++++------------------- requirements.txt | 2 - 3 files changed, 86 insertions(+), 172 deletions(-) diff --git a/TTS/bin/remove_silence_using_vad.py b/TTS/bin/remove_silence_using_vad.py index 9070f2da..a8a60bf8 100755 --- a/TTS/bin/remove_silence_using_vad.py +++ b/TTS/bin/remove_silence_using_vad.py @@ -1,51 +1,24 @@ import argparse import glob -import multiprocessing import os import pathlib -from tqdm.contrib.concurrent import process_map - -from TTS.utils.vad import get_vad_speech_segments, read_wave, write_wave +from tqdm import tqdm +from TTS.utils.vad import get_vad_model_and_utils, remove_silence -def remove_silence(filepath): - output_path = filepath.replace(os.path.join(args.input_dir, ""), os.path.join(args.output_dir, "")) +def adjust_path_and_remove_silence(audio_path): + output_path = audio_path.replace(os.path.join(args.input_dir, ""), os.path.join(args.output_dir, "")) # ignore if the file exists if os.path.exists(output_path) and not args.force: - return + return output_path # create all directory structure pathlib.Path(output_path).parent.mkdir(parents=True, exist_ok=True) - # load wave - audio, sample_rate = read_wave(filepath) + # remove the silence and save the audio + output_path = remove_silence(model_and_utils, audio_path, output_path, trim_just_beginning_and_end=args.trim_just_beginning_and_end, use_cuda=args.use_cuda) - # get speech segments - segments = get_vad_speech_segments(audio, sample_rate, aggressiveness=args.aggressiveness) - - segments = list(segments) - num_segments = len(segments) - flag = False - # create the output wave - if num_segments != 0: - for i, segment in reversed(list(enumerate(segments))): - if i >= 1: - if not flag: - concat_segment = segment - flag = True - else: - concat_segment = segment + concat_segment - else: - if flag: - segment = segment + concat_segment - # print("Saving: ", output_path) - write_wave(output_path, segment, sample_rate) - return - else: - print("> Just Copying the file to:", output_path) - # if fail to remove silence just write the file - write_wave(output_path, audio, sample_rate) - return + return output_path def preprocess_audios(): @@ -54,17 +27,24 @@ def preprocess_audios(): if not args.force: print("> Ignoring files that already exist in the output directory.") + if args.trim_just_beginning_and_end: + print("> Trimming just the beginning and the end with nonspeech parts.") + else: + print("> Trimming all nonspeech parts.") + if files: # create threads - num_threads = multiprocessing.cpu_count() - process_map(remove_silence, files, max_workers=num_threads, chunksize=15) + # num_threads = multiprocessing.cpu_count() + # process_map(adjust_path_and_remove_silence, files, max_workers=num_threads, chunksize=15) + for f in tqdm(files): + adjust_path_and_remove_silence(f) else: print("> No files Found !") if __name__ == "__main__": 
parser = argparse.ArgumentParser( - description="python remove_silence.py -i=VCTK-Corpus-bk/ -o=../VCTK-Corpus-removed-silence -g=wav48/*/*.wav -a=2" + description="python TTS/bin/remove_silence_using_vad.py -i=VCTK-Corpus/ -o=VCTK-Corpus-removed-silence/ -g=wav48_silence_trimmed/*/*_mic1.flac --trim_just_beginning_and_end True" ) parser.add_argument("-i", "--input_dir", type=str, default="../VCTK-Corpus", help="Dataset root dir") parser.add_argument( @@ -79,11 +59,20 @@ if __name__ == "__main__": help="path in glob format for acess wavs from input_dir. ex: wav48/*/*.wav", ) parser.add_argument( - "-a", - "--aggressiveness", - type=int, - default=2, - help="set its aggressiveness mode, which is an integer between 0 and 3. 0 is the least aggressive about filtering out non-speech, 3 is the most aggressive.", + "-t", + "--trim_just_beginning_and_end", + type=bool, + default=True, + help="If True this script will trim just the beginning and end nonspeech parts. If False all nonspeech parts will be trim. Default True", + ) + parser.add_argument( + "-c", + "--use_cuda", + type=bool, + default=False, + help="If True use cuda", ) args = parser.parse_args() + # load the model and utils + model_and_utils = get_vad_model_and_utils(use_cuda=args.use_cuda) preprocess_audios() diff --git a/TTS/utils/vad.py b/TTS/utils/vad.py index 923544d0..88790202 100644 --- a/TTS/utils/vad.py +++ b/TTS/utils/vad.py @@ -1,144 +1,71 @@ -# This code is adpated from: https://github.com/wiseman/py-webrtcvad/blob/master/example.py -import collections -import contextlib -import wave +import torch +import torchaudio -import webrtcvad +def read_audio(path): + wav, sr = torchaudio.load(path) + if wav.size(0) > 1: + wav = wav.mean(dim=0, keepdim=True) -def read_wave(path): - """Reads a .wav file. + return wav.squeeze(0), sr - Takes the path, and returns (PCM audio data, sample rate). - """ - with contextlib.closing(wave.open(path, "rb")) as wf: - num_channels = wf.getnchannels() - assert num_channels == 1 - sample_width = wf.getsampwidth() - assert sample_width == 2 - sample_rate = wf.getframerate() - assert sample_rate in (8000, 16000, 32000, 48000) - pcm_data = wf.readframes(wf.getnframes()) - return pcm_data, sample_rate +def resample_wav(wav, sr, new_sr): + wav = wav.unsqueeze(0) + transform = torchaudio.transforms.Resample(orig_freq=sr, new_freq=new_sr) + wav = transform(wav) + return wav.squeeze(0) +def map_timestamps_to_new_sr(vad_sr, new_sr, timestamps, just_begging_end=False): + factor = new_sr / vad_sr + new_timestamps = [] + if just_begging_end: + # get just the start and end timestamps + new_dict = {'start': int(timestamps[0]['start']*factor), 'end': int(timestamps[-1]['end']*factor)} + new_timestamps.append(new_dict) + else: + for ts in timestamps: + # map to the new SR + new_dict = {'start': int(ts['start']*factor), 'end': int(ts['end']*factor)} + new_timestamps.append(new_dict) -def write_wave(path, audio, sample_rate): - """Writes a .wav file. + return new_timestamps - Takes path, PCM audio data, and sample rate. 
- """ - with contextlib.closing(wave.open(path, "wb")) as wf: - wf.setnchannels(1) - wf.setsampwidth(2) - wf.setframerate(sample_rate) - wf.writeframes(audio) +def get_vad_model_and_utils(use_cuda=False): + model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad', + model='silero_vad', + force_reload=True, + onnx=False) + if use_cuda: + model = model.cuda() + get_speech_timestamps, save_audio, _, _, collect_chunks = utils + return model, get_speech_timestamps, save_audio, collect_chunks -class Frame(object): - """Represents a "frame" of audio data.""" +def remove_silence(model_and_utils, audio_path, out_path, vad_sample_rate=8000, trim_just_beginning_and_end=True, use_cuda=False): - def __init__(self, _bytes, timestamp, duration): - self.bytes = _bytes - self.timestamp = timestamp - self.duration = duration + # get the VAD model and utils functions + model, get_speech_timestamps, save_audio, collect_chunks = model_and_utils + # read ground truth wav and resample the audio for the VAD + wav, gt_sample_rate = read_audio(audio_path) -def frame_generator(frame_duration_ms, audio, sample_rate): - """Generates audio frames from PCM audio data. + # if needed, resample the audio for the VAD model + if gt_sample_rate != vad_sample_rate: + wav_vad = resample_wav(wav, gt_sample_rate, vad_sample_rate) + else: + wav_vad = wav - Takes the desired frame duration in milliseconds, the PCM data, and - the sample rate. + if use_cuda: + wav_vad = wav_vad.cuda() - Yields Frames of the requested duration. - """ - n = int(sample_rate * (frame_duration_ms / 1000.0) * 2) - offset = 0 - timestamp = 0.0 - duration = (float(n) / sample_rate) / 2.0 - while offset + n < len(audio): - yield Frame(audio[offset : offset + n], timestamp, duration) - timestamp += duration - offset += n + # get speech timestamps from full audio file + speech_timestamps = get_speech_timestamps(wav_vad, model, sampling_rate=vad_sample_rate, window_size_samples=768) + # map the current speech_timestamps to the sample rate of the ground truth audio + new_speech_timestamps = map_timestamps_to_new_sr(vad_sample_rate, gt_sample_rate, speech_timestamps, trim_just_beginning_and_end) -def vad_collector(sample_rate, frame_duration_ms, padding_duration_ms, vad, frames): - """Filters out non-voiced audio frames. + # save audio + save_audio(out_path, + collect_chunks(new_speech_timestamps, wav), sampling_rate=gt_sample_rate) - Given a webrtcvad.Vad and a source of audio frames, yields only - the voiced audio. - - Uses a padded, sliding window algorithm over the audio frames. - When more than 90% of the frames in the window are voiced (as - reported by the VAD), the collector triggers and begins yielding - audio frames. Then the collector waits until 90% of the frames in - the window are unvoiced to detrigger. - - The window is padded at the front and back to provide a small - amount of silence or the beginnings/endings of speech around the - voiced frames. - - Arguments: - - sample_rate - The audio sample rate, in Hz. - frame_duration_ms - The frame duration in milliseconds. - padding_duration_ms - The amount to pad the window, in milliseconds. - vad - An instance of webrtcvad.Vad. - frames - a source of audio frames (sequence or generator). - - Returns: A generator that yields PCM audio data. - """ - num_padding_frames = int(padding_duration_ms / frame_duration_ms) - # We use a deque for our sliding window/ring buffer. - ring_buffer = collections.deque(maxlen=num_padding_frames) - # We have two states: TRIGGERED and NOTTRIGGERED. 
We start in the - # NOTTRIGGERED state. - triggered = False - - voiced_frames = [] - for frame in frames: - is_speech = vad.is_speech(frame.bytes, sample_rate) - - # sys.stdout.write('1' if is_speech else '0') - if not triggered: - ring_buffer.append((frame, is_speech)) - num_voiced = len([f for f, speech in ring_buffer if speech]) - # If we're NOTTRIGGERED and more than 90% of the frames in - # the ring buffer are voiced frames, then enter the - # TRIGGERED state. - if num_voiced > 0.9 * ring_buffer.maxlen: - triggered = True - # sys.stdout.write('+(%s)' % (ring_buffer[0][0].timestamp,)) - # We want to yield all the audio we see from now until - # we are NOTTRIGGERED, but we have to start with the - # audio that's already in the ring buffer. - for f, _ in ring_buffer: - voiced_frames.append(f) - ring_buffer.clear() - else: - # We're in the TRIGGERED state, so collect the audio data - # and add it to the ring buffer. - voiced_frames.append(frame) - ring_buffer.append((frame, is_speech)) - num_unvoiced = len([f for f, speech in ring_buffer if not speech]) - # If more than 90% of the frames in the ring buffer are - # unvoiced, then enter NOTTRIGGERED and yield whatever - # audio we've collected. - if num_unvoiced > 0.9 * ring_buffer.maxlen: - # sys.stdout.write('-(%s)' % (frame.timestamp + frame.duration)) - triggered = False - yield b"".join([f.bytes for f in voiced_frames]) - ring_buffer.clear() - voiced_frames = [] - # If we have any leftover voiced audio when we run out of input, - # yield it. - if voiced_frames: - yield b"".join([f.bytes for f in voiced_frames]) - - -def get_vad_speech_segments(audio, sample_rate, aggressiveness=2, padding_duration_ms=300): - - vad = webrtcvad.Vad(int(aggressiveness)) - frames = list(frame_generator(30, audio, sample_rate)) - segments = vad_collector(sample_rate, 30, padding_duration_ms, vad, frames) - - return segments + return out_path diff --git a/requirements.txt b/requirements.txt index c3599220..f735c57a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -34,5 +34,3 @@ mecab-python3==1.0.3 unidic-lite==1.0.8 # gruut+supported langs gruut[cs,de,es,fr,it,nl,pt,ru,sv]==2.2.3 -# others -webrtcvad # for VAD From 0ae1e0248c74f3dc820798619c2b6f6537bfb339 Mon Sep 17 00:00:00 2001 From: Edresson Casanova Date: Tue, 22 Mar 2022 14:53:33 -0300 Subject: [PATCH 25/38] Fix the bug for emptly audio files --- TTS/utils/vad.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/TTS/utils/vad.py b/TTS/utils/vad.py index 88790202..7384934a 100644 --- a/TTS/utils/vad.py +++ b/TTS/utils/vad.py @@ -18,7 +18,7 @@ def resample_wav(wav, sr, new_sr): def map_timestamps_to_new_sr(vad_sr, new_sr, timestamps, just_begging_end=False): factor = new_sr / vad_sr new_timestamps = [] - if just_begging_end: + if just_begging_end and timestamps: # get just the start and end timestamps new_dict = {'start': int(timestamps[0]['start']*factor), 'end': int(timestamps[-1]['end']*factor)} new_timestamps.append(new_dict) @@ -64,8 +64,12 @@ def remove_silence(model_and_utils, audio_path, out_path, vad_sample_rate=8000, # map the current speech_timestamps to the sample rate of the ground truth audio new_speech_timestamps = map_timestamps_to_new_sr(vad_sample_rate, gt_sample_rate, speech_timestamps, trim_just_beginning_and_end) - # save audio - save_audio(out_path, - collect_chunks(new_speech_timestamps, wav), sampling_rate=gt_sample_rate) + # if have speech timestamps else save the wav + if new_speech_timestamps: + wav = collect_chunks(new_speech_timestamps, 
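
The patch above swaps the webrtcvad frame collector for silero-vad loaded through `torch.hub`. A minimal sketch of that flow, assuming a local `sample.wav` (placeholder path) and the same 8 kHz VAD rate and 768-sample window used in `TTS/utils/vad.py`:

```python
# Minimal sketch of the silero-vad flow introduced above; "sample.wav" is a placeholder.
import torch
import torchaudio

model, utils = torch.hub.load(repo_or_dir="snakers4/silero-vad", model="silero_vad", onnx=False)
get_speech_timestamps, save_audio, _, _, collect_chunks = utils

wav, sr = torchaudio.load("sample.wav")  # placeholder input
wav = wav.mean(dim=0) if wav.size(0) > 1 else wav.squeeze(0)  # downmix to mono
wav_vad = torchaudio.transforms.Resample(orig_freq=sr, new_freq=8000)(wav.unsqueeze(0)).squeeze(0)

# speech timestamps are in samples at the VAD sample rate (8 kHz here)
timestamps = get_speech_timestamps(wav_vad, model, sampling_rate=8000, window_size_samples=768)

if timestamps:
    # keep only the outermost start/end, mapped back to the original sample rate
    factor = sr / 8000
    trimmed = collect_chunks(
        [{"start": int(timestamps[0]["start"] * factor), "end": int(timestamps[-1]["end"] * factor)}], wav
    )
    save_audio("sample_trimmed.wav", trimmed, sampling_rate=sr)
else:
    print("no speech detected, keeping the file as-is")
```

Only the outermost start and end timestamps are kept here, mirroring the `trim_just_beginning_and_end=True` default of the new script.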
wav) + else: + print(f"> The file {audio_path} probably does not have speech please check it !!") + # save audio + save_audio(out_path, wav, sampling_rate=gt_sample_rate) return out_path From 3435bc8fcad433438751e14b75dd8f5e0c36ae41 Mon Sep 17 00:00:00 2001 From: Edresson Casanova Date: Wed, 23 Mar 2022 15:05:32 -0300 Subject: [PATCH 26/38] Fix style tests --- TTS/bin/remove_silence_using_vad.py | 9 ++++++++- TTS/utils/vad.py | 22 ++++++++++++++-------- 2 files changed, 22 insertions(+), 9 deletions(-) diff --git a/TTS/bin/remove_silence_using_vad.py b/TTS/bin/remove_silence_using_vad.py index a8a60bf8..7d88ae91 100755 --- a/TTS/bin/remove_silence_using_vad.py +++ b/TTS/bin/remove_silence_using_vad.py @@ -4,6 +4,7 @@ import os import pathlib from tqdm import tqdm + from TTS.utils.vad import get_vad_model_and_utils, remove_silence @@ -16,7 +17,13 @@ def adjust_path_and_remove_silence(audio_path): # create all directory structure pathlib.Path(output_path).parent.mkdir(parents=True, exist_ok=True) # remove the silence and save the audio - output_path = remove_silence(model_and_utils, audio_path, output_path, trim_just_beginning_and_end=args.trim_just_beginning_and_end, use_cuda=args.use_cuda) + output_path = remove_silence( + model_and_utils, + audio_path, + output_path, + trim_just_beginning_and_end=args.trim_just_beginning_and_end, + use_cuda=args.use_cuda, + ) return output_path diff --git a/TTS/utils/vad.py b/TTS/utils/vad.py index 7384934a..033b911a 100644 --- a/TTS/utils/vad.py +++ b/TTS/utils/vad.py @@ -1,6 +1,7 @@ import torch import torchaudio + def read_audio(path): wav, sr = torchaudio.load(path) @@ -9,39 +10,42 @@ def read_audio(path): return wav.squeeze(0), sr + def resample_wav(wav, sr, new_sr): wav = wav.unsqueeze(0) transform = torchaudio.transforms.Resample(orig_freq=sr, new_freq=new_sr) wav = transform(wav) return wav.squeeze(0) + def map_timestamps_to_new_sr(vad_sr, new_sr, timestamps, just_begging_end=False): factor = new_sr / vad_sr new_timestamps = [] if just_begging_end and timestamps: # get just the start and end timestamps - new_dict = {'start': int(timestamps[0]['start']*factor), 'end': int(timestamps[-1]['end']*factor)} + new_dict = {"start": int(timestamps[0]["start"] * factor), "end": int(timestamps[-1]["end"] * factor)} new_timestamps.append(new_dict) else: for ts in timestamps: # map to the new SR - new_dict = {'start': int(ts['start']*factor), 'end': int(ts['end']*factor)} + new_dict = {"start": int(ts["start"] * factor), "end": int(ts["end"] * factor)} new_timestamps.append(new_dict) return new_timestamps + def get_vad_model_and_utils(use_cuda=False): - model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad', - model='silero_vad', - force_reload=True, - onnx=False) + model, utils = torch.hub.load(repo_or_dir="snakers4/silero-vad", model="silero_vad", force_reload=True, onnx=False) if use_cuda: model = model.cuda() get_speech_timestamps, save_audio, _, _, collect_chunks = utils return model, get_speech_timestamps, save_audio, collect_chunks -def remove_silence(model_and_utils, audio_path, out_path, vad_sample_rate=8000, trim_just_beginning_and_end=True, use_cuda=False): + +def remove_silence( + model_and_utils, audio_path, out_path, vad_sample_rate=8000, trim_just_beginning_and_end=True, use_cuda=False +): # get the VAD model and utils functions model, get_speech_timestamps, save_audio, collect_chunks = model_and_utils @@ -62,7 +66,9 @@ def remove_silence(model_and_utils, audio_path, out_path, vad_sample_rate=8000, speech_timestamps = 
get_speech_timestamps(wav_vad, model, sampling_rate=vad_sample_rate, window_size_samples=768) # map the current speech_timestamps to the sample rate of the ground truth audio - new_speech_timestamps = map_timestamps_to_new_sr(vad_sample_rate, gt_sample_rate, speech_timestamps, trim_just_beginning_and_end) + new_speech_timestamps = map_timestamps_to_new_sr( + vad_sample_rate, gt_sample_rate, speech_timestamps, trim_just_beginning_and_end + ) # if have speech timestamps else save the wav if new_speech_timestamps: From 37896e17430a5627b4b3224603b9101f3259a446 Mon Sep 17 00:00:00 2001 From: Edresson Casanova Date: Thu, 24 Mar 2022 14:16:04 -0300 Subject: [PATCH 27/38] Bug fix in freeze encoder (#1391) * Fix the bug in freeze encoder * Remove emb_l definition for non-multilingual training * Fix unit tests --- TTS/tts/models/vits.py | 1 - tests/tts_tests/test_vits.py | 8 ++++---- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py index afadbadd..87d559fc 100644 --- a/TTS/tts/models/vits.py +++ b/TTS/tts/models/vits.py @@ -706,7 +706,6 @@ class Vits(BaseTTS): torch.nn.init.xavier_uniform_(self.emb_l.weight) else: self.embedded_language_dim = 0 - self.emb_l = None def get_aux_input(self, aux_input: Dict): sid, g, lid = self._set_cond_input(aux_input) diff --git a/tests/tts_tests/test_vits.py b/tests/tts_tests/test_vits.py index 81d2ebbd..05adb9ed 100644 --- a/tests/tts_tests/test_vits.py +++ b/tests/tts_tests/test_vits.py @@ -79,25 +79,25 @@ class TestVits(unittest.TestCase): model = Vits(args) self.assertEqual(model.language_manager, None) self.assertEqual(model.embedded_language_dim, 0) - self.assertEqual(model.emb_l, None) + assertHasNotAttr(self, model, "emb_l") args = VitsArgs(language_ids_file=LANG_FILE) model = Vits(args) self.assertNotEqual(model.language_manager, None) self.assertEqual(model.embedded_language_dim, 0) - self.assertEqual(model.emb_l, None) + assertHasNotAttr(self, model, "emb_l") args = VitsArgs(language_ids_file=LANG_FILE, use_language_embedding=True) model = Vits(args) self.assertNotEqual(model.language_manager, None) self.assertEqual(model.embedded_language_dim, args.embedded_language_dim) - self.assertNotEqual(model.emb_l, None) + assertHasAttr(self, model, "emb_l") args = VitsArgs(language_ids_file=LANG_FILE, use_language_embedding=True, embedded_language_dim=102) model = Vits(args) self.assertNotEqual(model.language_manager, None) self.assertEqual(model.embedded_language_dim, args.embedded_language_dim) - self.assertNotEqual(model.emb_l, None) + assertHasAttr(self, model, "emb_l") def test_get_aux_input(self): aux_input = {"speaker_ids": None, "style_wav": None, "d_vectors": None, "language_ids": None} From c66a6241fd761ea07379849474d576f75b9c4e84 Mon Sep 17 00:00:00 2001 From: WeberJulian Date: Fri, 25 Mar 2022 23:15:33 +0100 Subject: [PATCH 28/38] Enforce phonemizer definition for synthesis (#1441) * Enforce phonemizer definition for synthesis * Fix train_tts, tokenizer init can now edit config * Add small change to trigger CI pipeline * fix wrong output path for one tts_test * Fix style * Test config overides by args and tokenizer * Fix style --- TTS/bin/train_tts.py | 2 +- TTS/tts/utils/text/tokenizer.py | 1 + TTS/utils/synthesizer.py | 3 + requirements.txt | 2 +- tests/tts_tests/test_align_tts_train.py | 11 +++- .../test_fast_pitch_speaker_emb_train.py | 9 +++ tests/tts_tests/test_fast_pitch_train.py | 9 +++ .../test_glow_tts_d-vectors_train.py | 9 +++ .../test_glow_tts_speaker_emb_train.py | 9 +++ 
tests/tts_tests/test_glow_tts_train.py | 9 +++ tests/tts_tests/test_speedy_speech_train.py | 9 +++ .../test_tacotron2_d-vectors_train.py | 9 +++ .../test_tacotron2_speaker_emb_train.py | 9 +++ tests/tts_tests/test_tacotron2_train.py | 9 +++ .../test_tacotron2_train_fsspec_path.py | 55 ------------------- ...est_vits_multilingual_speaker_emb_train.py | 9 +++ .../test_vits_multilingual_train-d_vectors.py | 9 +++ .../tts_tests/test_vits_speaker_emb_train.py | 9 +++ tests/tts_tests/test_vits_train.py | 9 +++ 19 files changed, 133 insertions(+), 58 deletions(-) delete mode 100644 tests/tts_tests/test_tacotron2_train_fsspec_path.py diff --git a/TTS/bin/train_tts.py b/TTS/bin/train_tts.py index 976b74af..bdb4f6f6 100644 --- a/TTS/bin/train_tts.py +++ b/TTS/bin/train_tts.py @@ -57,7 +57,7 @@ def main(): # init the trainer and 🚀 trainer = Trainer( train_args, - config, + model.config, config.output_path, model=model, train_samples=train_samples, diff --git a/TTS/tts/utils/text/tokenizer.py b/TTS/tts/utils/text/tokenizer.py index f0d85a44..1569c634 100644 --- a/TTS/tts/utils/text/tokenizer.py +++ b/TTS/tts/utils/text/tokenizer.py @@ -191,6 +191,7 @@ class TTSTokenizer: phonemizer = get_phonemizer_by_name( DEF_LANG_TO_PHONEMIZER[config.phoneme_language], **phonemizer_kwargs ) + new_config.phonemizer = phonemizer.name() except KeyError as e: raise ValueError( f"""No phonemizer found for language {config.phoneme_language}. diff --git a/TTS/utils/synthesizer.py b/TTS/utils/synthesizer.py index 2ea23adb..3dd8be44 100644 --- a/TTS/utils/synthesizer.py +++ b/TTS/utils/synthesizer.py @@ -112,6 +112,9 @@ class Synthesizer(object): self.use_phonemes = self.tts_config.use_phonemes self.tts_model = setup_tts_model(config=self.tts_config) + if self.use_phonemes and self.tts_config["phonemizer"] is None: + raise ValueError("Phonemizer is not defined in the TTS config.") + if not self.encoder_checkpoint: self._set_speaker_encoder_paths_from_tts_config() diff --git a/requirements.txt b/requirements.txt index f735c57a..db47c2cc 100644 --- a/requirements.txt +++ b/requirements.txt @@ -25,7 +25,7 @@ tensorboardX pyworld # coqui stack coqui-trainer -coqpit # config managemenr +coqpit # config management # chinese g2p deps jieba pypinyin diff --git a/tests/tts_tests/test_align_tts_train.py b/tests/tts_tests/test_align_tts_train.py index 85dfbbcb..75c5643c 100644 --- a/tests/tts_tests/test_align_tts_train.py +++ b/tests/tts_tests/test_align_tts_train.py @@ -1,4 +1,5 @@ import glob +import json import os import shutil @@ -42,7 +43,7 @@ command_train = ( "--coqpit.datasets.0.meta_file_train metadata.csv " "--coqpit.datasets.0.meta_file_val metadata.csv " "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.test_delay_epochs -1" + "--coqpit.test_delay_epochs 0 " ) run_cli(command_train) @@ -54,6 +55,14 @@ continue_config_path = os.path.join(continue_path, "config.json") continue_restore_path, _ = get_last_checkpoint(continue_path) out_wav_path = os.path.join(get_tests_output_path(), "output.wav") +# Check integrity of the config +with open(continue_config_path, "r", encoding="utf-8") as f: + config_loaded = json.load(f) +assert config_loaded["characters"] is not None +assert config_loaded["output_path"] in continue_path +assert config_loaded["test_delay_epochs"] == 0 + +# Load the model and run inference inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' 
--config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" run_cli(inference_command) diff --git a/tests/tts_tests/test_fast_pitch_speaker_emb_train.py b/tests/tts_tests/test_fast_pitch_speaker_emb_train.py index 37faf449..9553d745 100644 --- a/tests/tts_tests/test_fast_pitch_speaker_emb_train.py +++ b/tests/tts_tests/test_fast_pitch_speaker_emb_train.py @@ -1,4 +1,5 @@ import glob +import json import os import shutil @@ -74,6 +75,14 @@ out_wav_path = os.path.join(get_tests_output_path(), "output.wav") speaker_id = "ljspeech-1" continue_speakers_path = os.path.join(continue_path, "speakers.json") +# Check integrity of the config +with open(continue_config_path, "r", encoding="utf-8") as f: + config_loaded = json.load(f) +assert config_loaded["characters"] is not None +assert config_loaded["output_path"] in continue_path +assert config_loaded["test_delay_epochs"] == 0 + +# Load the model and run inference inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --speaker_idx {speaker_id} --speakers_file_path {continue_speakers_path} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" run_cli(inference_command) diff --git a/tests/tts_tests/test_fast_pitch_train.py b/tests/tts_tests/test_fast_pitch_train.py index d2d78af4..134cd4ba 100644 --- a/tests/tts_tests/test_fast_pitch_train.py +++ b/tests/tts_tests/test_fast_pitch_train.py @@ -1,4 +1,5 @@ import glob +import json import os import shutil @@ -73,6 +74,14 @@ continue_config_path = os.path.join(continue_path, "config.json") continue_restore_path, _ = get_last_checkpoint(continue_path) out_wav_path = os.path.join(get_tests_output_path(), "output.wav") +# Check integrity of the config +with open(continue_config_path, "r", encoding="utf-8") as f: + config_loaded = json.load(f) +assert config_loaded["characters"] is not None +assert config_loaded["output_path"] in continue_path +assert config_loaded["test_delay_epochs"] == 0 + +# Load the model and run inference inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" run_cli(inference_command) diff --git a/tests/tts_tests/test_glow_tts_d-vectors_train.py b/tests/tts_tests/test_glow_tts_d-vectors_train.py index 14f9e4d2..3a9c8fcc 100644 --- a/tests/tts_tests/test_glow_tts_d-vectors_train.py +++ b/tests/tts_tests/test_glow_tts_d-vectors_train.py @@ -1,4 +1,5 @@ import glob +import json import os import shutil @@ -61,6 +62,14 @@ out_wav_path = os.path.join(get_tests_output_path(), "output.wav") speaker_id = "ljspeech-1" continue_speakers_path = config.d_vector_file +# Check integrity of the config +with open(continue_config_path, "r", encoding="utf-8") as f: + config_loaded = json.load(f) +assert config_loaded["characters"] is not None +assert config_loaded["output_path"] in continue_path +assert config_loaded["test_delay_epochs"] == 0 + +# Load the model and run inference inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' 
--speaker_idx {speaker_id} --speakers_file_path {continue_speakers_path} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" run_cli(inference_command) diff --git a/tests/tts_tests/test_glow_tts_speaker_emb_train.py b/tests/tts_tests/test_glow_tts_speaker_emb_train.py index c327332e..322b506e 100644 --- a/tests/tts_tests/test_glow_tts_speaker_emb_train.py +++ b/tests/tts_tests/test_glow_tts_speaker_emb_train.py @@ -1,4 +1,5 @@ import glob +import json import os import shutil @@ -58,6 +59,14 @@ out_wav_path = os.path.join(get_tests_output_path(), "output.wav") speaker_id = "ljspeech-1" continue_speakers_path = os.path.join(continue_path, "speakers.json") +# Check integrity of the config +with open(continue_config_path, "r", encoding="utf-8") as f: + config_loaded = json.load(f) +assert config_loaded["characters"] is not None +assert config_loaded["output_path"] in continue_path +assert config_loaded["test_delay_epochs"] == 0 + +# Load the model and run inference inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --speaker_idx {speaker_id} --speakers_file_path {continue_speakers_path} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" run_cli(inference_command) diff --git a/tests/tts_tests/test_glow_tts_train.py b/tests/tts_tests/test_glow_tts_train.py index b0acf004..cf9a04f4 100644 --- a/tests/tts_tests/test_glow_tts_train.py +++ b/tests/tts_tests/test_glow_tts_train.py @@ -1,4 +1,5 @@ import glob +import json import os import shutil @@ -55,6 +56,14 @@ continue_config_path = os.path.join(continue_path, "config.json") continue_restore_path, _ = get_last_checkpoint(continue_path) out_wav_path = os.path.join(get_tests_output_path(), "output.wav") +# Check integrity of the config +with open(continue_config_path, "r", encoding="utf-8") as f: + config_loaded = json.load(f) +assert config_loaded["characters"] is not None +assert config_loaded["output_path"] in continue_path +assert config_loaded["test_delay_epochs"] == 0 + +# Load the model and run inference inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" run_cli(inference_command) diff --git a/tests/tts_tests/test_speedy_speech_train.py b/tests/tts_tests/test_speedy_speech_train.py index 9a26d253..c4adcee3 100644 --- a/tests/tts_tests/test_speedy_speech_train.py +++ b/tests/tts_tests/test_speedy_speech_train.py @@ -1,4 +1,5 @@ import glob +import json import os import shutil @@ -54,6 +55,14 @@ continue_config_path = os.path.join(continue_path, "config.json") continue_restore_path, _ = get_last_checkpoint(continue_path) out_wav_path = os.path.join(get_tests_output_path(), "output.wav") +# Check integrity of the config +with open(continue_config_path, "r", encoding="utf-8") as f: + config_loaded = json.load(f) +assert config_loaded["characters"] is not None +assert config_loaded["output_path"] in continue_path +assert config_loaded["test_delay_epochs"] == 0 + +# Load the model and run inference inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example for it.' 
--config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" run_cli(inference_command) diff --git a/tests/tts_tests/test_tacotron2_d-vectors_train.py b/tests/tts_tests/test_tacotron2_d-vectors_train.py index 6b003f2c..0d02fa98 100644 --- a/tests/tts_tests/test_tacotron2_d-vectors_train.py +++ b/tests/tts_tests/test_tacotron2_d-vectors_train.py @@ -1,4 +1,5 @@ import glob +import json import os import shutil @@ -61,6 +62,14 @@ out_wav_path = os.path.join(get_tests_output_path(), "output.wav") speaker_id = "ljspeech-1" continue_speakers_path = config.d_vector_file +# Check integrity of the config +with open(continue_config_path, "r", encoding="utf-8") as f: + config_loaded = json.load(f) +assert config_loaded["characters"] is not None +assert config_loaded["output_path"] in continue_path +assert config_loaded["test_delay_epochs"] == 0 + +# Load the model and run inference inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --speaker_idx {speaker_id} --speakers_file_path {continue_speakers_path} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" run_cli(inference_command) diff --git a/tests/tts_tests/test_tacotron2_speaker_emb_train.py b/tests/tts_tests/test_tacotron2_speaker_emb_train.py index b9f4de0b..2e812d90 100644 --- a/tests/tts_tests/test_tacotron2_speaker_emb_train.py +++ b/tests/tts_tests/test_tacotron2_speaker_emb_train.py @@ -1,4 +1,5 @@ import glob +import json import os import shutil @@ -59,6 +60,14 @@ out_wav_path = os.path.join(get_tests_output_path(), "output.wav") speaker_id = "ljspeech-1" continue_speakers_path = os.path.join(continue_path, "speakers.json") +# Check integrity of the config +with open(continue_config_path, "r", encoding="utf-8") as f: + config_loaded = json.load(f) +assert config_loaded["characters"] is not None +assert config_loaded["output_path"] in continue_path +assert config_loaded["test_delay_epochs"] == 0 + +# Load the model and run inference inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --speaker_idx {speaker_id} --speakers_file_path {continue_speakers_path} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" run_cli(inference_command) diff --git a/tests/tts_tests/test_tacotron2_train.py b/tests/tts_tests/test_tacotron2_train.py index 8c30d9f9..d1941022 100644 --- a/tests/tts_tests/test_tacotron2_train.py +++ b/tests/tts_tests/test_tacotron2_train.py @@ -1,4 +1,5 @@ import glob +import json import os import shutil @@ -54,6 +55,14 @@ continue_config_path = os.path.join(continue_path, "config.json") continue_restore_path, _ = get_last_checkpoint(continue_path) out_wav_path = os.path.join(get_tests_output_path(), "output.wav") +# Check integrity of the config +with open(continue_config_path, "r", encoding="utf-8") as f: + config_loaded = json.load(f) +assert config_loaded["characters"] is not None +assert config_loaded["output_path"] in continue_path +assert config_loaded["test_delay_epochs"] == 0 + +# Load the model and run inference inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' 
--config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" run_cli(inference_command) diff --git a/tests/tts_tests/test_tacotron2_train_fsspec_path.py b/tests/tts_tests/test_tacotron2_train_fsspec_path.py deleted file mode 100644 index 5d14a983..00000000 --- a/tests/tts_tests/test_tacotron2_train_fsspec_path.py +++ /dev/null @@ -1,55 +0,0 @@ -import glob -import os -import shutil - -from tests import get_device_id, get_tests_output_path, run_cli -from TTS.tts.configs.tacotron2_config import Tacotron2Config - -config_path = os.path.join(get_tests_output_path(), "test_model_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") - -config = Tacotron2Config( - r=5, - batch_size=8, - eval_batch_size=8, - num_loader_workers=0, - num_eval_loader_workers=0, - text_cleaner="english_cleaners", - use_phonemes=False, - phoneme_language="en-us", - phoneme_cache_path=os.path.join(get_tests_output_path(), "train_outputs/phoneme_cache/"), - run_eval=True, - test_delay_epochs=-1, - epochs=1, - print_step=1, - test_sentences=[ - "Be a voice, not an echo.", - ], - print_eval=True, - max_decoder_steps=50, -) -config.audio.do_trim_silence = True -config.audio.trim_db = 60 -config.save_json(config_path) - -# train the model for one epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path file://{config_path} " - f"--coqpit.output_path file://{output_path} " - "--coqpit.datasets.0.name ljspeech " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.test_delay_epochs 0 " -) -run_cli(command_train) - -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) - -# restore the model and continue training for one more epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path file://{continue_path} " -) -run_cli(command_train) -shutil.rmtree(continue_path) diff --git a/tests/tts_tests/test_vits_multilingual_speaker_emb_train.py b/tests/tts_tests/test_vits_multilingual_speaker_emb_train.py index 0c7672d7..683bb0a7 100644 --- a/tests/tts_tests/test_vits_multilingual_speaker_emb_train.py +++ b/tests/tts_tests/test_vits_multilingual_speaker_emb_train.py @@ -1,4 +1,5 @@ import glob +import json import os import shutil @@ -92,6 +93,14 @@ languae_id = "en" continue_speakers_path = os.path.join(continue_path, "speakers.json") continue_languages_path = os.path.join(continue_path, "language_ids.json") +# Check integrity of the config +with open(continue_config_path, "r", encoding="utf-8") as f: + config_loaded = json.load(f) +assert config_loaded["characters"] is not None +assert config_loaded["output_path"] in continue_path +assert config_loaded["test_delay_epochs"] == 0 + +# Load the model and run inference inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' 
--speaker_idx {speaker_id} --speakers_file_path {continue_speakers_path} --language_ids_file_path {continue_languages_path} --language_idx {languae_id} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" run_cli(inference_command) diff --git a/tests/tts_tests/test_vits_multilingual_train-d_vectors.py b/tests/tts_tests/test_vits_multilingual_train-d_vectors.py index e12661a5..e4a82cdd 100644 --- a/tests/tts_tests/test_vits_multilingual_train-d_vectors.py +++ b/tests/tts_tests/test_vits_multilingual_train-d_vectors.py @@ -1,4 +1,5 @@ import glob +import json import os import shutil @@ -99,6 +100,14 @@ languae_id = "en" continue_speakers_path = config.d_vector_file continue_languages_path = os.path.join(continue_path, "language_ids.json") +# Check integrity of the config +with open(continue_config_path, "r", encoding="utf-8") as f: + config_loaded = json.load(f) +assert config_loaded["characters"] is not None +assert config_loaded["output_path"] in continue_path +assert config_loaded["test_delay_epochs"] == 0 + +# Load the model and run inference inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --speaker_idx {speaker_id} --speakers_file_path {continue_speakers_path} --language_ids_file_path {continue_languages_path} --language_idx {languae_id} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" run_cli(inference_command) diff --git a/tests/tts_tests/test_vits_speaker_emb_train.py b/tests/tts_tests/test_vits_speaker_emb_train.py index c928cee4..48597241 100644 --- a/tests/tts_tests/test_vits_speaker_emb_train.py +++ b/tests/tts_tests/test_vits_speaker_emb_train.py @@ -1,4 +1,5 @@ import glob +import json import os import shutil @@ -65,6 +66,14 @@ out_wav_path = os.path.join(get_tests_output_path(), "output.wav") speaker_id = "ljspeech-1" continue_speakers_path = os.path.join(continue_path, "speakers.json") +# Check integrity of the config +with open(continue_config_path, "r", encoding="utf-8") as f: + config_loaded = json.load(f) +assert config_loaded["characters"] is not None +assert config_loaded["output_path"] in continue_path +assert config_loaded["test_delay_epochs"] == 0 + +# Load the model and run inference inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --speaker_idx {speaker_id} --speakers_file_path {continue_speakers_path} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" run_cli(inference_command) diff --git a/tests/tts_tests/test_vits_train.py b/tests/tts_tests/test_vits_train.py index 003f99a8..64ff63f3 100644 --- a/tests/tts_tests/test_vits_train.py +++ b/tests/tts_tests/test_vits_train.py @@ -1,4 +1,5 @@ import glob +import json import os import shutil @@ -54,6 +55,14 @@ continue_config_path = os.path.join(continue_path, "config.json") continue_restore_path, _ = get_last_checkpoint(continue_path) out_wav_path = os.path.join(get_tests_output_path(), "output.wav") +# Check integrity of the config +with open(continue_config_path, "r", encoding="utf-8") as f: + config_loaded = json.load(f) +assert config_loaded["characters"] is not None +assert config_loaded["output_path"] in continue_path +assert config_loaded["test_delay_epochs"] == 0 + +# Load the model and run inference inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' 
--config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" run_cli(inference_command) From 1b22f03e986134bcbcd2aba72fe8e226e07f5b9f Mon Sep 17 00:00:00 2001 From: WeberJulian Date: Wed, 30 Mar 2022 12:47:11 +0200 Subject: [PATCH 29/38] Fix G2P backend of the released models (#1461) * Fix enforce phonemizer * Add new models * Fix .model.json --- TTS/.models.json | 76 ++++++++++++++++++++-------------------- TTS/utils/synthesizer.py | 7 ++-- 2 files changed, 41 insertions(+), 42 deletions(-) diff --git a/TTS/.models.json b/TTS/.models.json index 801b8468..24838a5d 100644 --- a/TTS/.models.json +++ b/TTS/.models.json @@ -4,7 +4,7 @@ "multi-dataset":{ "your_tts":{ "description": "Your TTS model accompanying the paper https://arxiv.org/abs/2112.02418", - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.0_models/tts_models--multilingual--multi-dataset--your_tts.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--multilingual--multi-dataset--your_tts.zip", "default_vocoder": null, "commit": "e9a1953e", "license": "CC BY-NC-ND 4.0", @@ -16,7 +16,7 @@ "ek1": { "tacotron2": { "description": "EK1 en-rp tacotron2 by NMStoker", - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.1.0/tts_models--en--ek1--tacotron2.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ek1--tacotron2.zip", "default_vocoder": "vocoder_models/en/ek1/wavegrad", "commit": "c802255" } @@ -24,7 +24,7 @@ "ljspeech": { "tacotron2-DDC": { "description": "Tacotron2 with Double Decoder Consistency.", - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.0.12/tts_models--en--ljspeech--tacotron2-DDC.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--tacotron2-DDC.zip", "default_vocoder": "vocoder_models/en/ljspeech/hifigan_v2", "commit": "bae2ad0f", "author": "Eren Gölge @erogol", @@ -33,7 +33,7 @@ }, "tacotron2-DDC_ph": { "description": "Tacotron2 with Double Decoder Consistency with phonemes.", - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.0_models/tts_models--en--ljspeech--tacotron2-DDC_ph.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--tacotron2-DDC_ph.zip", "default_vocoder": "vocoder_models/en/ljspeech/univnet", "commit": "3900448", "author": "Eren Gölge @erogol", @@ -42,7 +42,7 @@ }, "glow-tts": { "description": "", - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.0.9/tts_models--en--ljspeech--glow-tts.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--glow-tts.zip", "stats_file": null, "default_vocoder": "vocoder_models/en/ljspeech/multiband-melgan", "commit": "", @@ -52,7 +52,7 @@ }, "speedy-speech": { "description": "Speedy Speech model trained on LJSpeech dataset using the Alignment Network for learning the durations.", - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.3.0/tts_models--en--ljspeech--speedy_speech.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--speedy-speech.zip", "stats_file": null, "default_vocoder": "vocoder_models/en/ljspeech/hifigan_v2", "commit": "4581e3d", @@ -62,7 +62,7 @@ }, "tacotron2-DCA": { "description": "", - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.0.9/tts_models--en--ljspeech--tacotron2-DCA.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--tacotron2-DCA.zip", "default_vocoder": 
"vocoder_models/en/ljspeech/multiband-melgan", "commit": "", "author": "Eren Gölge @erogol", @@ -71,7 +71,7 @@ }, "vits": { "description": "VITS is an End2End TTS model trained on LJSpeech dataset with phonemes.", - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.0_models/tts_models--en--ljspeech--vits.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--vits.zip", "default_vocoder": null, "commit": "3900448", "author": "Eren Gölge @erogol", @@ -80,7 +80,7 @@ }, "fast_pitch": { "description": "FastPitch model trained on LJSpeech using the Aligner Network", - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.2.2/tts_models--en--ljspeech--fast_pitch.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--fast_pitch.zip", "default_vocoder": "vocoder_models/en/ljspeech/hifigan_v2", "commit": "b27b3ba", "author": "Eren Gölge @erogol", @@ -91,7 +91,7 @@ "vctk": { "vits": { "description": "VITS End2End TTS model trained on VCTK dataset with 109 different speakers with EN accent.", - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.0_models/tts_models--en--vctk--vits.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--vctk--vits.zip", "default_vocoder": null, "commit": "3900448", "author": "Eren @erogol", @@ -100,7 +100,7 @@ }, "fast_pitch":{ "description": "FastPitch model trained on VCTK dataseset.", - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.0_models/tts_models--en--vctk--fast_pitch.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--vctk--fast_pitch.zip", "default_vocoder": null, "commit": "bdab788d", "author": "Eren @erogol", @@ -111,7 +111,7 @@ "sam": { "tacotron-DDC": { "description": "Tacotron2 with Double Decoder Consistency trained with Aceenture's Sam dataset.", - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.0.13/tts_models--en--sam--tacotron_DDC.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--sam--tacotron-DDC.zip", "default_vocoder": "vocoder_models/en/sam/hifigan_v2", "commit": "bae2ad0f", "author": "Eren Gölge @erogol", @@ -123,7 +123,7 @@ "es": { "mai": { "tacotron2-DDC": { - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.0.9/tts_models--es--mai--tacotron2-DDC.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--es--mai--tacotron2-DDC.zip", "default_vocoder": "vocoder_models/universal/libri-tts/fullband-melgan", "commit": "", "author": "Eren Gölge @erogol", @@ -135,7 +135,7 @@ "fr": { "mai": { "tacotron2-DDC": { - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.0.9/tts_models--fr--mai--tacotron2-DDC.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--fr--mai--tacotron2-DDC.zip", "default_vocoder": "vocoder_models/universal/libri-tts/fullband-melgan", "commit": "", "author": "Eren Gölge @erogol", @@ -147,7 +147,7 @@ "uk":{ "mai": { "glow-tts": { - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.0_models/tts_models--uk--mai--glow-tts.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--uk--mai--glow-tts.zip", "author":"@robinhad", "commit": "bdab788d", "license": "MIT", @@ -159,7 +159,7 @@ "zh-CN": { "baker": { "tacotron2-DDC-GST": { - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.0_models/tts_models--zh-CN--baker--tacotron2-DDC-GST.zip", + "github_rls_url": 
"https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--zh-CN--baker--tacotron2-DDC-GST.zip", "commit": "unknown", "author": "@kirianguiller", "default_vocoder": null @@ -169,7 +169,7 @@ "nl": { "mai": { "tacotron2-DDC": { - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.0.10/tts_models--nl--mai--tacotron2-DDC.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--nl--mai--tacotron2-DDC.zip", "author": "@r-dh", "default_vocoder": "vocoder_models/nl/mai/parallel-wavegan", "stats_file": null, @@ -180,7 +180,7 @@ "de": { "thorsten": { "tacotron2-DCA": { - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.0.11/tts_models--de--thorsten--tacotron2-DCA.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--de--thorsten--tacotron2-DCA.zip", "default_vocoder": "vocoder_models/de/thorsten/fullband-melgan", "author": "@thorstenMueller", "commit": "unknown" @@ -190,7 +190,7 @@ "ja": { "kokoro": { "tacotron2-DDC": { - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.0.15/tts_models--jp--kokoro--tacotron2-DDC.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--ja--kokoro--tacotron2-DDC.zip", "default_vocoder": "vocoder_models/ja/kokoro/hifigan_v1", "description": "Tacotron2 with Double Decoder Consistency trained with Kokoro Speech Dataset.", "author": "@kaiidams", @@ -201,7 +201,7 @@ "tr":{ "common-voice": { "glow-tts":{ - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.0_models/tts_models--tr--common-voice--glow-tts.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--tr--common-voice--glow-tts.zip", "default_vocoder": "vocoder_models/tr/common-voice/hifigan", "license": "MIT", "description": "Turkish GlowTTS model using an unknown speaker from the Common-Voice dataset.", @@ -213,14 +213,14 @@ "it": { "mai_female": { "glow-tts":{ - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.0_models/tts_models--it--mai_female--glow-tts.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--it--mai_female--glow-tts.zip", "default_vocoder": null, "description": "GlowTTS model as explained on https://github.com/coqui-ai/TTS/issues/1148.", "author": "@nicolalandro", "commit": null }, "vits":{ - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.0_models/tts_models--it--mai_female--vits.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--it--mai_female--vits.zip", "default_vocoder": null, "description": "GlowTTS model as explained on https://github.com/coqui-ai/TTS/issues/1148.", "author": "@nicolalandro", @@ -229,14 +229,14 @@ }, "mai_male": { "glow-tts":{ - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.0_models/tts_models--it--mai_male--glow-tts.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--it--mai_male--glow-tts.zip", "default_vocoder": null, "description": "GlowTTS model as explained on https://github.com/coqui-ai/TTS/issues/1148.", "author": "@nicolalandro", "commit": null }, "vits":{ - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.0_models/tts_models--it--mai_male--vits.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--it--mai_male--vits.zip", "default_vocoder": null, "description": "GlowTTS model as explained on https://github.com/coqui-ai/TTS/issues/1148.", "author": "@nicolalandro", @@ -249,14 +249,14 @@ "universal": { "libri-tts": { "wavegrad": { - "github_rls_url": 
"https://coqui.gateway.scarf.sh/v0.0.9/vocoder_models--universal--libri-tts--wavegrad.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--universal--libri-tts--wavegrad.zip", "commit": "ea976b0", "author": "Eren Gölge @erogol", "license": "MPL", "contact": "egolge@coqui.com" }, "fullband-melgan": { - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.0.9/vocoder_models--universal--libri-tts--fullband-melgan.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--universal--libri-tts--fullband-melgan.zip", "commit": "4132240", "author": "Eren Gölge @erogol", "license": "MPL", @@ -268,13 +268,13 @@ "ek1": { "wavegrad": { "description": "EK1 en-rp wavegrad by NMStoker", - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.0.10/vocoder_models--en--ek1--wavegrad.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--en--ek1--wavegrad.zip", "commit": "c802255" } }, "ljspeech": { "multiband-melgan": { - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.0.9/vocoder_models--en--ljspeech--mulitband-melgan.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--en--ljspeech--multiband-melgan.zip", "commit": "ea976b0", "author": "Eren Gölge @erogol", "license": "MPL", @@ -282,7 +282,7 @@ }, "hifigan_v2": { "description": "HiFiGAN_v2 LJSpeech vocoder from https://arxiv.org/abs/2010.05646.", - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.0.12/vocoder_model--en--ljspeech-hifigan_v2.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--en--ljspeech--hifigan_v2.zip", "commit": "bae2ad0f", "author": "@erogol", "license": "", @@ -290,7 +290,7 @@ }, "univnet": { "description": "UnivNet model finetuned on TacotronDDC_ph spectrograms for better compatibility.", - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.3.0/vocoder_models--en--ljspeech--univnet_v2.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--en--ljspeech--univnet_v2.zip", "commit": "4581e3d", "author": "Eren @erogol", "license": "TBD", @@ -300,7 +300,7 @@ "vctk": { "hifigan_v2": { "description": "Finetuned and intended to be used with tts_models/en/vctk/sc-glow-tts", - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.0.12/vocoder_model--en--vctk--hifigan_v2.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--en--vctk--hifigan_v2.zip", "commit": "2f07160", "author": "Edresson Casanova", "license": "", @@ -310,7 +310,7 @@ "sam": { "hifigan_v2": { "description": "Finetuned and intended to be used with tts_models/en/sam/tacotron_DDC", - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.0.13/vocoder_models--en--sam--hifigan_v2.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--en--sam--hifigan_v2.zip", "commit": "2f07160", "author": "Eren Gölge @erogol", "license": "", @@ -321,7 +321,7 @@ "nl": { "mai": { "parallel-wavegan": { - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.0.10/vocoder_models--nl--mai--parallel-wavegan.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--nl--mai--parallel-wavegan.zip", "author": "@r-dh", "commit": "unknown" } @@ -330,12 +330,12 @@ "de": { "thorsten": { "wavegrad": { - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.0.11/vocoder_models--de--thorsten--wavegrad.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--de--thorsten--wavegrad.zip", 
"author": "@thorstenMueller", "commit": "unknown" }, "fullband-melgan": { - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.1.3/vocoder_models--de--thorsten--fullband-melgan.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--de--thorsten--fullband-melgan.zip", "author": "@thorstenMueller", "commit": "unknown" } @@ -344,7 +344,7 @@ "ja": { "kokoro": { "hifigan_v1": { - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.2.0/vocoder_models--ja--kokoro--hifigan_v1.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--ja--kokoro--hifigan_v1.zip", "description": "HifiGAN model trained for kokoro dataset by @kaiidams", "author": "@kaiidams", "commit": "3900448" @@ -354,7 +354,7 @@ "uk": { "mai": { "multiband-melgan": { - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.5.0_models/vocoder_models--uk--mai--multiband-melgan.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--uk--mai--multiband-melgan.zip", "author":"@robinhad", "commit": "bdab788d", "license": "MIT", @@ -365,7 +365,7 @@ "tr":{ "common-voice": { "hifigan":{ - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.0_models/vocoder_models--tr--common-voice--hifigan.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--tr--common-voice--hifigan.zip", "description": "HifiGAN model using an unknown speaker from the Common-Voice dataset.", "author": "Fatih Akademi", "license": "MIT", diff --git a/TTS/utils/synthesizer.py b/TTS/utils/synthesizer.py index 3dd8be44..eef4086c 100644 --- a/TTS/utils/synthesizer.py +++ b/TTS/utils/synthesizer.py @@ -109,12 +109,11 @@ class Synthesizer(object): """ # pylint: disable=global-statement self.tts_config = load_config(tts_config_path) - self.use_phonemes = self.tts_config.use_phonemes - self.tts_model = setup_tts_model(config=self.tts_config) - - if self.use_phonemes and self.tts_config["phonemizer"] is None: + if self.tts_config["use_phonemes"] and self.tts_config["phonemizer"] is None: raise ValueError("Phonemizer is not defined in the TTS config.") + self.tts_model = setup_tts_model(config=self.tts_config) + if not self.encoder_checkpoint: self._set_speaker_encoder_paths_from_tts_config() From 060e0f9368eb6237cf330502b9869b4e87de6c12 Mon Sep 17 00:00:00 2001 From: Edresson Casanova Date: Thu, 31 Mar 2022 08:41:16 -0300 Subject: [PATCH 30/38] Add EmbeddingManager and BaseIDManager (#1374) --- TTS/bin/compute_embeddings.py | 6 +- TTS/bin/eval_encoder.py | 10 +- TTS/bin/extract_tts_spectrograms.py | 4 +- TTS/bin/synthesize.py | 4 +- TTS/bin/train_encoder.py | 4 +- TTS/encoder/utils/generic_utils.py | 2 +- TTS/server/server.py | 2 +- TTS/tts/models/base_tts.py | 32 +- TTS/tts/models/glow_tts.py | 2 +- TTS/tts/models/vits.py | 44 ++- TTS/tts/utils/languages.py | 45 +-- TTS/tts/utils/managers.py | 285 ++++++++++++++++ TTS/tts/utils/speakers.py | 308 ++---------------- TTS/utils/synthesizer.py | 18 +- .../multilingual/vits_tts/train_vits_tts.py | 2 +- recipes/vctk/fast_pitch/train_fast_pitch.py | 2 +- recipes/vctk/fast_speech/train_fast_speech.py | 2 +- recipes/vctk/glow_tts/train_glow_tts.py | 2 +- .../vctk/speedy_speech/train_speedy_speech.py | 2 +- .../vctk/tacotron-DDC/train_tacotron-DDC.py | 2 +- .../vctk/tacotron2-DDC/train_tacotron2-ddc.py | 2 +- recipes/vctk/tacotron2/train_tacotron2.py | 2 +- recipes/vctk/vits/train_vits.py | 2 +- tests/aux_tests/test_speaker_manager.py | 22 +- tests/tts_tests/test_glow_tts.py | 2 +- tests/tts_tests/test_vits.py | 
6 +- tests/zoo_tests/test_models.py | 2 +- 27 files changed, 412 insertions(+), 404 deletions(-) create mode 100644 TTS/tts/utils/managers.py diff --git a/TTS/bin/compute_embeddings.py b/TTS/bin/compute_embeddings.py index d7a2c5f6..b62d603a 100644 --- a/TTS/bin/compute_embeddings.py +++ b/TTS/bin/compute_embeddings.py @@ -49,7 +49,7 @@ encoder_manager = SpeakerManager( use_cuda=args.use_cuda, ) -class_name_key = encoder_manager.speaker_encoder_config.class_name_key +class_name_key = encoder_manager.encoder_config.class_name_key # compute speaker embeddings speaker_mapping = {} @@ -63,10 +63,10 @@ for idx, wav_file in enumerate(tqdm(wav_files)): wav_file_name = os.path.basename(wav_file) if args.old_file is not None and wav_file_name in encoder_manager.clip_ids: # get the embedding from the old file - embedd = encoder_manager.get_d_vector_by_clip(wav_file_name) + embedd = encoder_manager.get_embedding_by_clip(wav_file_name) else: # extract the embedding - embedd = encoder_manager.compute_d_vector_from_clip(wav_file) + embedd = encoder_manager.compute_embedding_from_clip(wav_file) # create speaker_mapping if target dataset is defined speaker_mapping[wav_file_name] = {} diff --git a/TTS/bin/eval_encoder.py b/TTS/bin/eval_encoder.py index 089f3645..7f9fdf93 100644 --- a/TTS/bin/eval_encoder.py +++ b/TTS/bin/eval_encoder.py @@ -11,8 +11,8 @@ from TTS.tts.utils.speakers import SpeakerManager def compute_encoder_accuracy(dataset_items, encoder_manager): - class_name_key = encoder_manager.speaker_encoder_config.class_name_key - map_classid_to_classname = getattr(encoder_manager.speaker_encoder_config, "map_classid_to_classname", None) + class_name_key = encoder_manager.encoder_config.class_name_key + map_classid_to_classname = getattr(encoder_manager.encoder_config, "map_classid_to_classname", None) class_acc_dict = {} @@ -22,13 +22,13 @@ def compute_encoder_accuracy(dataset_items, encoder_manager): wav_file = item["audio_file"] # extract the embedding - embedd = encoder_manager.compute_d_vector_from_clip(wav_file) - if encoder_manager.speaker_encoder_criterion is not None and map_classid_to_classname is not None: + embedd = encoder_manager.compute_embedding_from_clip(wav_file) + if encoder_manager.encoder_criterion is not None and map_classid_to_classname is not None: embedding = torch.FloatTensor(embedd).unsqueeze(0) if encoder_manager.use_cuda: embedding = embedding.cuda() - class_id = encoder_manager.speaker_encoder_criterion.softmax.inference(embedding).item() + class_id = encoder_manager.encoder_criterion.softmax.inference(embedding).item() predicted_label = map_classid_to_classname[str(class_id)] else: predicted_label = None diff --git a/TTS/bin/extract_tts_spectrograms.py b/TTS/bin/extract_tts_spectrograms.py index fa63c46a..a0dd0549 100755 --- a/TTS/bin/extract_tts_spectrograms.py +++ b/TTS/bin/extract_tts_spectrograms.py @@ -37,8 +37,8 @@ def setup_loader(ap, r, verbose=False): precompute_num_workers=0, use_noise_augment=False, verbose=verbose, - speaker_id_mapping=speaker_manager.speaker_ids if c.use_speaker_embedding else None, - d_vector_mapping=speaker_manager.d_vectors if c.use_d_vector_file else None, + speaker_id_mapping=speaker_manager.ids if c.use_speaker_embedding else None, + d_vector_mapping=speaker_manager.embeddings if c.use_d_vector_file else None, ) if c.use_phonemes and c.compute_input_seq_cache: diff --git a/TTS/bin/synthesize.py b/TTS/bin/synthesize.py index eb166bc8..6247b2a4 100755 --- a/TTS/bin/synthesize.py +++ b/TTS/bin/synthesize.py @@ -278,7 +278,7 @@ If you 
don't specify any models, then it uses LJSpeech based English model. print( " > Available speaker ids: (Set --speaker_idx flag to one of these values to use the multi-speaker model." ) - print(synthesizer.tts_model.speaker_manager.speaker_ids) + print(synthesizer.tts_model.speaker_manager.ids) return # query langauge ids of a multi-lingual model. @@ -286,7 +286,7 @@ If you don't specify any models, then it uses LJSpeech based English model. print( " > Available language ids: (Set --language_idx flag to one of these values to use the multi-lingual model." ) - print(synthesizer.tts_model.language_manager.language_id_mapping) + print(synthesizer.tts_model.language_manager.ids) return # check the arguments against a multi-speaker model. diff --git a/TTS/bin/train_encoder.py b/TTS/bin/train_encoder.py index b8d38bac..d28f188e 100644 --- a/TTS/bin/train_encoder.py +++ b/TTS/bin/train_encoder.py @@ -12,7 +12,7 @@ from trainer.torch import NoamLR from trainer.trainer_utils import get_optimizer from TTS.encoder.dataset import EncoderDataset -from TTS.encoder.utils.generic_utils import save_best_model, save_checkpoint, setup_speaker_encoder_model +from TTS.encoder.utils.generic_utils import save_best_model, save_checkpoint, setup_encoder_model from TTS.encoder.utils.samplers import PerfectBatchSampler from TTS.encoder.utils.training import init_training from TTS.encoder.utils.visual import plot_embeddings @@ -258,7 +258,7 @@ def main(args): # pylint: disable=redefined-outer-name global train_classes ap = AudioProcessor(**c.audio) - model = setup_speaker_encoder_model(c) + model = setup_encoder_model(c) optimizer = get_optimizer(c.optimizer, c.optimizer_params, c.lr, model) diff --git a/TTS/encoder/utils/generic_utils.py b/TTS/encoder/utils/generic_utils.py index 19c00582..91a896f6 100644 --- a/TTS/encoder/utils/generic_utils.py +++ b/TTS/encoder/utils/generic_utils.py @@ -125,7 +125,7 @@ def to_camel(text): return re.sub(r"(?!^)_([a-zA-Z])", lambda m: m.group(1).upper(), text) -def setup_speaker_encoder_model(config: "Coqpit"): +def setup_encoder_model(config: "Coqpit"): if config.model_params["model_name"].lower() == "lstm": model = LSTMSpeakerEncoder( config.model_params["input_dim"], diff --git a/TTS/server/server.py b/TTS/server/server.py index aef507fd..fd53e76d 100644 --- a/TTS/server/server.py +++ b/TTS/server/server.py @@ -143,7 +143,7 @@ def index(): "index.html", show_details=args.show_details, use_multi_speaker=use_multi_speaker, - speaker_ids=speaker_manager.speaker_ids if speaker_manager is not None else None, + speaker_ids=speaker_manager.ids if speaker_manager is not None else None, use_gst=use_gst, ) diff --git a/TTS/tts/models/base_tts.py b/TTS/tts/models/base_tts.py index 945c031f..652b77dd 100644 --- a/TTS/tts/models/base_tts.py +++ b/TTS/tts/models/base_tts.py @@ -136,18 +136,18 @@ class BaseTTS(BaseTrainerModel): if hasattr(self, "speaker_manager"): if config.use_d_vector_file: if speaker_name is None: - d_vector = self.speaker_manager.get_random_d_vector() + d_vector = self.speaker_manager.get_random_embeddings() else: - d_vector = self.speaker_manager.get_d_vector_by_speaker(speaker_name) + d_vector = self.speaker_manager.get_d_vector_by_name(speaker_name) elif config.use_speaker_embedding: if speaker_name is None: - speaker_id = self.speaker_manager.get_random_speaker_id() + speaker_id = self.speaker_manager.get_random_id() else: - speaker_id = self.speaker_manager.speaker_ids[speaker_name] + speaker_id = self.speaker_manager.ids[speaker_name] # get language id if 
hasattr(self, "language_manager") and config.use_language_embedding and language_name is not None: - language_id = self.language_manager.language_id_mapping[language_name] + language_id = self.language_manager.ids[language_name] return { "text": text, @@ -279,23 +279,19 @@ class BaseTTS(BaseTrainerModel): # setup multi-speaker attributes if hasattr(self, "speaker_manager") and self.speaker_manager is not None: if hasattr(config, "model_args"): - speaker_id_mapping = ( - self.speaker_manager.speaker_ids if config.model_args.use_speaker_embedding else None - ) - d_vector_mapping = self.speaker_manager.d_vectors if config.model_args.use_d_vector_file else None + speaker_id_mapping = self.speaker_manager.ids if config.model_args.use_speaker_embedding else None + d_vector_mapping = self.speaker_manager.embeddings if config.model_args.use_d_vector_file else None config.use_d_vector_file = config.model_args.use_d_vector_file else: - speaker_id_mapping = self.speaker_manager.speaker_ids if config.use_speaker_embedding else None - d_vector_mapping = self.speaker_manager.d_vectors if config.use_d_vector_file else None + speaker_id_mapping = self.speaker_manager.ids if config.use_speaker_embedding else None + d_vector_mapping = self.speaker_manager.embeddings if config.use_d_vector_file else None else: speaker_id_mapping = None d_vector_mapping = None # setup multi-lingual attributes if hasattr(self, "language_manager") and self.language_manager is not None: - language_id_mapping = ( - self.language_manager.language_id_mapping if self.args.use_language_embedding else None - ) + language_id_mapping = self.language_manager.ids if self.args.use_language_embedding else None else: language_id_mapping = None @@ -352,13 +348,13 @@ class BaseTTS(BaseTrainerModel): d_vector = None if self.config.use_d_vector_file: - d_vector = [self.speaker_manager.d_vectors[name]["embedding"] for name in self.speaker_manager.d_vectors] + d_vector = [self.speaker_manager.embeddings[name]["embedding"] for name in self.speaker_manager.embeddings] d_vector = (random.sample(sorted(d_vector), 1),) aux_inputs = { "speaker_id": None if not self.config.use_speaker_embedding - else random.sample(sorted(self.speaker_manager.speaker_ids.values()), 1), + else random.sample(sorted(self.speaker_manager.ids.values()), 1), "d_vector": d_vector, "style_wav": None, # TODO: handle GST style input } @@ -405,7 +401,7 @@ class BaseTTS(BaseTrainerModel): """Save the speaker.json and language_ids.json at the beginning of the training. 
Also update both paths.""" if self.speaker_manager is not None: output_path = os.path.join(trainer.output_path, "speakers.json") - self.speaker_manager.save_speaker_ids_to_file(output_path) + self.speaker_manager.save_ids_to_file(output_path) trainer.config.speakers_file = output_path # some models don't have `model_args` set if hasattr(trainer.config, "model_args"): @@ -416,7 +412,7 @@ class BaseTTS(BaseTrainerModel): if hasattr(self, "language_manager") and self.language_manager is not None: output_path = os.path.join(trainer.output_path, "language_ids.json") - self.language_manager.save_language_ids_to_file(output_path) + self.language_manager.save_ids_to_file(output_path) trainer.config.language_ids_file = output_path if hasattr(trainer.config, "model_args"): trainer.config.model_args.language_ids_file = output_path diff --git a/TTS/tts/models/glow_tts.py b/TTS/tts/models/glow_tts.py index fea570a6..7c0f95e1 100644 --- a/TTS/tts/models/glow_tts.py +++ b/TTS/tts/models/glow_tts.py @@ -124,7 +124,7 @@ class GlowTTS(BaseTTS): ) if self.speaker_manager is not None: assert ( - config.d_vector_dim == self.speaker_manager.d_vector_dim + config.d_vector_dim == self.speaker_manager.embedding_dim ), " [!] d-vector dimension mismatch b/w config and speaker manager." # init speaker embedding layer if config.use_speaker_embedding and not config.use_d_vector_file: diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py index 87d559fc..943b9eae 100644 --- a/TTS/tts/models/vits.py +++ b/TTS/tts/models/vits.py @@ -652,28 +652,28 @@ class Vits(BaseTTS): # TODO: make this a function if self.args.use_speaker_encoder_as_loss: - if self.speaker_manager.speaker_encoder is None and ( + if self.speaker_manager.encoder is None and ( not self.args.speaker_encoder_model_path or not self.args.speaker_encoder_config_path ): raise RuntimeError( " [!] To use the speaker consistency loss (SCL) you need to specify speaker_encoder_model_path and speaker_encoder_config_path !!" 
) - self.speaker_manager.speaker_encoder.eval() + self.speaker_manager.encoder.eval() print(" > External Speaker Encoder Loaded !!") if ( - hasattr(self.speaker_manager.speaker_encoder, "audio_config") - and self.config.audio["sample_rate"] != self.speaker_manager.speaker_encoder.audio_config["sample_rate"] + hasattr(self.speaker_manager.encoder, "audio_config") + and self.config.audio["sample_rate"] != self.speaker_manager.encoder.audio_config["sample_rate"] ): self.audio_transform = torchaudio.transforms.Resample( orig_freq=self.audio_config["sample_rate"], - new_freq=self.speaker_manager.speaker_encoder.audio_config["sample_rate"], + new_freq=self.speaker_manager.encoder.audio_config["sample_rate"], ) # pylint: disable=W0101,W0105 self.audio_transform = torchaudio.transforms.Resample( orig_freq=self.config.audio.sample_rate, - new_freq=self.speaker_manager.speaker_encoder.audio_config["sample_rate"], + new_freq=self.speaker_manager.encoder.audio_config["sample_rate"], ) def _init_speaker_embedding(self): @@ -887,7 +887,7 @@ class Vits(BaseTTS): pad_short=True, ) - if self.args.use_speaker_encoder_as_loss and self.speaker_manager.speaker_encoder is not None: + if self.args.use_speaker_encoder_as_loss and self.speaker_manager.encoder is not None: # concate generated and GT waveforms wavs_batch = torch.cat((wav_seg, o), dim=0) @@ -896,7 +896,7 @@ class Vits(BaseTTS): if self.audio_transform is not None: wavs_batch = self.audio_transform(wavs_batch) - pred_embs = self.speaker_manager.speaker_encoder.forward(wavs_batch, l2_norm=True) + pred_embs = self.speaker_manager.encoder.forward(wavs_batch, l2_norm=True) # split generated and GT speaker embeddings gt_spk_emb, syn_spk_emb = torch.chunk(pred_embs, 2, dim=0) @@ -1223,18 +1223,18 @@ class Vits(BaseTTS): if hasattr(self, "speaker_manager"): if config.use_d_vector_file: if speaker_name is None: - d_vector = self.speaker_manager.get_random_d_vector() + d_vector = self.speaker_manager.get_random_embeddings() else: - d_vector = self.speaker_manager.get_mean_d_vector(speaker_name, num_samples=None, randomize=False) + d_vector = self.speaker_manager.get_mean_embedding(speaker_name, num_samples=None, randomize=False) elif config.use_speaker_embedding: if speaker_name is None: - speaker_id = self.speaker_manager.get_random_speaker_id() + speaker_id = self.speaker_manager.get_random_id() else: - speaker_id = self.speaker_manager.speaker_ids[speaker_name] + speaker_id = self.speaker_manager.ids[speaker_name] # get language id if hasattr(self, "language_manager") and config.use_language_embedding and language_name is not None: - language_id = self.language_manager.language_id_mapping[language_name] + language_id = self.language_manager.ids[language_name] return { "text": text, @@ -1289,26 +1289,22 @@ class Vits(BaseTTS): d_vectors = None # get numerical speaker ids from speaker names - if self.speaker_manager is not None and self.speaker_manager.speaker_ids and self.args.use_speaker_embedding: - speaker_ids = [self.speaker_manager.speaker_ids[sn] for sn in batch["speaker_names"]] + if self.speaker_manager is not None and self.speaker_manager.ids and self.args.use_speaker_embedding: + speaker_ids = [self.speaker_manager.ids[sn] for sn in batch["speaker_names"]] if speaker_ids is not None: speaker_ids = torch.LongTensor(speaker_ids) batch["speaker_ids"] = speaker_ids # get d_vectors from audio file names - if self.speaker_manager is not None and self.speaker_manager.d_vectors and self.args.use_d_vector_file: - d_vector_mapping = 
self.speaker_manager.d_vectors + if self.speaker_manager is not None and self.speaker_manager.embeddings and self.args.use_d_vector_file: + d_vector_mapping = self.speaker_manager.embeddings d_vectors = [d_vector_mapping[w]["embedding"] for w in batch["audio_files"]] d_vectors = torch.FloatTensor(d_vectors) # get language ids from language names - if ( - self.language_manager is not None - and self.language_manager.language_id_mapping - and self.args.use_language_embedding - ): - language_ids = [self.language_manager.language_id_mapping[ln] for ln in batch["language_names"]] + if self.language_manager is not None and self.language_manager.ids and self.args.use_language_embedding: + language_ids = [self.language_manager.ids[ln] for ln in batch["language_names"]] if language_ids is not None: language_ids = torch.LongTensor(language_ids) @@ -1490,7 +1486,7 @@ class Vits(BaseTTS): language_manager = LanguageManager.init_from_config(config) if config.model_args.speaker_encoder_model_path: - speaker_manager.init_speaker_encoder( + speaker_manager.init_encoder( config.model_args.speaker_encoder_model_path, config.model_args.speaker_encoder_config_path ) return Vits(new_config, ap, tokenizer, speaker_manager, language_manager) diff --git a/TTS/tts/utils/languages.py b/TTS/tts/utils/languages.py index 7decabb0..9b5e2007 100644 --- a/TTS/tts/utils/languages.py +++ b/TTS/tts/utils/languages.py @@ -1,6 +1,5 @@ -import json import os -from typing import Dict, List +from typing import Any, Dict, List import fsspec import numpy as np @@ -8,9 +7,10 @@ import torch from coqpit import Coqpit from TTS.config import check_config_and_model_args +from TTS.tts.utils.managers import BaseIDManager -class LanguageManager: +class LanguageManager(BaseIDManager): """Manage the languages for multi-lingual 🐸TTS models. Load a datafile and parse the information in a way that can be queried by language. @@ -25,37 +25,23 @@ class LanguageManager: >>> language_id_mapper = manager.language_ids """ - language_id_mapping: Dict = {} - def __init__( self, language_ids_file_path: str = "", config: Coqpit = None, ): - self.language_id_mapping = {} - if language_ids_file_path: - self.set_language_ids_from_file(language_ids_file_path) + super().__init__(id_file_path=language_ids_file_path) if config: self.set_language_ids_from_config(config) - @staticmethod - def _load_json(json_file_path: str) -> Dict: - with fsspec.open(json_file_path, "r") as f: - return json.load(f) - - @staticmethod - def _save_json(json_file_path: str, data: dict) -> None: - with fsspec.open(json_file_path, "w") as f: - json.dump(data, f, indent=4) - @property def num_languages(self) -> int: - return len(list(self.language_id_mapping.keys())) + return len(list(self.ids.keys())) @property def language_names(self) -> List: - return list(self.language_id_mapping.keys()) + return list(self.ids.keys()) @staticmethod def parse_language_ids_from_config(c: Coqpit) -> Dict: @@ -79,25 +65,24 @@ class LanguageManager: """Set language IDs from config samples. Args: - items (List): Data sampled returned by `load_meta_data()`. + c (Coqpit): Config. """ - self.language_id_mapping = self.parse_language_ids_from_config(c) + self.ids = self.parse_language_ids_from_config(c) - def set_language_ids_from_file(self, file_path: str) -> None: - """Load language ids from a json file. + @staticmethod + def parse_ids_from_data(items: List, parse_key: str) -> Any: + raise NotImplementedError - Args: - file_path (str): Path to the target json file. 
- """ - self.language_id_mapping = self._load_json(file_path) + def set_ids_from_data(self, items: List, parse_key: str) -> Any: + raise NotImplementedError - def save_language_ids_to_file(self, file_path: str) -> None: + def save_ids_to_file(self, file_path: str) -> None: """Save language IDs to a json file. Args: file_path (str): Path to the output file. """ - self._save_json(file_path, self.language_id_mapping) + self._save_json(file_path, self.ids) @staticmethod def init_from_config(config: Coqpit) -> "LanguageManager": diff --git a/TTS/tts/utils/managers.py b/TTS/tts/utils/managers.py new file mode 100644 index 00000000..85ed53cc --- /dev/null +++ b/TTS/tts/utils/managers.py @@ -0,0 +1,285 @@ +import json +import random +from typing import Any, Dict, List, Tuple, Union + +import fsspec +import numpy as np +import torch + +from TTS.config import load_config +from TTS.encoder.utils.generic_utils import setup_encoder_model +from TTS.utils.audio import AudioProcessor + + +class BaseIDManager: + """Base `ID` Manager class. Every new `ID` manager must inherit this. + It defines common `ID` manager specific functions. + """ + + def __init__(self, id_file_path: str = ""): + self.ids = {} + + if id_file_path: + self.load_ids_from_file(id_file_path) + + @staticmethod + def _load_json(json_file_path: str) -> Dict: + with fsspec.open(json_file_path, "r") as f: + return json.load(f) + + @staticmethod + def _save_json(json_file_path: str, data: dict) -> None: + with fsspec.open(json_file_path, "w") as f: + json.dump(data, f, indent=4) + + def set_ids_from_data(self, items: List, parse_key: str) -> None: + """Set IDs from data samples. + + Args: + items (List): Data sampled returned by `load_tts_samples()`. + """ + self.ids = self.parse_ids_from_data(items, parse_key=parse_key) + + def load_ids_from_file(self, file_path: str) -> None: + """Set IDs from a file. + + Args: + file_path (str): Path to the file. + """ + self.ids = self._load_json(file_path) + + def save_ids_to_file(self, file_path: str) -> None: + """Save IDs to a json file. + + Args: + file_path (str): Path to the output file. + """ + self._save_json(file_path, self.ids) + + def get_random_id(self) -> Any: + """Get a random embedding. + + Args: + + Returns: + np.ndarray: embedding. + """ + if self.ids: + return self.ids[random.choices(list(self.ids.keys()))[0]] + + return None + + @staticmethod + def parse_ids_from_data(items: List, parse_key: str) -> Tuple[Dict]: + """Parse IDs from data samples retured by `load_tts_samples()`. + + Args: + items (list): Data sampled returned by `load_tts_samples()`. + parse_key (str): The key to being used to parse the data. + Returns: + Tuple[Dict]: speaker IDs. + """ + classes = sorted({item[parse_key] for item in items}) + ids = {name: i for i, name in enumerate(classes)} + return ids + + +class EmbeddingManager(BaseIDManager): + """Base `Embedding` Manager class. Every new `Embedding` manager must inherit this. + It defines common `Embedding` manager specific functions. 
+ """ + + def __init__( + self, + embedding_file_path: str = "", + id_file_path: str = "", + encoder_model_path: str = "", + encoder_config_path: str = "", + use_cuda: bool = False, + ): + super().__init__(id_file_path=id_file_path) + + self.embeddings = {} + self.embeddings_by_names = {} + self.clip_ids = [] + self.encoder = None + self.encoder_ap = None + self.use_cuda = use_cuda + + if embedding_file_path: + self.load_embeddings_from_file(embedding_file_path) + + if encoder_model_path and encoder_config_path: + self.init_encoder(encoder_model_path, encoder_config_path) + + @property + def embedding_dim(self): + """Dimensionality of embeddings. If embeddings are not loaded, returns zero.""" + if self.embeddings: + return len(self.embeddings[list(self.embeddings.keys())[0]]["embedding"]) + return 0 + + def save_embeddings_to_file(self, file_path: str) -> None: + """Save embeddings to a json file. + + Args: + file_path (str): Path to the output file. + """ + self._save_json(file_path, self.embeddings) + + def load_embeddings_from_file(self, file_path: str) -> None: + """Load embeddings from a json file. + + Args: + file_path (str): Path to the target json file. + """ + self.embeddings = self._load_json(file_path) + + speakers = sorted({x["name"] for x in self.embeddings.values()}) + self.ids = {name: i for i, name in enumerate(speakers)} + + self.clip_ids = list(set(sorted(clip_name for clip_name in self.embeddings.keys()))) + # cache embeddings_by_names for fast inference using a bigger speakers.json + self.embeddings_by_names = self.get_embeddings_by_names() + + def get_embedding_by_clip(self, clip_idx: str) -> List: + """Get embedding by clip ID. + + Args: + clip_idx (str): Target clip ID. + + Returns: + List: embedding as a list. + """ + return self.embeddings[clip_idx]["embedding"] + + def get_embeddings_by_name(self, idx: str) -> List[List]: + """Get all embeddings of a speaker. + + Args: + idx (str): Target name. + + Returns: + List[List]: all the embeddings of the given speaker. + """ + return self.embeddings_by_names[idx] + + def get_embeddings_by_names(self) -> Dict: + """Get all embeddings by names. + + Returns: + Dict: all the embeddings of each speaker. + """ + embeddings_by_names = {} + for x in self.embeddings.values(): + if x["name"] not in embeddings_by_names.keys(): + embeddings_by_names[x["name"]] = [x["embedding"]] + else: + embeddings_by_names[x["name"]].append(x["embedding"]) + return embeddings_by_names + + def get_mean_embedding(self, idx: str, num_samples: int = None, randomize: bool = False) -> np.ndarray: + """Get mean embedding of a idx. + + Args: + idx (str): Target name. + num_samples (int, optional): Number of samples to be averaged. Defaults to None. + randomize (bool, optional): Pick random `num_samples` of embeddings. Defaults to False. + + Returns: + np.ndarray: Mean embedding. + """ + embeddings = self.get_embeddings_by_name(idx) + if num_samples is None: + embeddings = np.stack(embeddings).mean(0) + else: + assert len(embeddings) >= num_samples, f" [!] {idx} has number of samples < {num_samples}" + if randomize: + embeddings = np.stack(random.choices(embeddings, k=num_samples)).mean(0) + else: + embeddings = np.stack(embeddings[:num_samples]).mean(0) + return embeddings + + def get_random_embedding(self) -> Any: + """Get a random embedding. + + Args: + + Returns: + np.ndarray: embedding. 
+ """ + if self.embeddings: + return self.embeddings[random.choices(list(self.embeddings.keys()))[0]]["embedding"] + + return None + + def get_clips(self) -> List: + return sorted(self.embeddings.keys()) + + def init_encoder(self, model_path: str, config_path: str) -> None: + """Initialize a speaker encoder model. + + Args: + model_path (str): Model file path. + config_path (str): Model config file path. + """ + self.encoder_config = load_config(config_path) + self.encoder = setup_encoder_model(self.encoder_config) + self.encoder_criterion = self.encoder.load_checkpoint( + self.encoder_config, model_path, eval=True, use_cuda=self.use_cuda + ) + self.encoder_ap = AudioProcessor(**self.encoder_config.audio) + + def compute_embedding_from_clip(self, wav_file: Union[str, List[str]]) -> list: + """Compute a embedding from a given audio file. + + Args: + wav_file (Union[str, List[str]]): Target file path. + + Returns: + list: Computed embedding. + """ + + def _compute(wav_file: str): + waveform = self.encoder_ap.load_wav(wav_file, sr=self.encoder_ap.sample_rate) + if not self.encoder_config.model_params.get("use_torch_spec", False): + m_input = self.encoder_ap.melspectrogram(waveform) + m_input = torch.from_numpy(m_input) + else: + m_input = torch.from_numpy(waveform) + + if self.use_cuda: + m_input = m_input.cuda() + m_input = m_input.unsqueeze(0) + embedding = self.encoder.compute_embedding(m_input) + return embedding + + if isinstance(wav_file, list): + # compute the mean embedding + embeddings = None + for wf in wav_file: + embedding = _compute(wf) + if embeddings is None: + embeddings = embedding + else: + embeddings += embedding + return (embeddings / len(wav_file))[0].tolist() + embedding = _compute(wav_file) + return embedding[0].tolist() + + def compute_embeddings(self, feats: Union[torch.Tensor, np.ndarray]) -> List: + """Compute embedding from features. + + Args: + feats (Union[torch.Tensor, np.ndarray]): Input features. + + Returns: + List: computed embedding. + """ + if isinstance(feats, np.ndarray): + feats = torch.from_numpy(feats) + if feats.ndim == 2: + feats = feats.unsqueeze(0) + if self.use_cuda: + feats = feats.cuda() + return self.encoder.compute_embedding(feats) diff --git a/TTS/tts/utils/speakers.py b/TTS/tts/utils/speakers.py index 0227412d..284d0179 100644 --- a/TTS/tts/utils/speakers.py +++ b/TTS/tts/utils/speakers.py @@ -1,19 +1,17 @@ import json import os -import random -from typing import Any, Dict, List, Tuple, Union +from typing import Any, Dict, List, Union import fsspec import numpy as np import torch from coqpit import Coqpit -from TTS.config import get_from_config_or_model_args_with_default, load_config -from TTS.encoder.utils.generic_utils import setup_speaker_encoder_model -from TTS.utils.audio import AudioProcessor +from TTS.config import get_from_config_or_model_args_with_default +from TTS.tts.utils.managers import EmbeddingManager -class SpeakerManager: +class SpeakerManager(EmbeddingManager): """Manage the speakers for multi-speaker 🐸TTS models. Load a datafile and parse the information in a way that can be queried by speaker or clip. 
@@ -50,7 +48,7 @@ class SpeakerManager: >>> # load a sample audio and compute embedding >>> waveform = ap.load_wav(sample_wav_path) >>> mel = ap.melspectrogram(waveform) - >>> d_vector = manager.compute_d_vector(mel.T) + >>> d_vector = manager.compute_embeddings(mel.T) """ def __init__( @@ -62,279 +60,27 @@ class SpeakerManager: encoder_config_path: str = "", use_cuda: bool = False, ): - - self.d_vectors = {} - self.speaker_ids = {} - self.d_vectors_by_speakers = {} - self.clip_ids = [] - self.speaker_encoder = None - self.speaker_encoder_ap = None - self.use_cuda = use_cuda + super().__init__( + embedding_file_path=d_vectors_file_path, + id_file_path=speaker_id_file_path, + encoder_model_path=encoder_model_path, + encoder_config_path=encoder_config_path, + use_cuda=use_cuda, + ) if data_items: - self.speaker_ids, _ = self.parse_speakers_from_data(data_items) - - if d_vectors_file_path: - self.set_d_vectors_from_file(d_vectors_file_path) - - if speaker_id_file_path: - self.set_speaker_ids_from_file(speaker_id_file_path) - - if encoder_model_path and encoder_config_path: - self.init_speaker_encoder(encoder_model_path, encoder_config_path) - - @staticmethod - def _load_json(json_file_path: str) -> Dict: - with fsspec.open(json_file_path, "r") as f: - return json.load(f) - - @staticmethod - def _save_json(json_file_path: str, data: dict) -> None: - with fsspec.open(json_file_path, "w") as f: - json.dump(data, f, indent=4) + self.set_ids_from_data(data_items, parse_key="speaker_name") @property def num_speakers(self): - return len(self.speaker_ids) + return len(self.ids) @property def speaker_names(self): - return list(self.speaker_ids.keys()) - - @property - def d_vector_dim(self): - """Dimensionality of d_vectors. If d_vectors are not loaded, returns zero.""" - if self.d_vectors: - return len(self.d_vectors[list(self.d_vectors.keys())[0]]["embedding"]) - return 0 - - @staticmethod - def parse_speakers_from_data(items: list) -> Tuple[Dict, int]: - """Parse speaker IDs from data samples retured by `load_tts_samples()`. - - Args: - items (list): Data sampled returned by `load_tts_samples()`. - - Returns: - Tuple[Dict, int]: speaker IDs and number of speakers. - """ - speakers = sorted({item["speaker_name"] for item in items}) - speaker_ids = {name: i for i, name in enumerate(speakers)} - num_speakers = len(speaker_ids) - return speaker_ids, num_speakers - - def set_speaker_ids_from_data(self, items: List) -> None: - """Set speaker IDs from data samples. - - Args: - items (List): Data sampled returned by `load_tts_samples()`. - """ - self.speaker_ids, _ = self.parse_speakers_from_data(items) - - def set_speaker_ids_from_file(self, file_path: str) -> None: - """Set speaker IDs from a file. - - Args: - file_path (str): Path to the file. - """ - self.speaker_ids = self._load_json(file_path) - - def save_speaker_ids_to_file(self, file_path: str) -> None: - """Save speaker IDs to a json file. - - Args: - file_path (str): Path to the output file. - """ - self._save_json(file_path, self.speaker_ids) - - def save_d_vectors_to_file(self, file_path: str) -> None: - """Save d_vectors to a json file. - - Args: - file_path (str): Path to the output file. - """ - self._save_json(file_path, self.d_vectors) - - def set_d_vectors_from_file(self, file_path: str) -> None: - """Load d_vectors from a json file. - - Args: - file_path (str): Path to the target json file. 
- """ - self.d_vectors = self._load_json(file_path) - - speakers = sorted({x["name"] for x in self.d_vectors.values()}) - self.speaker_ids = {name: i for i, name in enumerate(speakers)} - - self.clip_ids = list(set(sorted(clip_name for clip_name in self.d_vectors.keys()))) - # cache d_vectors_by_speakers for fast inference using a bigger speakers.json - self.d_vectors_by_speakers = self.get_d_vectors_by_speakers() - - def get_d_vector_by_clip(self, clip_idx: str) -> List: - """Get d_vector by clip ID. - - Args: - clip_idx (str): Target clip ID. - - Returns: - List: d_vector as a list. - """ - return self.d_vectors[clip_idx]["embedding"] - - def get_d_vectors_by_speaker(self, speaker_idx: str) -> List[List]: - """Get all d_vectors of a speaker. - - Args: - speaker_idx (str): Target speaker ID. - - Returns: - List[List]: all the d_vectors of the given speaker. - """ - return self.d_vectors_by_speakers[speaker_idx] - - def get_d_vectors_by_speakers(self) -> Dict: - """Get all d_vectors by speaker. - - Returns: - Dict: all the d_vectors of each speaker. - """ - d_vectors_by_speakers = {} - for x in self.d_vectors.values(): - if x["name"] not in d_vectors_by_speakers.keys(): - d_vectors_by_speakers[x["name"]] = [x["embedding"]] - else: - d_vectors_by_speakers[x["name"]].append(x["embedding"]) - return d_vectors_by_speakers - - def get_mean_d_vector(self, speaker_idx: str, num_samples: int = None, randomize: bool = False) -> np.ndarray: - """Get mean d_vector of a speaker ID. - - Args: - speaker_idx (str): Target speaker ID. - num_samples (int, optional): Number of samples to be averaged. Defaults to None. - randomize (bool, optional): Pick random `num_samples` of d_vectors. Defaults to False. - - Returns: - np.ndarray: Mean d_vector. - """ - d_vectors = self.get_d_vectors_by_speaker(speaker_idx) - if num_samples is None: - d_vectors = np.stack(d_vectors).mean(0) - else: - assert len(d_vectors) >= num_samples, f" [!] speaker {speaker_idx} has number of samples < {num_samples}" - if randomize: - d_vectors = np.stack(random.choices(d_vectors, k=num_samples)).mean(0) - else: - d_vectors = np.stack(d_vectors[:num_samples]).mean(0) - return d_vectors - - def get_random_speaker_id(self) -> Any: - """Get a random d_vector. - - Args: - - Returns: - np.ndarray: d_vector. - """ - if self.speaker_ids: - return self.speaker_ids[random.choices(list(self.speaker_ids.keys()))[0]] - - return None - - def get_random_d_vector(self) -> Any: - """Get a random D ID. - - Args: - - Returns: - np.ndarray: d_vector. - """ - if self.d_vectors: - return self.d_vectors[random.choices(list(self.d_vectors.keys()))[0]]["embedding"] - - return None + return list(self.ids.keys()) def get_speakers(self) -> List: - return self.speaker_ids - - def get_clips(self) -> List: - return sorted(self.d_vectors.keys()) - - def init_speaker_encoder(self, model_path: str, config_path: str) -> None: - """Initialize a speaker encoder model. - - Args: - model_path (str): Model file path. - config_path (str): Model config file path. - """ - self.speaker_encoder_config = load_config(config_path) - self.speaker_encoder = setup_speaker_encoder_model(self.speaker_encoder_config) - self.speaker_encoder_criterion = self.speaker_encoder.load_checkpoint( - self.speaker_encoder_config, model_path, eval=True, use_cuda=self.use_cuda - ) - self.speaker_encoder_ap = AudioProcessor(**self.speaker_encoder_config.audio) - - def compute_d_vector_from_clip(self, wav_file: Union[str, List[str]]) -> list: - """Compute a d_vector from a given audio file. 
- - Args: - wav_file (Union[str, List[str]]): Target file path. - - Returns: - list: Computed d_vector. - """ - - def _compute(wav_file: str): - waveform = self.speaker_encoder_ap.load_wav(wav_file, sr=self.speaker_encoder_ap.sample_rate) - if not self.speaker_encoder_config.model_params.get("use_torch_spec", False): - m_input = self.speaker_encoder_ap.melspectrogram(waveform) - m_input = torch.from_numpy(m_input) - else: - m_input = torch.from_numpy(waveform) - - if self.use_cuda: - m_input = m_input.cuda() - m_input = m_input.unsqueeze(0) - d_vector = self.speaker_encoder.compute_embedding(m_input) - return d_vector - - if isinstance(wav_file, list): - # compute the mean d_vector - d_vectors = None - for wf in wav_file: - d_vector = _compute(wf) - if d_vectors is None: - d_vectors = d_vector - else: - d_vectors += d_vector - return (d_vectors / len(wav_file))[0].tolist() - d_vector = _compute(wav_file) - return d_vector[0].tolist() - - def compute_d_vector(self, feats: Union[torch.Tensor, np.ndarray]) -> List: - """Compute d_vector from features. - - Args: - feats (Union[torch.Tensor, np.ndarray]): Input features. - - Returns: - List: computed d_vector. - """ - if isinstance(feats, np.ndarray): - feats = torch.from_numpy(feats) - if feats.ndim == 2: - feats = feats.unsqueeze(0) - if self.use_cuda: - feats = feats.cuda() - return self.speaker_encoder.compute_embedding(feats) - - def run_umap(self): - # TODO: implement speaker encoder - raise NotImplementedError - - def plot_embeddings(self): - # TODO: implement speaker encoder - raise NotImplementedError + return self.ids @staticmethod def init_from_config(config: "Coqpit", samples: Union[List[List], List[Dict]] = None) -> "SpeakerManager": @@ -420,7 +166,7 @@ def get_speaker_manager(c: Coqpit, data: List = None, restore_path: str = None, speaker_manager = SpeakerManager() if c.use_speaker_embedding: if data is not None: - speaker_manager.set_speaker_ids_from_data(data) + speaker_manager.set_ids_from_data(data, parse_key="speaker_name") if restore_path: speakers_file = _set_file_path(restore_path) # restoring speaker manager from a previous run. @@ -432,27 +178,27 @@ def get_speaker_manager(c: Coqpit, data: List = None, restore_path: str = None, raise RuntimeError( "You must copy the file speakers.json to restore_path, or set a valid file in CONFIG.d_vector_file" ) - speaker_manager.load_d_vectors_file(c.d_vector_file) - speaker_manager.set_d_vectors_from_file(speakers_file) + speaker_manager.load_embeddings_from_file(c.d_vector_file) + speaker_manager.load_embeddings_from_file(speakers_file) elif not c.use_d_vector_file: # restor speaker manager with speaker ID file. - speaker_ids_from_data = speaker_manager.speaker_ids - speaker_manager.set_speaker_ids_from_file(speakers_file) + speaker_ids_from_data = speaker_manager.ids + speaker_manager.load_ids_from_file(speakers_file) assert all( - speaker in speaker_manager.speaker_ids for speaker in speaker_ids_from_data + speaker in speaker_manager.ids for speaker in speaker_ids_from_data ), " [!] You cannot introduce new speakers to a pre-trained model." elif c.use_d_vector_file and c.d_vector_file: # new speaker manager with external speaker embeddings. - speaker_manager.set_d_vectors_from_file(c.d_vector_file) + speaker_manager.load_embeddings_from_file(c.d_vector_file) elif c.use_d_vector_file and not c.d_vector_file: raise "use_d_vector_file is True, so you need pass a external speaker embedding file." 
elif c.use_speaker_embedding and "speakers_file" in c and c.speakers_file: # new speaker manager with speaker IDs file. - speaker_manager.set_speaker_ids_from_file(c.speakers_file) + speaker_manager.load_ids_from_file(c.speakers_file) if speaker_manager.num_speakers > 0: print( " > Speaker manager is loaded with {} speakers: {}".format( - speaker_manager.num_speakers, ", ".join(speaker_manager.speaker_ids) + speaker_manager.num_speakers, ", ".join(speaker_manager.ids) ) ) @@ -461,9 +207,9 @@ def get_speaker_manager(c: Coqpit, data: List = None, restore_path: str = None, out_file_path = os.path.join(out_path, "speakers.json") print(f" > Saving `speakers.json` to {out_file_path}.") if c.use_d_vector_file and c.d_vector_file: - speaker_manager.save_d_vectors_to_file(out_file_path) + speaker_manager.save_embeddings_to_file(out_file_path) else: - speaker_manager.save_speaker_ids_to_file(out_file_path) + speaker_manager.save_ids_to_file(out_file_path) return speaker_manager diff --git a/TTS/utils/synthesizer.py b/TTS/utils/synthesizer.py index eef4086c..1a49f0b0 100644 --- a/TTS/utils/synthesizer.py +++ b/TTS/utils/synthesizer.py @@ -122,7 +122,7 @@ class Synthesizer(object): self.tts_model.cuda() if self.encoder_checkpoint and hasattr(self.tts_model, "speaker_manager"): - self.tts_model.speaker_manager.init_speaker_encoder(self.encoder_checkpoint, self.encoder_config) + self.tts_model.speaker_manager.init_encoder(self.encoder_checkpoint, self.encoder_config) def _set_speaker_encoder_paths_from_tts_config(self): """Set the encoder paths from the tts model config for models with speaker encoders.""" @@ -212,17 +212,17 @@ class Synthesizer(object): # handle multi-speaker speaker_embedding = None speaker_id = None - if self.tts_speakers_file or hasattr(self.tts_model.speaker_manager, "speaker_ids"): + if self.tts_speakers_file or hasattr(self.tts_model.speaker_manager, "ids"): if speaker_name and isinstance(speaker_name, str): if self.tts_config.use_d_vector_file: # get the average speaker embedding from the saved d_vectors. - speaker_embedding = self.tts_model.speaker_manager.get_mean_d_vector( + speaker_embedding = self.tts_model.speaker_manager.get_mean_embedding( speaker_name, num_samples=None, randomize=False ) speaker_embedding = np.array(speaker_embedding)[None, :] # [1 x embedding_dim] else: # get speaker idx from the speaker name - speaker_id = self.tts_model.speaker_manager.speaker_ids[speaker_name] + speaker_id = self.tts_model.speaker_manager.ids[speaker_name] elif not speaker_name and not speaker_wav: raise ValueError( @@ -244,7 +244,7 @@ class Synthesizer(object): hasattr(self.tts_model, "language_manager") and self.tts_model.language_manager is not None ): if language_name and isinstance(language_name, str): - language_id = self.tts_model.language_manager.language_id_mapping[language_name] + language_id = self.tts_model.language_manager.ids[language_name] elif not language_name: raise ValueError( @@ -260,7 +260,7 @@ class Synthesizer(object): # compute a new d_vector from the given clip. if speaker_wav is not None: - speaker_embedding = self.tts_model.speaker_manager.compute_d_vector_from_clip(speaker_wav) + speaker_embedding = self.tts_model.speaker_manager.compute_embedding_from_clip(speaker_wav) use_gl = self.vocoder_model is None @@ -319,7 +319,7 @@ class Synthesizer(object): if reference_speaker_name and isinstance(reference_speaker_name, str): if self.tts_config.use_d_vector_file: # get the speaker embedding from the saved d_vectors. 
- reference_speaker_embedding = self.tts_model.speaker_manager.get_d_vectors_by_speaker( + reference_speaker_embedding = self.tts_model.speaker_manager.get_embeddings_by_name( reference_speaker_name )[0] reference_speaker_embedding = np.array(reference_speaker_embedding)[ @@ -327,9 +327,9 @@ class Synthesizer(object): ] # [1 x embedding_dim] else: # get speaker idx from the speaker name - reference_speaker_id = self.tts_model.speaker_manager.speaker_ids[reference_speaker_name] + reference_speaker_id = self.tts_model.speaker_manager.ids[reference_speaker_name] else: - reference_speaker_embedding = self.tts_model.speaker_manager.compute_d_vector_from_clip( + reference_speaker_embedding = self.tts_model.speaker_manager.compute_embedding_from_clip( reference_wav ) diff --git a/recipes/multilingual/vits_tts/train_vits_tts.py b/recipes/multilingual/vits_tts/train_vits_tts.py index 94692f00..0e650ade 100644 --- a/recipes/multilingual/vits_tts/train_vits_tts.py +++ b/recipes/multilingual/vits_tts/train_vits_tts.py @@ -119,7 +119,7 @@ train_samples, eval_samples = load_tts_samples( # init speaker manager for multi-speaker training # it maps speaker-id to speaker-name in the model and data-loader speaker_manager = SpeakerManager() -speaker_manager.set_speaker_ids_from_data(train_samples + eval_samples) +speaker_manager.set_ids_from_data(train_samples + eval_samples, parse_key="speaker_name") config.model_args.num_speakers = speaker_manager.num_speakers language_manager = LanguageManager(config=config) diff --git a/recipes/vctk/fast_pitch/train_fast_pitch.py b/recipes/vctk/fast_pitch/train_fast_pitch.py index 05cdc72a..c39932da 100644 --- a/recipes/vctk/fast_pitch/train_fast_pitch.py +++ b/recipes/vctk/fast_pitch/train_fast_pitch.py @@ -81,7 +81,7 @@ train_samples, eval_samples = load_tts_samples( # init speaker manager for multi-speaker training # it maps speaker-id to speaker-name in the model and data-loader speaker_manager = SpeakerManager() -speaker_manager.set_speaker_ids_from_data(train_samples + eval_samples) +speaker_manager.set_ids_from_data(train_samples + eval_samples, parse_key="speaker_name") config.model_args.num_speakers = speaker_manager.num_speakers # init model diff --git a/recipes/vctk/fast_speech/train_fast_speech.py b/recipes/vctk/fast_speech/train_fast_speech.py index a294272a..a3249de1 100644 --- a/recipes/vctk/fast_speech/train_fast_speech.py +++ b/recipes/vctk/fast_speech/train_fast_speech.py @@ -79,7 +79,7 @@ train_samples, eval_samples = load_tts_samples( # init speaker manager for multi-speaker training # it maps speaker-id to speaker-name in the model and data-loader speaker_manager = SpeakerManager() -speaker_manager.set_speaker_ids_from_data(train_samples + eval_samples) +speaker_manager.set_ids_from_data(train_samples + eval_samples, parse_key="speaker_name") config.model_args.num_speakers = speaker_manager.num_speakers # init model diff --git a/recipes/vctk/glow_tts/train_glow_tts.py b/recipes/vctk/glow_tts/train_glow_tts.py index 0bf686b1..23c02efc 100644 --- a/recipes/vctk/glow_tts/train_glow_tts.py +++ b/recipes/vctk/glow_tts/train_glow_tts.py @@ -79,7 +79,7 @@ train_samples, eval_samples = load_tts_samples( # init speaker manager for multi-speaker training # it maps speaker-id to speaker-name in the model and data-loader speaker_manager = SpeakerManager() -speaker_manager.set_speaker_ids_from_data(train_samples + eval_samples) +speaker_manager.set_ids_from_data(train_samples + eval_samples, parse_key="speaker_name") config.num_speakers = 
speaker_manager.num_speakers # init model diff --git a/recipes/vctk/speedy_speech/train_speedy_speech.py b/recipes/vctk/speedy_speech/train_speedy_speech.py index 4208a9b6..bcd0105a 100644 --- a/recipes/vctk/speedy_speech/train_speedy_speech.py +++ b/recipes/vctk/speedy_speech/train_speedy_speech.py @@ -79,7 +79,7 @@ train_samples, eval_samples = load_tts_samples( # init speaker manager for multi-speaker training # it maps speaker-id to speaker-name in the model and data-loader speaker_manager = SpeakerManager() -speaker_manager.set_speaker_ids_from_data(train_samples + eval_samples) +speaker_manager.set_ids_from_data(train_samples + eval_samples, parse_key="speaker_name") config.model_args.num_speakers = speaker_manager.num_speakers # init model diff --git a/recipes/vctk/tacotron-DDC/train_tacotron-DDC.py b/recipes/vctk/tacotron-DDC/train_tacotron-DDC.py index d67038a4..36e28ed7 100644 --- a/recipes/vctk/tacotron-DDC/train_tacotron-DDC.py +++ b/recipes/vctk/tacotron-DDC/train_tacotron-DDC.py @@ -82,7 +82,7 @@ train_samples, eval_samples = load_tts_samples( # init speaker manager for multi-speaker training # it mainly handles speaker-id to speaker-name for the model and the data-loader speaker_manager = SpeakerManager() -speaker_manager.set_speaker_ids_from_data(train_samples + eval_samples) +speaker_manager.set_ids_from_data(train_samples + eval_samples, parse_key="speaker_name") # init model model = Tacotron(config, ap, tokenizer, speaker_manager) diff --git a/recipes/vctk/tacotron2-DDC/train_tacotron2-ddc.py b/recipes/vctk/tacotron2-DDC/train_tacotron2-ddc.py index b860df85..d04d91c0 100644 --- a/recipes/vctk/tacotron2-DDC/train_tacotron2-ddc.py +++ b/recipes/vctk/tacotron2-DDC/train_tacotron2-ddc.py @@ -88,7 +88,7 @@ train_samples, eval_samples = load_tts_samples( # init speaker manager for multi-speaker training # it mainly handles speaker-id to speaker-name for the model and the data-loader speaker_manager = SpeakerManager() -speaker_manager.set_speaker_ids_from_data(train_samples + eval_samples) +speaker_manager.set_ids_from_data(train_samples + eval_samples, parse_key="speaker_name") # init model model = Tacotron2(config, ap, tokenizer, speaker_manager) diff --git a/recipes/vctk/tacotron2/train_tacotron2.py b/recipes/vctk/tacotron2/train_tacotron2.py index d27dd78c..5a0e157a 100644 --- a/recipes/vctk/tacotron2/train_tacotron2.py +++ b/recipes/vctk/tacotron2/train_tacotron2.py @@ -88,7 +88,7 @@ train_samples, eval_samples = load_tts_samples( # init speaker manager for multi-speaker training # it mainly handles speaker-id to speaker-name for the model and the data-loader speaker_manager = SpeakerManager() -speaker_manager.set_speaker_ids_from_data(train_samples + eval_samples) +speaker_manager.set_ids_from_data(train_samples + eval_samples, parse_key="speaker_name") # init model model = Tacotron2(config, ap, tokenizer, speaker_manager) diff --git a/recipes/vctk/vits/train_vits.py b/recipes/vctk/vits/train_vits.py index 61d60ca1..88fd7de9 100644 --- a/recipes/vctk/vits/train_vits.py +++ b/recipes/vctk/vits/train_vits.py @@ -89,7 +89,7 @@ train_samples, eval_samples = load_tts_samples( # init speaker manager for multi-speaker training # it maps speaker-id to speaker-name in the model and data-loader speaker_manager = SpeakerManager() -speaker_manager.set_speaker_ids_from_data(train_samples + eval_samples) +speaker_manager.set_ids_from_data(train_samples + eval_samples, parse_key="speaker_name") config.model_args.num_speakers = speaker_manager.num_speakers # init model diff --git 
a/tests/aux_tests/test_speaker_manager.py b/tests/aux_tests/test_speaker_manager.py index 57ff6c50..7552e0a5 100644 --- a/tests/aux_tests/test_speaker_manager.py +++ b/tests/aux_tests/test_speaker_manager.py @@ -6,7 +6,7 @@ import torch from tests import get_tests_input_path from TTS.config import load_config -from TTS.encoder.utils.generic_utils import setup_speaker_encoder_model +from TTS.encoder.utils.generic_utils import setup_encoder_model from TTS.encoder.utils.io import save_checkpoint from TTS.tts.utils.speakers import SpeakerManager from TTS.utils.audio import AudioProcessor @@ -28,7 +28,7 @@ class SpeakerManagerTest(unittest.TestCase): config.audio.resample = True # create a dummy speaker encoder - model = setup_speaker_encoder_model(config) + model = setup_encoder_model(config) save_checkpoint(model, None, None, get_tests_input_path(), 0) # load audio processor and speaker encoder @@ -38,19 +38,19 @@ class SpeakerManagerTest(unittest.TestCase): # load a sample audio and compute embedding waveform = ap.load_wav(sample_wav_path) mel = ap.melspectrogram(waveform) - d_vector = manager.compute_d_vector(mel) + d_vector = manager.compute_embeddings(mel) assert d_vector.shape[1] == 256 # compute d_vector directly from an input file - d_vector = manager.compute_d_vector_from_clip(sample_wav_path) - d_vector2 = manager.compute_d_vector_from_clip(sample_wav_path) + d_vector = manager.compute_embedding_from_clip(sample_wav_path) + d_vector2 = manager.compute_embedding_from_clip(sample_wav_path) d_vector = torch.FloatTensor(d_vector) d_vector2 = torch.FloatTensor(d_vector2) assert d_vector.shape[0] == 256 assert (d_vector - d_vector2).sum() == 0.0 # compute d_vector from a list of wav files. - d_vector3 = manager.compute_d_vector_from_clip([sample_wav_path, sample_wav_path2]) + d_vector3 = manager.compute_embedding_from_clip([sample_wav_path, sample_wav_path2]) d_vector3 = torch.FloatTensor(d_vector3) assert d_vector3.shape[0] == 256 assert (d_vector - d_vector3).sum() != 0.0 @@ -62,14 +62,14 @@ class SpeakerManagerTest(unittest.TestCase): def test_speakers_file_processing(): manager = SpeakerManager(d_vectors_file_path=d_vectors_file_path) print(manager.num_speakers) - print(manager.d_vector_dim) + print(manager.embedding_dim) print(manager.clip_ids) - d_vector = manager.get_d_vector_by_clip(manager.clip_ids[0]) + d_vector = manager.get_embedding_by_clip(manager.clip_ids[0]) assert len(d_vector) == 256 - d_vectors = manager.get_d_vectors_by_speaker(manager.speaker_names[0]) + d_vectors = manager.get_embeddings_by_name(manager.speaker_names[0]) assert len(d_vectors[0]) == 256 - d_vector1 = manager.get_mean_d_vector(manager.speaker_names[0], num_samples=2, randomize=True) + d_vector1 = manager.get_mean_embedding(manager.speaker_names[0], num_samples=2, randomize=True) assert len(d_vector1) == 256 - d_vector2 = manager.get_mean_d_vector(manager.speaker_names[0], num_samples=2, randomize=False) + d_vector2 = manager.get_mean_embedding(manager.speaker_names[0], num_samples=2, randomize=False) assert len(d_vector2) == 256 assert np.sum(np.array(d_vector1) - np.array(d_vector2)) != 0 diff --git a/tests/tts_tests/test_glow_tts.py b/tests/tts_tests/test_glow_tts.py index 2783e4bd..2a723f10 100644 --- a/tests/tts_tests/test_glow_tts.py +++ b/tests/tts_tests/test_glow_tts.py @@ -86,7 +86,7 @@ class TestGlowTTS(unittest.TestCase): model = GlowTTS(config) model.speaker_manager = speaker_manager model.init_multispeaker(config) - self.assertEqual(model.c_in_channels, speaker_manager.d_vector_dim) + 
self.assertEqual(model.c_in_channels, speaker_manager.embedding_dim) self.assertEqual(model.num_speakers, speaker_manager.num_speakers) def test_unlock_act_norm_layers(self): diff --git a/tests/tts_tests/test_vits.py b/tests/tts_tests/test_vits.py index 05adb9ed..de683c81 100644 --- a/tests/tts_tests/test_vits.py +++ b/tests/tts_tests/test_vits.py @@ -7,7 +7,7 @@ from trainer.logging.tensorboard_logger import TensorboardLogger from tests import assertHasAttr, assertHasNotAttr, get_tests_data_path, get_tests_input_path, get_tests_output_path from TTS.config import load_config -from TTS.encoder.utils.generic_utils import setup_speaker_encoder_model +from TTS.encoder.utils.generic_utils import setup_encoder_model from TTS.tts.configs.vits_config import VitsConfig from TTS.tts.models.vits import Vits, VitsArgs, amp_to_db, db_to_amp, load_audio, spec_to_mel, wav_to_mel, wav_to_spec from TTS.tts.utils.speakers import SpeakerManager @@ -242,9 +242,9 @@ class TestVits(unittest.TestCase): speaker_encoder_config = load_config(SPEAKER_ENCODER_CONFIG) speaker_encoder_config.model_params["use_torch_spec"] = True - speaker_encoder = setup_speaker_encoder_model(speaker_encoder_config).to(device) + speaker_encoder = setup_encoder_model(speaker_encoder_config).to(device) speaker_manager = SpeakerManager() - speaker_manager.speaker_encoder = speaker_encoder + speaker_manager.encoder = speaker_encoder args = VitsArgs( language_ids_file=LANG_FILE, diff --git a/tests/zoo_tests/test_models.py b/tests/zoo_tests/test_models.py index 63d9e7ca..e614ce74 100644 --- a/tests/zoo_tests/test_models.py +++ b/tests/zoo_tests/test_models.py @@ -38,7 +38,7 @@ def test_run_all_models(): language_manager = LanguageManager(language_ids_file_path=language_files[0]) language_id = language_manager.language_names[0] - speaker_id = list(speaker_manager.speaker_ids.keys())[0] + speaker_id = list(speaker_manager.ids.keys())[0] run_cli( f"tts --model_name {model_name} " f'--text "This is an example." 
--out_path "{output_path}" --speaker_idx "{speaker_id}" --language_idx "{language_id}" ' From 164c7dd67618792bfcb3a5605ed222f74b539001 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Fri, 8 Apr 2022 14:47:09 +0200 Subject: [PATCH 31/38] Update requirements coqui_trainer -> trainer (#1478) --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index db47c2cc..50c0d2ac 100644 --- a/requirements.txt +++ b/requirements.txt @@ -24,7 +24,7 @@ matplotlib tensorboardX pyworld # coqui stack -coqui-trainer +trainer coqpit # config management # chinese g2p deps jieba From 27fcb5dabf265e74ee463c3fc11c4709ca5e7b25 Mon Sep 17 00:00:00 2001 From: Reuben Morais Date: Fri, 15 Apr 2022 01:13:32 +0200 Subject: [PATCH 32/38] Add Dockerfile and build/push CI --- .dockerignore | 3 +- .github/workflows/docker.yaml | 56 +++++++++++++++++++++++++++++++++++ Dockerfile | 11 +++++++ 3 files changed, 69 insertions(+), 1 deletion(-) create mode 100644 .github/workflows/docker.yaml create mode 100644 Dockerfile diff --git a/.dockerignore b/.dockerignore index 4032ec6b..2833d344 100644 --- a/.dockerignore +++ b/.dockerignore @@ -1 +1,2 @@ -.git/ \ No newline at end of file +.git/ +Dockerfile diff --git a/.github/workflows/docker.yaml b/.github/workflows/docker.yaml new file mode 100644 index 00000000..457649a2 --- /dev/null +++ b/.github/workflows/docker.yaml @@ -0,0 +1,56 @@ +name: "Docker build and push" +on: + pull_request: + push: + branches: + - main + - dev + tags: + - v* +jobs: + docker-build: + name: "Build and push Docker image" + runs-on: ubuntu-20.04 + strategy: + matrix: + arch: ["amd64"] + steps: + - uses: actions/checkout@v2 + - name: Log in to the Container registry + uses: docker/login-action@v1 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + - name: Compute Docker tags, check VERSION file matches tag + id: compute-tag + run: | + set -ex + base="ghcr.io/coqui-ai/tts" + tags="" # PR build + if [[ "${{ startsWith(github.ref, 'refs/heads/') }}" = "true" ]]; then + # Push to branch + github_ref="${{ github.ref }}" + branch=${github_ref#*refs/heads/} # strip prefix to get branch name + tags="${base}:${branch},${base}:${{ github.sha }}," + elif [[ "${{ startsWith(github.ref, 'refs/tags/') }}" = "true" ]]; then + VERSION="v$(cat TTS/VERSION)" + if [[ "${{ github.ref }}" != "refs/tags/${VERSION}" ]]; then + echo "Pushed tag does not match VERSION file. Aborting push." + exit 1 + fi + tags="${base}:${VERSION},${base}:latest,${base}:${{ github.sha }}" + fi + echo "::set-output name=tags::${tags}" + - name: Set up QEMU + uses: docker/setup-qemu-action@v1 + - name: Set up Docker Buildx + id: buildx + uses: docker/setup-buildx-action@v1 + - name: Build and push + uses: docker/build-push-action@v2 + with: + context: . + platforms: linux/${{ matrix.arch }} + push: ${{ github.event_name == 'push' }} + tags: ${{ steps.compute-tag.outputs.tags }} diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 00000000..8dab3b30 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,11 @@ +FROM nvcr.io/nvidia/pytorch:22.03-py3 +RUN apt-get update && apt-get install -y --no-install-recommends espeak && rm -rf /var/lib/apt/lists/* +WORKDIR /root +COPY requirements.txt /root +COPY requirements.dev.txt /root +COPY requirements.notebooks.txt /root +RUN pip install -r <(cat requirements.txt requirements.dev.txt requirements.notebooks.txt) +COPY . 
/root +RUN make install +ENTRYPOINT ["tts"] +CMD ["--help"] From e8573bfe3e692613920a0199c984deed7f0d9cfe Mon Sep 17 00:00:00 2001 From: jackiexiao <707610215@qq.com> Date: Fri, 15 Apr 2022 20:43:46 +0800 Subject: [PATCH 33/38] Update CONTRIBUTING.md (#1463) fix header ``` ## Call for sharing language models ``` --- CONTRIBUTING.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 7175cf34..81a426e8 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -26,7 +26,8 @@ If you like to contribute code, squash a bug but if you don't know where to star We list all the target improvements for the next version. You can pick one of them and start contributing. - Also feel free to suggest new features, ideas and models. We're always open for new things. -#####Call for sharing language models + +## Call for sharing language models If possible, please consider sharing your pre-trained models in any language (if the licences allow for you to do so). We will include them in our model catalogue for public use and give the proper attribution, whether it be your name, company, website or any other source specified. This model can be shared in two ways: @@ -36,6 +37,7 @@ This model can be shared in two ways: Models are served under `.models.json` file and any model is available under TTS CLI or Server end points. Either way you choose, please make sure you send the models [here](https://github.com/coqui-ai/TTS/issues/380). + ## Sending a ✨**PR**✨ If you have a new feature, a model to implement, or a bug to squash, go ahead and send a ✨**PR**✨. From 4953636b1466a5e9fd5e73aa9afeaaeea8bb19dd Mon Sep 17 00:00:00 2001 From: WeberJulian Date: Tue, 19 Apr 2022 14:18:30 +0200 Subject: [PATCH 34/38] Add African models (#1511) * Add african models * Set default license for all models --- TTS/.models.json | 112 +++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 99 insertions(+), 13 deletions(-) diff --git a/TTS/.models.json b/TTS/.models.json index 24838a5d..4870bc1f 100644 --- a/TTS/.models.json +++ b/TTS/.models.json @@ -18,7 +18,8 @@ "description": "EK1 en-rp tacotron2 by NMStoker", "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ek1--tacotron2.zip", "default_vocoder": "vocoder_models/en/ek1/wavegrad", - "commit": "c802255" + "commit": "c802255", + "license": "apache 2.0" } }, "ljspeech": { @@ -28,7 +29,7 @@ "default_vocoder": "vocoder_models/en/ljspeech/hifigan_v2", "commit": "bae2ad0f", "author": "Eren Gölge @erogol", - "license": "", + "license": "apache 2.0", "contact": "egolge@coqui.com" }, "tacotron2-DDC_ph": { @@ -37,7 +38,7 @@ "default_vocoder": "vocoder_models/en/ljspeech/univnet", "commit": "3900448", "author": "Eren Gölge @erogol", - "license": "", + "license": "apache 2.0", "contact": "egolge@coqui.com" }, "glow-tts": { @@ -57,7 +58,7 @@ "default_vocoder": "vocoder_models/en/ljspeech/hifigan_v2", "commit": "4581e3d", "author": "Eren Gölge @erogol", - "license": "TBD", + "license": "apache 2.0", "contact": "egolge@coqui.com" }, "tacotron2-DCA": { @@ -75,7 +76,7 @@ "default_vocoder": null, "commit": "3900448", "author": "Eren Gölge @erogol", - "license": "TBD", + "license": "apache 2.0", "contact": "egolge@coqui.com" }, "fast_pitch": { @@ -84,7 +85,7 @@ "default_vocoder": "vocoder_models/en/ljspeech/hifigan_v2", "commit": "b27b3ba", "author": "Eren Gölge @erogol", - "license": "TBD", + "license": "apache 2.0", "contact": "egolge@coqui.com" } }, @@ -95,7 +96,7 @@ "default_vocoder": null, "commit": 
"3900448", "author": "Eren @erogol", - "license": "", + "license": "apache 2.0", "contact": "egolge@coqui.ai" }, "fast_pitch":{ @@ -115,7 +116,7 @@ "default_vocoder": "vocoder_models/en/sam/hifigan_v2", "commit": "bae2ad0f", "author": "Eren Gölge @erogol", - "license": "", + "license": "apache 2.0", "contact": "egolge@coqui.com" } } @@ -162,6 +163,7 @@ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--zh-CN--baker--tacotron2-DDC-GST.zip", "commit": "unknown", "author": "@kirianguiller", + "license": "apache 2.0", "default_vocoder": null } } @@ -171,6 +173,7 @@ "tacotron2-DDC": { "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--nl--mai--tacotron2-DDC.zip", "author": "@r-dh", + "license": "apache 2.0", "default_vocoder": "vocoder_models/nl/mai/parallel-wavegan", "stats_file": null, "commit": "540d811" @@ -183,6 +186,7 @@ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--de--thorsten--tacotron2-DCA.zip", "default_vocoder": "vocoder_models/de/thorsten/fullband-melgan", "author": "@thorstenMueller", + "license": "apache 2.0", "commit": "unknown" } } @@ -194,6 +198,7 @@ "default_vocoder": "vocoder_models/ja/kokoro/hifigan_v1", "description": "Tacotron2 with Double Decoder Consistency trained with Kokoro Speech Dataset.", "author": "@kaiidams", + "license": "apache 2.0", "commit": "401fbd89" } } @@ -217,6 +222,7 @@ "default_vocoder": null, "description": "GlowTTS model as explained on https://github.com/coqui-ai/TTS/issues/1148.", "author": "@nicolalandro", + "license": "apache 2.0", "commit": null }, "vits":{ @@ -224,6 +230,7 @@ "default_vocoder": null, "description": "GlowTTS model as explained on https://github.com/coqui-ai/TTS/issues/1148.", "author": "@nicolalandro", + "license": "apache 2.0", "commit": null } }, @@ -233,6 +240,7 @@ "default_vocoder": null, "description": "GlowTTS model as explained on https://github.com/coqui-ai/TTS/issues/1148.", "author": "@nicolalandro", + "license": "apache 2.0", "commit": null }, "vits":{ @@ -240,9 +248,82 @@ "default_vocoder": null, "description": "GlowTTS model as explained on https://github.com/coqui-ai/TTS/issues/1148.", "author": "@nicolalandro", + "license": "apache 2.0", "commit": null } } + }, + "ewe": { + "openbible": { + "vits":{ + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--ewe--openbible--vits.zip", + "default_vocoder": null, + "license": "CC-BY-SA 4.0", + "description": "Original work (audio and text) by Biblica available for free at www.biblica.com and open.bible.", + "author": "@coqui_ai", + "commit": "1b22f03" + } + } + }, + "hau": { + "openbible": { + "vits":{ + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--hau--openbible--vits.zip", + "default_vocoder": null, + "license": "CC-BY-SA 4.0", + "description": "Original work (audio and text) by Biblica available for free at www.biblica.com and open.bible.", + "author": "@coqui_ai", + "commit": "1b22f03" + } + } + }, + "lin": { + "openbible": { + "vits":{ + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--lin--openbible--vits.zip", + "default_vocoder": null, + "license": "CC-BY-SA 4.0", + "description": "Original work (audio and text) by Biblica available for free at www.biblica.com and open.bible.", + "author": "@coqui_ai", + "commit": "1b22f03" + } + } + }, + "tw_akuapem": { + "openbible": { + "vits":{ + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--tw_akuapem--openbible--vits.zip", + 
"default_vocoder": null, + "license": "CC-BY-SA 4.0", + "description": "Original work (audio and text) by Biblica available for free at www.biblica.com and open.bible.", + "author": "@coqui_ai", + "commit": "1b22f03" + } + } + }, + "tw_asante": { + "openbible": { + "vits":{ + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--tw_asante--openbible--vits.zip", + "default_vocoder": null, + "license": "CC-BY-SA 4.0", + "description": "Original work (audio and text) by Biblica available for free at www.biblica.com and open.bible.", + "author": "@coqui_ai", + "commit": "1b22f03" + } + } + }, + "yor": { + "openbible": { + "vits":{ + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--yor--openbible--vits.zip", + "default_vocoder": null, + "license": "CC-BY-SA 4.0", + "description": "Original work (audio and text) by Biblica available for free at www.biblica.com and open.bible.", + "author": "@coqui_ai", + "commit": "1b22f03" + } + } } }, "vocoder_models": { @@ -269,7 +350,8 @@ "wavegrad": { "description": "EK1 en-rp wavegrad by NMStoker", "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--en--ek1--wavegrad.zip", - "commit": "c802255" + "commit": "c802255", + "license": "apache 2.0" } }, "ljspeech": { @@ -285,7 +367,7 @@ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--en--ljspeech--hifigan_v2.zip", "commit": "bae2ad0f", "author": "@erogol", - "license": "", + "license": "apache 2.0", "contact": "egolge@coqui.ai" }, "univnet": { @@ -293,7 +375,7 @@ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--en--ljspeech--univnet_v2.zip", "commit": "4581e3d", "author": "Eren @erogol", - "license": "TBD", + "license": "apache 2.0", "contact": "egolge@coqui.ai" } }, @@ -303,7 +385,7 @@ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--en--vctk--hifigan_v2.zip", "commit": "2f07160", "author": "Edresson Casanova", - "license": "", + "license": "apache 2.0", "contact": "" } }, @@ -313,7 +395,7 @@ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--en--sam--hifigan_v2.zip", "commit": "2f07160", "author": "Eren Gölge @erogol", - "license": "", + "license": "apache 2.0", "contact": "egolge@coqui.ai" } } @@ -323,6 +405,7 @@ "parallel-wavegan": { "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--nl--mai--parallel-wavegan.zip", "author": "@r-dh", + "license": "apache 2.0", "commit": "unknown" } } @@ -332,11 +415,13 @@ "wavegrad": { "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--de--thorsten--wavegrad.zip", "author": "@thorstenMueller", + "license": "apache 2.0", "commit": "unknown" }, "fullband-melgan": { "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--de--thorsten--fullband-melgan.zip", "author": "@thorstenMueller", + "license": "apache 2.0", "commit": "unknown" } } @@ -347,6 +432,7 @@ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--ja--kokoro--hifigan_v1.zip", "description": "HifiGAN model trained for kokoro dataset by @kaiidams", "author": "@kaiidams", + "license": "apache 2.0", "commit": "3900448" } } From 7133f8f47d6c0ed0ce4c3beefeb8112ce94d7f6d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Tue, 19 Apr 2022 14:18:49 +0200 Subject: [PATCH 35/38] Print Model's license when downloading (#1512) * Print model license while downloading * Make style * Add a new license link * Make 
style --- TTS/utils/manage.py | 29 ++++++++++++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/TTS/utils/manage.py b/TTS/utils/manage.py index 674d5a47..0ef3675b 100644 --- a/TTS/utils/manage.py +++ b/TTS/utils/manage.py @@ -4,13 +4,23 @@ import os import zipfile from pathlib import Path from shutil import copyfile, rmtree -from typing import Tuple +from typing import Dict, Tuple import requests from TTS.config import load_config from TTS.utils.generic_utils import get_user_data_dir +LICENSE_URLS = { + "cc by-nc-nd 4.0": "https://creativecommons.org/licenses/by-nc-nd/4.0/", + "mpl": "https://www.mozilla.org/en-US/MPL/2.0/", + "mpl2": "https://www.mozilla.org/en-US/MPL/2.0/", + "mit": "https://choosealicense.com/licenses/mit/", + "apache2.0": "https://choosealicense.com/licenses/apache-2.0/", + "apache2": "https://choosealicense.com/licenses/apache-2.0/", + "cc-by-sa 4.0": "https://creativecommons.org/licenses/by-sa/4.0/", +} + class ModelManager(object): """Manage TTS models defined in .models.json. @@ -108,6 +118,22 @@ class ModelManager(object): for dataset in self.models_dict[model_type][lang]: print(f" >: {model_type}/{lang}/{dataset}") + @staticmethod + def print_model_license(model_item: Dict): + """Print the license of a model + + Args: + model_item (dict): model item in the models.json + """ + if "license" in model_item and model_item["license"].strip() != "": + print(f" > Model's license - {model_item['license']}") + if model_item["license"].lower() in LICENSE_URLS: + print(f" > Check {LICENSE_URLS[model_item['license'].lower()]} for more info.") + else: + print(" > Check https://opensource.org/licenses for more info.") + else: + print(" > Model's license - No license information available") + def download_model(self, model_name): """Download model files given the full model name. 
Model name is in the format @@ -135,6 +161,7 @@ class ModelManager(object): print(f" > Downloading model to {output_path}") # download from github release self._download_zip_file(model_item["github_rls_url"], output_path) + self.print_model_license(model_item=model_item) # find downloaded files output_model_path, output_config_path = self._find_files(output_path) # update paths in the config.json From b45d5c5c60fc4399af67f2281fb92667de7b0b57 Mon Sep 17 00:00:00 2001 From: Yanlong Wang Date: Tue, 19 Apr 2022 20:24:34 +0800 Subject: [PATCH 36/38] Improve docsQA default questions (#1411) --- docs/source/_templates/page.html | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/source/_templates/page.html b/docs/source/_templates/page.html index aab3d977..2c6ef4ee 100644 --- a/docs/source/_templates/page.html +++ b/docs/source/_templates/page.html @@ -13,9 +13,9 @@ From 30bea7d53cacffb8732dcdf51b053952005aea1d Mon Sep 17 00:00:00 2001 From: WeberJulian Date: Tue, 19 Apr 2022 14:27:32 +0200 Subject: [PATCH 37/38] Update manage.py (#1514) --- TTS/utils/manage.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/TTS/utils/manage.py b/TTS/utils/manage.py index 0ef3675b..87cb5592 100644 --- a/TTS/utils/manage.py +++ b/TTS/utils/manage.py @@ -15,8 +15,9 @@ LICENSE_URLS = { "cc by-nc-nd 4.0": "https://creativecommons.org/licenses/by-nc-nd/4.0/", "mpl": "https://www.mozilla.org/en-US/MPL/2.0/", "mpl2": "https://www.mozilla.org/en-US/MPL/2.0/", + "mpl 2.0": "https://www.mozilla.org/en-US/MPL/2.0/", "mit": "https://choosealicense.com/licenses/mit/", - "apache2.0": "https://choosealicense.com/licenses/apache-2.0/", + "apache 2.0": "https://choosealicense.com/licenses/apache-2.0/", "apache2": "https://choosealicense.com/licenses/apache-2.0/", "cc-by-sa 4.0": "https://creativecommons.org/licenses/by-sa/4.0/", } From c410bc58ef3bd07b72ab05d29bbdc2a6df47afea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Wed, 20 Apr 2022 11:46:26 +0200 Subject: [PATCH 38/38] Bump to v0.6.2 --- TTS/VERSION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/TTS/VERSION b/TTS/VERSION index 7ceb0404..b1d7abc0 100644 --- a/TTS/VERSION +++ b/TTS/VERSION @@ -1 +1 @@ -0.6.1 \ No newline at end of file +0.6.2 \ No newline at end of file
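
For reference, a minimal usage sketch of the renamed speaker/embedding manager API introduced by this series, assuming 🐸TTS is importable and that "speakers.json" (a placeholder path) holds precomputed d-vectors from an earlier run; it mirrors the calls exercised in tests/aux_tests/test_speaker_manager.py:

from TTS.tts.utils.speakers import SpeakerManager

# Load precomputed speaker embeddings (formerly "d_vectors") from a JSON file.
# "speakers.json" is a placeholder for a file produced by a previous run.
manager = SpeakerManager(d_vectors_file_path="speakers.json")

print(manager.num_speakers)   # speaker names now live in `manager.ids` (was `speaker_ids`)
print(manager.embedding_dim)  # was `d_vector_dim`

# was `get_d_vector_by_clip` / `get_d_vectors_by_speaker` / `get_mean_d_vector`
clip_embedding = manager.get_embedding_by_clip(manager.clip_ids[0])
all_embeddings = manager.get_embeddings_by_name(manager.speaker_names[0])
mean_embedding = manager.get_mean_embedding(manager.speaker_names[0], num_samples=2, randomize=False)

Computing an embedding straight from audio (compute_embedding_from_clip, formerly compute_d_vector_from_clip) additionally requires an encoder initialised via init_encoder(model_path, config_path); the old names survive only in the constructor arguments and the use_d_vector_file/d_vector_file config fields.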