Merge pull request #508 from Edresson/dev

Implement/Train a better speaker encoder
Eren Gölge 2021-05-31 11:45:36 +02:00 committed by GitHub
commit 92e26b4216
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
22 changed files with 2788 additions and 722 deletions

.gitignore (vendored): 4 changes
View File

@@ -132,3 +132,7 @@ notebooks/data/*
TTS/tts/layers/glow_tts/monotonic_align/core.c
.vscode-upload.json
temp_build/*
recipes/*
# nohup logs
*.out

View File

@@ -2,15 +2,14 @@ import argparse
import glob
import os
import numpy as np
import torch
from tqdm import tqdm
from TTS.speaker_encoder.model import SpeakerEncoder
from TTS.speaker_encoder.utils.generic_utils import setup_model
from TTS.tts.datasets.preprocess import load_meta_data
from TTS.tts.utils.speakers import save_speaker_mapping
from TTS.tts.utils.speakers import SpeakerManager
from TTS.utils.audio import AudioProcessor
from TTS.utils.io import load_config
from TTS.config import load_config, BaseDatasetConfig
parser = argparse.ArgumentParser(
description='Compute embedding vectors for each wav file in a dataset. If "target_dataset" is defined, it generates "speakers.json" necessary for training a multi-speaker model.'
@@ -22,14 +21,14 @@ parser.add_argument(
help="Path to config file for training.",
)
parser.add_argument("data_path", type=str, help="Data path for wav files - directory or CSV file")
parser.add_argument("output_path", type=str, help="path for training outputs.")
parser.add_argument("output_path", type=str, help="path for output speakers.json.")
parser.add_argument(
"--target_dataset",
type=str,
default="",
help="Target dataset to pick a processor from TTS.tts.dataset.preprocess. Necessary to create a speakers.json file.",
)
parser.add_argument("--use_cuda", type=bool, help="flag to set cuda.", default=False)
parser.add_argument("--use_cuda", type=bool, help="flag to set cuda.", default=True)
parser.add_argument("--separator", type=str, help="Separator used in file if CSV is passed for data_path", default="|")
args = parser.parse_args()
@@ -44,10 +43,9 @@ sep = args.separator
if args.target_dataset != "":
# if target dataset is defined
dataset_config = [
{"name": args.target_dataset, "path": args.data_path, "meta_file_train": None, "meta_file_val": None},
BaseDatasetConfig(name=args.target_dataset, path=args.data_path, meta_file_train=None, meta_file_val=None),
]
wav_files, _ = load_meta_data(dataset_config, eval_split=False)
output_files = [wav_file[1].replace(data_path, args.output_path).replace(".wav", ".npy") for wav_file in wav_files]
else:
# if target dataset is not defined
if len(split_ext) > 0 and split_ext[1].lower() == ".csv":
@@ -71,13 +69,8 @@ else:
# Parse all wav files in data_path
wav_files = glob.glob(data_path + "/**/*.wav", recursive=True)
output_files = [wav_file.replace(data_path, args.output_path).replace(".wav", ".npy") for wav_file in wav_files]
for output_file in output_files:
os.makedirs(os.path.dirname(output_file), exist_ok=True)
# define Encoder model
model = SpeakerEncoder(**c.model)
model = setup_model(c)
model.load_state_dict(torch.load(args.model_path)["model"])
model.eval()
if args.use_cuda:
@@ -89,6 +82,8 @@ for idx, wav_file in enumerate(tqdm(wav_files)):
if isinstance(wav_file, list):
speaker_name = wav_file[2]
wav_file = wav_file[1]
else:
speaker_name = None
mel_spec = ap.melspectrogram(ap.load_wav(wav_file, sr=ap.sample_rate)).T
mel_spec = torch.FloatTensor(mel_spec[None, :, :])
@@ -96,16 +91,21 @@ for idx, wav_file in enumerate(tqdm(wav_files)):
mel_spec = mel_spec.cuda()
embedd = model.compute_embedding(mel_spec)
embedd = embedd.detach().cpu().numpy()
np.save(output_files[idx], embedd)
if args.target_dataset != "":
# create speaker_mapping if target dataset is defined
wav_file_name = os.path.basename(wav_file)
speaker_mapping[wav_file_name] = {}
speaker_mapping[wav_file_name]["name"] = speaker_name
speaker_mapping[wav_file_name]["embedding"] = embedd.flatten().tolist()
# create speaker_mapping if target dataset is defined
wav_file_name = os.path.basename(wav_file)
speaker_mapping[wav_file_name] = {}
speaker_mapping[wav_file_name]["name"] = speaker_name
speaker_mapping[wav_file_name]["embedding"] = embedd.flatten().tolist()
if args.target_dataset != "":
if speaker_mapping:
# save speaker_mapping if target dataset is defined
mapping_file_path = os.path.join(args.output_path, "speakers.json")
save_speaker_mapping(args.output_path, speaker_mapping)
if '.json' not in args.output_path:
mapping_file_path = os.path.join(args.output_path, "speakers.json")
else:
mapping_file_path = args.output_path
os.makedirs(os.path.dirname(mapping_file_path), exist_ok=True)
speaker_manager = SpeakerManager()
# pylint: disable=W0212
speaker_manager._save_json(mapping_file_path, speaker_mapping)
print("Speaker embeddings saved at:", mapping_file_path)

View File

@@ -9,10 +9,11 @@ import traceback
import torch
from torch.utils.data import DataLoader
from TTS.speaker_encoder.dataset import MyDataset
from TTS.speaker_encoder.losses import AngleProtoLoss, GE2ELoss
from TTS.speaker_encoder.model import SpeakerEncoder
from TTS.speaker_encoder.utils.io import save_best_model, save_checkpoint
from TTS.speaker_encoder.dataset import SpeakerEncoderDataset
from TTS.speaker_encoder.losses import AngleProtoLoss, GE2ELoss, SoftmaxAngleProtoLoss
from TTS.speaker_encoder.utils.generic_utils import save_best_model, setup_model
from TTS.speaker_encoder.utils.visual import plot_embeddings
from TTS.tts.datasets.preprocess import load_meta_data
from TTS.utils.arguments import init_training
@@ -34,18 +35,19 @@ def setup_loader(ap: AudioProcessor, is_val: bool = False, verbose: bool = False
if is_val:
loader = None
else:
dataset = MyDataset(
dataset = SpeakerEncoderDataset(
ap,
meta_data_eval if is_val else meta_data_train,
voice_len=1.6,
voice_len=c.voice_len,
num_utter_per_speaker=c.num_utters_per_speaker,
num_speakers_in_batch=c.num_speakers_in_batch,
skip_speakers=False,
skip_speakers=c.skip_speakers,
storage_size=c.storage["storage_size"],
sample_from_storage_p=c.storage["sample_from_storage_p"],
additive_noise=c.storage["additive_noise"],
verbose=verbose,
augmentation_config=c.audio_augmentation
)
# sampler = DistributedSampler(dataset) if num_gpus > 1 else None
loader = DataLoader(
dataset,
@@ -54,22 +56,23 @@ def setup_loader(ap: AudioProcessor, is_val: bool = False, verbose: bool = False
num_workers=c.num_loader_workers,
collate_fn=dataset.collate_fn,
)
return loader
return loader, dataset.get_num_speakers()
def train(model, criterion, optimizer, scheduler, ap, global_step):
data_loader = setup_loader(ap, is_val=False, verbose=True)
def train(model, optimizer, scheduler, criterion, data_loader, global_step):
model.train()
epoch_time = 0
best_loss = float("inf")
avg_loss = 0
avg_loss_all = 0
avg_loader_time = 0
end_time = time.time()
for _, data in enumerate(data_loader):
start_time = time.time()
# setup input data
inputs = data[0]
inputs, labels = data
loader_time = time.time() - end_time
global_step += 1
@@ -81,13 +84,13 @@ def train(model, criterion, optimizer, scheduler, ap, global_step):
# dispatch data to GPU
if use_cuda:
inputs = inputs.cuda(non_blocking=True)
# labels = labels.cuda(non_blocking=True)
labels = labels.cuda(non_blocking=True)
# forward pass model
outputs = model(inputs)
# loss computation
loss = criterion(outputs.view(c.num_speakers_in_batch, outputs.shape[0] // c.num_speakers_in_batch, -1))
loss = criterion(outputs.view(c.num_speakers_in_batch, outputs.shape[0] // c.num_speakers_in_batch, -1), labels)
loss.backward()
grad_norm, _ = check_update(model, c.grad_clip)
optimizer.step()
@@ -129,16 +132,17 @@ def train(model, criterion, optimizer, scheduler, ap, global_step):
),
flush=True,
)
avg_loss_all += avg_loss
# save best model
best_loss = save_best_model(model, optimizer, avg_loss, best_loss, OUT_PATH, global_step)
end_time = time.time()
# checkpoint and check stop train cond.
if global_step >= c.max_train_step or global_step % c.save_step == 0:
save_checkpoint(model, optimizer, avg_loss, OUT_PATH, global_step)
# save best model only
best_loss = save_best_model(model, optimizer, criterion, avg_loss, best_loss, OUT_PATH, global_step)
avg_loss_all = 0
if global_step >= c.max_train_step:
break
end_time = time.time()
return avg_loss, global_step
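As an aside, a minimal sketch (shapes assumed from the config names used above) of what the outputs.view(...) in the loss call does: the loader yields utterances in speaker-major order, so the reshape groups them as (num_speakers_in_batch, num_utters_per_speaker, proj_dim) before the criterion sees them.

import torch

num_speakers_in_batch = 4     # stands in for c.num_speakers_in_batch
num_utters_per_speaker = 2    # stands in for c.num_utters_per_speaker
proj_dim = 256

outputs = torch.randn(num_speakers_in_batch * num_utters_per_speaker, proj_dim)
grouped = outputs.view(num_speakers_in_batch, outputs.shape[0] // num_speakers_in_batch, -1)
assert grouped.shape == (num_speakers_in_batch, num_utters_per_speaker, proj_dim)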
@@ -147,43 +151,48 @@ def main(args): # pylint: disable=redefined-outer-name
global meta_data_train
global meta_data_eval
ap = AudioProcessor(**c.audio.to_dict())
model = SpeakerEncoder(
input_dim=c.model_params["input_dim"],
proj_dim=c.model_params["proj_dim"],
lstm_dim=c.model_params["lstm_dim"],
num_lstm_layers=c.model_params["num_lstm_layers"],
)
ap = AudioProcessor(**c.audio)
model = setup_model(c)
optimizer = RAdam(model.parameters(), lr=c.lr)
# pylint: disable=redefined-outer-name
meta_data_train, meta_data_eval = load_meta_data(c.datasets, eval_split=False)
data_loader, num_speakers = setup_loader(ap, is_val=False, verbose=True)
if c.loss == "ge2e":
criterion = GE2ELoss(loss_method="softmax")
elif c.loss == "angleproto":
criterion = AngleProtoLoss()
elif c.loss == "softmaxproto":
criterion = SoftmaxAngleProtoLoss(c.model["proj_dim"], num_speakers)
else:
raise Exception("The %s not is a loss supported" % c.loss)
if args.restore_path:
checkpoint = torch.load(args.restore_path)
try:
model.load_state_dict(checkpoint["model"])
except KeyError:
if 'criterion' in checkpoint:
criterion.load_state_dict(checkpoint["criterion"])
except (KeyError, RuntimeError):
print(" > Partial model initialization.")
model_dict = model.state_dict()
model_dict = set_init_dict(model_dict, checkpoint, c)
model_dict = set_init_dict(model_dict, checkpoint['model'], c)
model.load_state_dict(model_dict)
del model_dict
for group in optimizer.param_groups:
group["lr"] = c.lr
print(" > Model restored from step %d" % checkpoint["step"], flush=True)
args.restore_step = checkpoint["step"]
else:
args.restore_step = 0
if use_cuda:
model = model.cuda()
criterion.cuda()
if c.lr_decay:
scheduler = NoamLR(optimizer, warmup_steps=c.warmup_steps, last_epoch=args.restore_step - 1)
else:
@@ -192,11 +201,12 @@ def main(args): # pylint: disable=redefined-outer-name
num_params = count_parameters(model)
print("\n > Model has {} parameters".format(num_params), flush=True)
# pylint: disable=redefined-outer-name
meta_data_train, meta_data_eval = load_meta_data(c.datasets)
if use_cuda:
model = model.cuda()
criterion.cuda()
global_step = args.restore_step
_, global_step = train(model, criterion, optimizer, scheduler, ap, global_step)
_, global_step = train(model, optimizer, scheduler, criterion, data_loader, global_step)
if __name__ == "__main__":
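A minimal sketch (not the repository's implementation) of why the softmaxproto criterion is built with (proj_dim, num_speakers) and now gets its own entry in the checkpoint: it adds a trainable softmax classification head over all training speakers on top of the angular prototypical term, so it has to be moved to GPU and saved/restored like the model itself.

import torch.nn as nn

class SoftmaxProtoSketch(nn.Module):
    def __init__(self, proj_dim, num_speakers):
        super().__init__()
        # trainable head: one logit per training speaker
        self.classifier = nn.Linear(proj_dim, num_speakers)
        self.ce = nn.CrossEntropyLoss()

    def forward(self, embeddings, labels):
        # embeddings: (num_speakers_in_batch, num_utters_per_speaker, proj_dim)
        logits = self.classifier(embeddings.reshape(-1, embeddings.shape[-1]))
        softmax_term = self.ce(logits, labels.reshape(-1))
        # the real loss also adds an angular prototypical term from the same embeddings
        return softmax_term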

View File

@@ -226,7 +226,7 @@ class BaseTrainingConfig(Coqpit):
run_description: str = ""
# training params
epochs: int = 10000
batch_size: int = MISSING
batch_size: int = None
eval_batch_size: int = None
mixed_precision: bool = False
# eval params
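A minimal sketch (assumed Coqpit semantics, not from the repository) of what the MISSING-to-None change means: MISSING marks a field the user must set, while a None default makes batch_size optional, which suits configs such as the speaker encoder's that size batches via num_speakers_in_batch instead.

from dataclasses import dataclass
from coqpit import Coqpit

@dataclass
class ExampleTrainingConfig(Coqpit):
    epochs: int = 10000
    batch_size: int = None       # previously MISSING, i.e. a required field
    eval_batch_size: int = None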

View File

@@ -0,0 +1,118 @@
{
"model_name": "lstm",
"run_name": "mueller91",
"run_description": "train speaker encoder with voxceleb1, voxceleb2 and libriSpeech ",
"audio":{
// Audio processing parameters
"num_mels": 40, // size of the mel spec frame.
"fft_size": 400, // number of stft frequency levels. Size of the linear spectogram frame.
"sample_rate": 16000, // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled.
"win_length": 400, // stft window length in ms.
"hop_length": 160, // stft window hop-lengh in ms.
"frame_length_ms": null, // stft window length in ms.If null, 'win_length' is used.
"frame_shift_ms": null, // stft window hop-lengh in ms. If null, 'hop_length' is used.
"preemphasis": 0.98, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis.
"min_level_db": -100, // normalization range
"ref_level_db": 20, // reference level db, theoretically 20db is the sound of air.
"power": 1.5, // value to sharpen wav signals after GL algorithm.
"griffin_lim_iters": 60,// #griffin-lim iterations. 30-60 is a good range. Larger the value, slower the generation.
// Normalization parameters
"signal_norm": true, // normalize the spec values in range [0, 1]
"symmetric_norm": true, // move normalization to range [-1, 1]
"max_norm": 4.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm]
"clip_norm": true, // clip normalized values into the range.
"mel_fmin": 0.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!!
"mel_fmax": 8000.0, // maximum freq level for mel-spec. Tune for dataset!!
"do_trim_silence": true, // enable trimming of slience of audio as you load it. LJspeech (false), TWEB (false), Nancy (true)
"trim_db": 60, // threshold for timming silence. Set this according to your dataset.
"stats_path": null // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based notmalization is used and other normalization params are ignored
},
"reinit_layers": [],
"loss": "angleproto", // "ge2e" to use Generalized End-to-End loss and "angleproto" to use Angular Prototypical loss (new SOTA)
"grad_clip": 3.0, // upper limit for gradients for clipping.
"epochs": 1000, // total number of epochs to train.
"lr": 0.0001, // Initial learning rate. If Noam decay is active, maximum learning rate.
"lr_decay": false, // if true, Noam learning rate decaying is applied through training.
"warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr"
"tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging.
"steps_plot_stats": 10, // number of steps to plot embeddings.
"num_speakers_in_batch": 64, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'.
"num_utters_per_speaker": 10, //
"skip_speakers": false, // skip speakers with samples less than "num_utters_per_speaker"
"voice_len": 1.6, // number of seconds for each training instance
"num_loader_workers": 8, // number of training data loader processes. Don't set it too big. 4-8 are good values.
"wd": 0.000001, // Weight decay weight.
"checkpoint": true, // If true, it saves checkpoints per "save_step"
"save_step": 1000, // Number of training steps expected to save traning stats and checkpoints.
"print_step": 20, // Number of steps to log traning on console.
"output_path": "../../MozillaTTSOutput/checkpoints/voxceleb_librispeech/speaker_encoder/", // DATASET-RELATED: output path for all training outputs.
"model": {
"input_dim": 40,
"proj_dim": 256,
"lstm_dim": 768,
"num_lstm_layers": 3,
"use_lstm_with_projection": true
},
"audio_augmentation": {
"p": 0,
//add a gaussian noise to the data in order to increase robustness
"gaussian":{ // as the insertion of Gaussian noise is quick to be calculated, we added it after loading the wav file, this way, even audios that were reused with the cache can receive this noise
"p": 1, // propability of apply this method, 0 is disable
"min_amplitude": 0.0,
"max_amplitude": 1e-5
}
},
"storage": {
"sample_from_storage_p": 0.66, // the probability with which we'll sample from the DataSet in-memory storage
"storage_size": 15, // the size of the in-memory storage with respect to a single batch
"additive_noise": 1e-5 // add very small gaussian noise to the data in order to increase robustness
},
"datasets":
[
{
"name": "vctk_slim",
"path": "../../../audio-datasets/en/VCTK-Corpus/",
"meta_file_train": null,
"meta_file_val": null
},
{
"name": "libri_tts",
"path": "../../../audio-datasets/en/LibriTTS/train-clean-100",
"meta_file_train": null,
"meta_file_val": null
},
{
"name": "libri_tts",
"path": "../../../audio-datasets/en/LibriTTS/train-clean-360",
"meta_file_train": null,
"meta_file_val": null
},
{
"name": "libri_tts",
"path": "../../../audio-datasets/en/LibriTTS/train-other-500",
"meta_file_train": null,
"meta_file_val": null
},
{
"name": "voxceleb1",
"path": "../../../audio-datasets/en/voxceleb1/",
"meta_file_train": null,
"meta_file_val": null
},
{
"name": "voxceleb2",
"path": "../../../audio-datasets/en/voxceleb2/",
"meta_file_train": null,
"meta_file_val": null
},
{
"name": "common_voice",
"path": "../../../audio-datasets/en/MozillaCommonVoice",
"meta_file_train": "train.tsv",
"meta_file_val": "test.tsv"
}
]
}
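A minimal sketch (not the repository's implementation) of the "gaussian" augmentation block in this config: with probability p, zero-mean noise with an amplitude drawn between min_amplitude and max_amplitude is added to the wav right after it is loaded, so even cached audio receives fresh noise.

import numpy as np

def maybe_add_gaussian_noise(wav, p=1.0, min_amplitude=0.0, max_amplitude=1e-5):
    # wav: 1-D float array as returned by the audio loader
    if np.random.rand() > p:
        return wav
    amplitude = np.random.uniform(min_amplitude, max_amplitude)
    return wav + amplitude * np.random.randn(*wav.shape)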

View File

@@ -0,0 +1,956 @@
{
"model": "speaker_encoder",
"run_name": "speaker_encoder",
"run_description": "resnet speaker encoder trained with commonvoice all languages dev and train, Voxceleb 1 dev and Voxceleb 2 dev",
// AUDIO PARAMETERS
"audio":{
// Audio processing parameters
"num_mels": 80, // size of the mel spec frame.
"fft_size": 1024, // number of stft frequency levels. Size of the linear spectogram frame.
"sample_rate": 16000, // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled.
"win_length": 1024, // stft window length in ms.
"hop_length": 256, // stft window hop-lengh in ms.
"frame_length_ms": null, // stft window length in ms.If null, 'win_length' is used.
"frame_shift_ms": null, // stft window hop-lengh in ms. If null, 'hop_length' is used.
"preemphasis": 0.98, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis.
"min_level_db": -100, // normalization range
"ref_level_db": 20, // reference level db, theoretically 20db is the sound of air.
"power": 1.5, // value to sharpen wav signals after GL algorithm.
"griffin_lim_iters": 60,// #griffin-lim iterations. 30-60 is a good range. Larger the value, slower the generation.
"stft_pad_mode": "reflect",
// Normalization parameters
"signal_norm": true, // normalize the spec values in range [0, 1]
"symmetric_norm": true, // move normalization to range [-1, 1]
"max_norm": 4.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm]
"clip_norm": true, // clip normalized values into the range.
"mel_fmin": 0.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!!
"mel_fmax": 8000.0, // maximum freq level for mel-spec. Tune for dataset!!
"spec_gain": 20.0,
"do_trim_silence": false, // enable trimming of slience of audio as you load it. LJspeech (false), TWEB (false), Nancy (true)
"trim_db": 60, // threshold for timming silence. Set this according to your dataset.
"stats_path": null // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based notmalization is used and other normalization params are ignored
},
"reinit_layers": [],
"loss": "angleproto", // "ge2e" to use Generalized End-to-End loss, "angleproto" to use Angular Prototypical loss and "softmaxproto" to use Softmax with Angular Prototypical loss
"grad_clip": 3.0, // upper limit for gradients for clipping.
"max_train_step": 1000000, // total number of steps to train.
"lr": 0.0001, // Initial learning rate. If Noam decay is active, maximum learning rate.
"lr_decay": false, // if true, Noam learning rate decaying is applied through training.
"warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr"
"tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging.
"steps_plot_stats": 100, // number of steps to plot embeddings.
// Speakers config
"num_speakers_in_batch": 200, // Batch size for training.
"num_utters_per_speaker": 2, //
"skip_speakers": true, // skip speakers with samples less than "num_utters_per_speaker"
"voice_len": 2, // number of seconds for each training instance
"num_loader_workers": 4, // number of training data loader processes. Don't set it too big. 4-8 are good values.
"wd": 0.000001, // Weight decay weight.
"checkpoint": true, // If true, it saves checkpoints per "save_step"
"save_step": 1000, // Number of training steps expected to save the best checkpoints in training.
"print_step": 50, // Number of steps to log traning on console.
"output_path": "../checkpoints/speaker_encoder/angleproto/resnet_voxceleb1_and_voxceleb2-and-common-voice-all-using-angleproto/", // DATASET-RELATED: output path for all training outputs.
"audio_augmentation": {
"p": 0.5, // propability of apply this method, 0 is disable rir and additive noise augmentation
"rir":{
"rir_path": "/workspace/store/ecasanova/ComParE/RIRS_NOISES/simulated_rirs/",
"conv_mode": "full"
},
"additive":{
"sounds_path": "/workspace/store/ecasanova/ComParE/musan/",
// list of each of the directories in your data augmentation, if a directory is in "sounds_path" but is not listed here it will be ignored
"speech":{
"min_snr_in_db": 13,
"max_snr_in_db": 20,
"min_num_noises": 2,
"max_num_noises": 3
},
"noise":{
"min_snr_in_db": 0,
"max_snr_in_db": 15,
"min_num_noises": 1,
"max_num_noises": 1
},
"music":{
"min_snr_in_db": 5,
"max_snr_in_db": 15,
"min_num_noises": 1,
"max_num_noises": 1
}
},
//add a gaussian noise to the data in order to increase robustness
"gaussian":{ // as the insertion of Gaussian noise is quick to be calculated, we added it after loading the wav file, this way, even audios that were reused with the cache can receive this noise
"p": 0.5, // propability of apply this method, 0 is disable
"min_amplitude": 0.0,
"max_amplitude": 1e-5
}
},
"model_params": {
"model_name": "resnet",
"input_dim": 80,
"proj_dim": 512
},
"storage": {
"sample_from_storage_p": 0.5, // the probability with which we'll sample from the DataSet in-memory storage
"storage_size": 35 // the size of the in-memory storage with respect to a single batch
},
"datasets":
[
{
"name": "voxceleb2",
"path": "/workspace/scratch/ecasanova/datasets/VoxCeleb/vox2_dev_aac/",
"meta_file_train": null,
"meta_file_val": null
},
{
"name": "voxceleb1",
"path": "/workspace/scratch/ecasanova/datasets/VoxCeleb/vox1_dev_wav/",
"meta_file_train": null,
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/fi",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/fi",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/zh-CN",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/zh-CN",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/rm-sursilv",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/rm-sursilv",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/lt",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/lt",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ka",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ka",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/sv-SE",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/sv-SE",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/pl",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/pl",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ru",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ru",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/mn",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/mn",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/nl",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/nl",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/sl",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/sl",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/es",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/es",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/pt",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/pt",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/hi",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/hi",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ja",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ja",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ia",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ia",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/br",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/br",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/id",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/id",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/dv",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/dv",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ta",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ta",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/or",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/or",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/zh-HK",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/zh-HK",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/de",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/de",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/uk",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/uk",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/en",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/en",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/fa",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/fa",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/vi",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/vi",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ab",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ab",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/sah",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/sah",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/vot",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/vot",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/fr",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/fr",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/tr",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/tr",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/lg",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/lg",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/mt",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/mt",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/rw",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/rw",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/hu",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/hu",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/rm-vallader",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/rm-vallader",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/el",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/el",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/tt",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/tt",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/zh-TW",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/zh-TW",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/et",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/et",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/fy-NL",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/fy-NL",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/cs",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/cs",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/as",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/as",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ro",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ro",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/eo",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/eo",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/pa-IN",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/pa-IN",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/th",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/th",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/it",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/it",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ga-IE",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ga-IE",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/cnh",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/cnh",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ky",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ky",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ar",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ar",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/eu",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/eu",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ca",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ca",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/kab",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/kab",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/cy",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/cy",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/cv",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/cv",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/hsb",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/hsb",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/lv",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/lv",
"meta_file_train": "dev.tsv",
"meta_file_val": null
}
]
}
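A minimal sketch (not the repository's implementation) of the SNR-based "additive" augmentation configured above: a noise clip from sounds_path is scaled so the speech-to-noise ratio lands at a value drawn between min_snr_in_db and max_snr_in_db, then mixed into the training segment.

import numpy as np

def mix_at_snr(speech, noise, snr_db):
    # tile or trim the noise clip to match the speech segment length
    if len(noise) < len(speech):
        noise = np.tile(noise, int(np.ceil(len(speech) / len(noise))))
    noise = noise[: len(speech)]
    speech_power = np.mean(speech ** 2) + 1e-12
    noise_power = np.mean(noise ** 2) + 1e-12
    scale = np.sqrt(speech_power / (noise_power * 10 ** (snr_db / 10)))
    return speech + scale * noise

# e.g. for the "noise" category above: snr_db drawn uniformly from [0, 15]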

View File

@@ -0,0 +1,957 @@
{
"model": "speaker_encoder",
"run_name": "speaker_encoder",
"run_description": "resnet speaker encoder trained with commonvoice all languages dev and train, Voxceleb 1 dev and Voxceleb 2 dev",
// AUDIO PARAMETERS
"audio":{
// Audio processing parameters
"num_mels": 80, // size of the mel spec frame.
"fft_size": 1024, // number of stft frequency levels. Size of the linear spectogram frame.
"sample_rate": 16000, // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled.
"win_length": 1024, // stft window length in ms.
"hop_length": 256, // stft window hop-lengh in ms.
"frame_length_ms": null, // stft window length in ms.If null, 'win_length' is used.
"frame_shift_ms": null, // stft window hop-lengh in ms. If null, 'hop_length' is used.
"preemphasis": 0.98, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis.
"min_level_db": -100, // normalization range
"ref_level_db": 20, // reference level db, theoretically 20db is the sound of air.
"power": 1.5, // value to sharpen wav signals after GL algorithm.
"griffin_lim_iters": 60,// #griffin-lim iterations. 30-60 is a good range. Larger the value, slower the generation.
"stft_pad_mode": "reflect",
// Normalization parameters
"signal_norm": true, // normalize the spec values in range [0, 1]
"symmetric_norm": true, // move normalization to range [-1, 1]
"max_norm": 4.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm]
"clip_norm": true, // clip normalized values into the range.
"mel_fmin": 0.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!!
"mel_fmax": 8000.0, // maximum freq level for mel-spec. Tune for dataset!!
"spec_gain": 20.0,
"do_trim_silence": false, // enable trimming of slience of audio as you load it. LJspeech (false), TWEB (false), Nancy (true)
"trim_db": 60, // threshold for timming silence. Set this according to your dataset.
"stats_path": null // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based notmalization is used and other normalization params are ignored
},
"reinit_layers": [],
"loss": "softmaxproto", // "ge2e" to use Generalized End-to-End loss, "angleproto" to use Angular Prototypical loss and "softmaxproto" to use Softmax with Angular Prototypical loss
"grad_clip": 3.0, // upper limit for gradients for clipping.
"max_train_step": 1000000, // total number of steps to train.
"lr": 0.0001, // Initial learning rate. If Noam decay is active, maximum learning rate.
"lr_decay": false, // if true, Noam learning rate decaying is applied through training.
"warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr"
"tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging.
"steps_plot_stats": 100, // number of steps to plot embeddings.
// Speakers config
"num_speakers_in_batch": 200, // Batch size for training.
"num_utters_per_speaker": 2, //
"skip_speakers": true, // skip speakers with samples less than "num_utters_per_speaker"
"voice_len": 2, // number of seconds for each training instance
"num_loader_workers": 8, // number of training data loader processes. Don't set it too big. 4-8 are good values.
"wd": 0.000001, // Weight decay weight.
"checkpoint": true, // If true, it saves checkpoints per "save_step"
"save_step": 1000, // Number of training steps expected to save the best checkpoints in training.
"print_step": 50, // Number of steps to log traning on console.
"output_path": "../../../checkpoints/speaker_encoder/resnet_voxceleb1_and_voxceleb2-and-common-voice-all/", // DATASET-RELATED: output path for all training outputs.
"audio_augmentation": {
"p": 0.5, // propability of apply this method, 0 is disable rir and additive noise augmentation
"rir":{
"rir_path": "/workspace/store/ecasanova/ComParE/RIRS_NOISES/simulated_rirs/",
"conv_mode": "full"
},
"additive":{
"sounds_path": "/workspace/store/ecasanova/ComParE/musan/",
// list of each of the directories in your data augmentation, if a directory is in "sounds_path" but is not listed here it will be ignored
"speech":{
"min_snr_in_db": 13,
"max_snr_in_db": 20,
"min_num_noises": 2,
"max_num_noises": 3
},
"noise":{
"min_snr_in_db": 0,
"max_snr_in_db": 15,
"min_num_noises": 1,
"max_num_noises": 1
},
"music":{
"min_snr_in_db": 5,
"max_snr_in_db": 15,
"min_num_noises": 1,
"max_num_noises": 1
}
},
//add a gaussian noise to the data in order to increase robustness
"gaussian":{ // as the insertion of Gaussian noise is quick to be calculated, we added it after loading the wav file, this way, even audios that were reused with the cache can receive this noise
"p": 0.5, // propability of apply this method, 0 is disable
"min_amplitude": 0.0,
"max_amplitude": 1e-5
}
},
"model_params": {
"model_name": "resnet",
"input_dim": 80,
"proj_dim": 512
},
"storage": {
"sample_from_storage_p": 0.66, // the probability with which we'll sample from the DataSet in-memory storage
"storage_size": 35 // the size of the in-memory storage with respect to a single batch
},
"datasets":
[
{
"name": "voxceleb2",
"path": "/workspace/scratch/ecasanova/datasets/VoxCeleb/vox2_dev_aac/",
"meta_file_train": null,
"meta_file_val": null
},
{
"name": "voxceleb1",
"path": "/workspace/scratch/ecasanova/datasets/VoxCeleb/vox1_dev_wav/",
"meta_file_train": null,
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/fi",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/fi",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/zh-CN",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/zh-CN",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/rm-sursilv",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/rm-sursilv",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/lt",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/lt",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ka",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ka",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/sv-SE",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/sv-SE",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/pl",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/pl",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ru",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ru",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/mn",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/mn",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/nl",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/nl",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/sl",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/sl",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/es",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/es",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/pt",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/pt",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/hi",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/hi",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ja",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ja",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ia",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ia",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/br",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/br",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/id",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/id",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/dv",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/dv",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ta",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ta",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/or",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/or",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/zh-HK",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/zh-HK",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/de",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/de",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/uk",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/uk",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/en",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/en",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/fa",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/fa",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/vi",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/vi",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ab",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ab",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/sah",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/sah",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/vot",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/vot",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/fr",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/fr",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/tr",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/tr",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/lg",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/lg",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/mt",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/mt",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/rw",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/rw",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/hu",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/hu",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/rm-vallader",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/rm-vallader",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/el",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/el",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/tt",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/tt",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/zh-TW",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/zh-TW",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/et",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/et",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/fy-NL",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/fy-NL",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/cs",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/cs",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/as",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/as",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ro",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ro",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/eo",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/eo",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/pa-IN",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/pa-IN",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/th",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/th",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/it",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/it",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ga-IE",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ga-IE",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/cnh",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/cnh",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ky",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ky",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ar",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ar",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/eu",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/eu",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ca",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ca",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/kab",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/kab",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/cy",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/cy",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/cv",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/cv",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/hsb",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/hsb",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/lv",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/lv",
"meta_file_train": "dev.tsv",
"meta_file_val": null
}
]
}
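The dataset list above repeats the same pair of entries (train.tsv and dev.tsv) for every Common Voice language. Rather than maintaining it by hand, a list with this layout can be generated programmatically — a minimal sketch, assuming the same field names as the config above; the root path and language codes are placeholders:

import json

# placeholder root and language list; point these at your own Common Voice extraction
CV_ROOT = "/data/common-voice/cv-corpus-6.1-2020-12-11_16khz"
LANGUAGES = ["hi", "ja", "de", "en", "fr"]

datasets = []
for lang in LANGUAGES:
    for meta_file in ("train.tsv", "dev.tsv"):
        datasets.append({
            "name": "common_voice",
            "path": f"{CV_ROOT}/{lang}",
            "meta_file_train": meta_file,
            "meta_file_val": None,
        })

# merge into the rest of the training config before writing it back to disk
print(json.dumps({"datasets": datasets}, indent=4))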

View File

@ -1,24 +1,24 @@
import queue
import random
import numpy as np
import torch
from torch.utils.data import Dataset
from TTS.speaker_encoder.utils.generic_utils import AugmentWAV, Storage
class MyDataset(Dataset):
class SpeakerEncoderDataset(Dataset):
def __init__(
self,
ap,
meta_data,
voice_len=1.6,
num_speakers_in_batch=64,
storage_size=1,
sample_from_storage_p=0.5,
additive_noise=0,
num_utter_per_speaker=10,
skip_speakers=False,
verbose=False,
self,
ap,
meta_data,
voice_len=1.6,
num_speakers_in_batch=64,
storage_size=1,
sample_from_storage_p=0.5,
num_utter_per_speaker=10,
skip_speakers=False,
verbose=False,
augmentation_config=None
):
"""
Args:
@ -30,7 +30,6 @@ class MyDataset(Dataset):
super().__init__()
self.items = meta_data
self.sample_rate = ap.sample_rate
self.voice_len = voice_len
self.seq_len = int(voice_len * self.sample_rate)
self.num_speakers_in_batch = num_speakers_in_batch
self.num_utter_per_speaker = num_utter_per_speaker
@ -38,15 +37,30 @@ class MyDataset(Dataset):
self.ap = ap
self.verbose = verbose
self.__parse_items()
self.storage = queue.Queue(maxsize=storage_size * num_speakers_in_batch)
storage_max_size = storage_size * num_speakers_in_batch
self.storage = Storage(maxsize=storage_max_size, storage_batchs=storage_size, num_speakers_in_batch=num_speakers_in_batch)
self.sample_from_storage_p = float(sample_from_storage_p)
self.additive_noise = float(additive_noise)
speakers_aux = list(self.speakers)
speakers_aux.sort()
self.speakerid_to_classid = {key : i for i, key in enumerate(speakers_aux)}
# Augmentation
self.augmentator = None
self.gaussian_augmentation_config = None
if augmentation_config:
self.data_augmentation_p = augmentation_config['p']
if self.data_augmentation_p and ('additive' in augmentation_config or 'rir' in augmentation_config):
self.augmentator = AugmentWAV(ap, augmentation_config)
if 'gaussian' in augmentation_config.keys():
self.gaussian_augmentation_config = augmentation_config['gaussian']
if self.verbose:
print("\n > DataLoader initialization")
print(f" | > Speakers per Batch: {num_speakers_in_batch}")
print(f" | > Storage Size: {self.storage.maxsize} speakers, each with {num_utter_per_speaker} utters")
print(f" | > Storage Size: {storage_max_size} instances, each with {num_utter_per_speaker} utters")
print(f" | > Sample_from_storage_p : {self.sample_from_storage_p}")
print(f" | > Noise added : {self.additive_noise}")
print(f" | > Number of instances : {len(self.items)}")
print(f" | > Sequence length: {self.seq_len}")
print(f" | > Num speakers: {len(self.speakers)}")
@ -90,28 +104,21 @@ class MyDataset(Dataset):
self.speakers = [k for (k, v) in self.speaker_to_utters.items()]
# def __parse_items(self):
# """
# Find unique speaker ids and create a dict mapping utterances from speaker id
# """
# speakers = list({item[-1] for item in self.items})
# self.speaker_to_utters = {}
# self.speakers = []
# for speaker in speakers:
# speaker_utters = [item[1] for item in self.items if item[2] == speaker]
# if len(speaker_utters) < self.num_utter_per_speaker and self.skip_speakers:
# print(
# f" [!] Skipped speaker {speaker}. Not enough utterances {self.num_utter_per_speaker} vs {len(speaker_utters)}."
# )
# else:
# self.speakers.append(speaker)
# self.speaker_to_utters[speaker] = speaker_utters
def __len__(self):
return int(1e10)
def __sample_speaker(self):
def get_num_speakers(self):
return len(self.speakers)
def __sample_speaker(self, ignore_speakers=None):
speaker = random.sample(self.speakers, 1)[0]
# if a list of speaker ids is provided, make sure they are ignored
if ignore_speakers and self.speakerid_to_classid[speaker] in ignore_speakers:
while True:
speaker = random.sample(self.speakers, 1)[0]
if self.speakerid_to_classid[speaker] not in ignore_speakers:
break
if self.num_utter_per_speaker > len(self.speaker_to_utters[speaker]):
utters = random.choices(self.speaker_to_utters[speaker], k=self.num_utter_per_speaker)
else:
@ -127,54 +134,113 @@ class MyDataset(Dataset):
for _ in range(self.num_utter_per_speaker):
# TODO:dummy but works
while True:
if len(self.speaker_to_utters[speaker]) > 0:
# skip speakers that have fewer than 2 utterances left
if len(self.speaker_to_utters[speaker]) > 1:
utter = random.sample(self.speaker_to_utters[speaker], 1)[0]
else:
self.speakers.remove(speaker)
if speaker in self.speakers:
self.speakers.remove(speaker)
speaker, _ = self.__sample_speaker()
continue
wav = self.load_wav(utter)
if wav.shape[0] - self.seq_len > 0:
break
self.speaker_to_utters[speaker].remove(utter)
if utter in self.speaker_to_utters[speaker]:
self.speaker_to_utters[speaker].remove(utter)
if self.augmentator is not None and self.data_augmentation_p:
if random.random() < self.data_augmentation_p:
wav = self.augmentator.apply_one(wav)
wavs.append(wav)
labels.append(speaker)
labels.append(self.speakerid_to_classid[speaker])
return wavs, labels
def __getitem__(self, idx):
speaker, _ = self.__sample_speaker()
return speaker
speaker_id = self.speakerid_to_classid[speaker]
return speaker, speaker_id
def __load_from_disk_and_storage(self, speaker):
# don't sample from storage, but from HDD
wavs_, labels_ = self.__sample_speaker_utterances(speaker)
# put the newly loaded item into storage
self.storage.append((wavs_, labels_))
return wavs_, labels_
def collate_fn(self, batch):
# get the batch speaker_ids
batch = np.array(batch)
speakers_id_in_batch = set(batch[:, 1].astype(np.int32))
labels = []
feats = []
for speaker in batch:
speakers = set()
for speaker, speaker_id in batch:
speaker_id = int(speaker_id)
# ensure that a speaker appears only once in the batch
if speaker_id in speakers:
# remove current speaker
if speaker_id in speakers_id_in_batch:
speakers_id_in_batch.remove(speaker_id)
speaker, _ = self.__sample_speaker(ignore_speakers=speakers_id_in_batch)
speaker_id = self.speakerid_to_classid[speaker]
speakers_id_in_batch.add(speaker_id)
if random.random() < self.sample_from_storage_p and self.storage.full():
# sample from storage (if full), ignoring the speaker
wavs_, labels_ = random.choice(self.storage.queue)
# sample from storage (if full)
wavs_, labels_ = self.storage.get_random_sample_fast()
# force choosing either the current speaker or one not already in the batch
# this is necessary for proper training with the AngleProto and GE2E losses
if labels_[0] in speakers_id_in_batch and labels_[0] != speaker_id:
attempts = 0
while True:
wavs_, labels_ = self.storage.get_random_sample_fast()
if labels_[0] == speaker_id or labels_[0] not in speakers_id_in_batch:
break
attempts += 1
# try up to 5 times; after that, load from disk instead
if attempts >= 5:
wavs_, labels_ = self.__load_from_disk_and_storage(speaker)
break
else:
# don't sample from storage, but from HDD
wavs_, labels_ = self.__sample_speaker_utterances(speaker)
# if storage is full, remove an item
if self.storage.full():
_ = self.storage.get_nowait()
# put the newly loaded item into storage
self.storage.put_nowait((wavs_, labels_))
wavs_, labels_ = self.__load_from_disk_and_storage(speaker)
# add random gaussian noise
if self.additive_noise > 0:
noises_ = [np.random.normal(0, self.additive_noise, size=len(w)) for w in wavs_]
wavs_ = [wavs_[i] + noises_[i] for i in range(len(wavs_))]
# track the speakers already placed in this batch
speakers.add(labels_[0])
# get a random subset of each of the wavs and convert to MFCC.
offsets_ = [random.randint(0, wav.shape[0] - self.seq_len) for wav in wavs_]
mels_ = [
self.ap.melspectrogram(wavs_[i][offsets_[i] : offsets_[i] + self.seq_len]) for i in range(len(wavs_))
]
feats_ = [torch.FloatTensor(mel) for mel in mels_]
# remove current speaker and append other
if speaker_id in speakers_id_in_batch:
speakers_id_in_batch.remove(speaker_id)
labels.append(labels_)
speakers_id_in_batch.add(labels_[0])
# get a random subset of each of the wavs and extract mel spectrograms.
feats_ = []
for wav in wavs_:
offset = random.randint(0, wav.shape[0] - self.seq_len)
wav = wav[offset : offset + self.seq_len]
# add random gaussian noise
if self.gaussian_augmentation_config and self.gaussian_augmentation_config['p']:
if random.random() < self.gaussian_augmentation_config['p']:
wav += np.random.normal(self.gaussian_augmentation_config['min_amplitude'], self.gaussian_augmentation_config['max_amplitude'], size=len(wav))
mel = self.ap.melspectrogram(wav)
feats_.append(torch.FloatTensor(mel))
labels.append(torch.LongTensor(labels_))
feats.extend(feats_)
feats = torch.stack(feats)
labels = torch.stack(labels)
return feats.transpose(1, 2), labels
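For orientation, a minimal sketch of how the renamed SpeakerEncoderDataset is wired to a PyTorch DataLoader with its custom collate_fn. The argument values are placeholders; `ap` and `meta_data` are assumed to be an initialized AudioProcessor and the item list loaded from the configured datasets, as elsewhere in this PR:

from torch.utils.data import DataLoader

dataset = SpeakerEncoderDataset(
    ap,                          # TTS.utils.audio.AudioProcessor instance
    meta_data,                   # list of (text, wav_path, speaker) items
    voice_len=1.6,
    num_speakers_in_batch=64,
    num_utter_per_speaker=10,
    skip_speakers=True,
    storage_size=15,
    sample_from_storage_p=0.66,
    augmentation_config=None,
)

# one dataset item == one speaker; collate_fn expands each speaker into
# num_utter_per_speaker mel windows and returns (N, T, num_mels) features
# plus integer class labels
loader = DataLoader(
    dataset,
    batch_size=dataset.num_speakers_in_batch,
    num_workers=4,
    collate_fn=dataset.collate_fn,
)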

View File

@ -103,15 +103,18 @@ class GE2ELoss(nn.Module):
L.append(L_row)
return torch.stack(L)
def forward(self, dvecs):
def forward(self, x, _label=None):
"""
Calculates the GE2E loss for an input of dimensions (num_speakers, num_utts_per_speaker, dvec_feats)
"""
centroids = torch.mean(dvecs, 1)
cos_sim_matrix = self.calc_cosine_sim(dvecs, centroids)
assert x.size()[1] >= 2
centroids = torch.mean(x, 1)
cos_sim_matrix = self.calc_cosine_sim(x, centroids)
torch.clamp(self.w, 1e-6)
cos_sim_matrix = self.w * cos_sim_matrix + self.b
L = self.embed_loss(dvecs, cos_sim_matrix)
L = self.embed_loss(x, cos_sim_matrix)
return L.mean()
@ -138,10 +141,13 @@ class AngleProtoLoss(nn.Module):
print(" > Initialized Angular Prototypical loss")
def forward(self, x):
def forward(self, x, _label=None):
"""
Calculates the AngleProto loss for an input of dimensions (num_speakers, num_utts_per_speaker, dvec_feats)
"""
assert x.size()[1] >= 2
out_anchor = torch.mean(x[:, 1:, :], 1)
out_positive = x[:, 0, :]
num_speakers = out_anchor.size()[0]
@ -155,3 +161,56 @@ class AngleProtoLoss(nn.Module):
label = torch.arange(num_speakers).to(cos_sim_matrix.device)
L = self.criterion(cos_sim_matrix, label)
return L
class SoftmaxLoss(nn.Module):
"""
Implementation of the Softmax loss as defined in https://arxiv.org/abs/2003.11982
Args:
- embedding_dim (int): speaker embedding dim
- n_speakers (int): number of speakers
"""
def __init__(self, embedding_dim, n_speakers):
super().__init__()
self.criterion = torch.nn.CrossEntropyLoss()
self.fc = nn.Linear(embedding_dim, n_speakers)
print('Initialised Softmax Loss')
def forward(self, x, label=None):
# reshape for compatibility
x = x.reshape(-1, x.size()[-1])
label = label.reshape(-1)
x = self.fc(x)
L = self.criterion(x, label)
return L
class SoftmaxAngleProtoLoss(nn.Module):
"""
Implementation of the Softmax AnglePrototypical loss as defined in https://arxiv.org/abs/2009.14153
Args:
- embedding_dim (int): speaker embedding dim
- n_speakers (int): number of speakers
- init_w (float): defines the initial value of w
- init_b (float): defines the initial value of b
"""
def __init__(self, embedding_dim, n_speakers, init_w=10.0, init_b=-5.0):
super().__init__()
self.softmax = SoftmaxLoss(embedding_dim, n_speakers)
self.angleproto = AngleProtoLoss(init_w, init_b)
print('Initialised SoftmaxAnglePrototypical Loss')
def forward(self, x, label=None):
"""
Calculates the SoftmaxAnglePrototypical loss for an input of dimensions (num_speakers, num_utts_per_speaker, dvec_feats)
"""
Lp = self.angleproto(x)
Ls = self.softmax(x, label)
return Ls+Lp
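A short usage sketch for the loss classes above — all of them now share the forward(x, label) signature, with x stacked as (num_speakers, num_utters_per_speaker, embedding_dim). Constructor defaults are assumed here, and the sizes are placeholders:

import torch

num_speakers, num_utters, emb_dim = 64, 10, 256
embeddings = torch.randn(num_speakers, num_utters, emb_dim)
labels = torch.arange(num_speakers).unsqueeze(1).repeat(1, num_utters)

ge2e = GE2ELoss()
angle_proto = AngleProtoLoss()
softmax_angle_proto = SoftmaxAngleProtoLoss(embedding_dim=emb_dim, n_speakers=num_speakers)

# GE2E and AngleProto ignore the label argument; SoftmaxAngleProto needs it
print(ge2e(embeddings).item())
print(angle_proto(embeddings).item())
print(softmax_angle_proto(embeddings, labels).item())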

View File

@ -29,7 +29,7 @@ class LSTMWithoutProjection(nn.Module):
return self.relu(self.linear(hidden[-1]))
class SpeakerEncoder(nn.Module):
class LSTMSpeakerEncoder(nn.Module):
def __init__(self, input_dim, proj_dim=256, lstm_dim=768, num_lstm_layers=3, use_lstm_with_projection=True):
super().__init__()
self.use_lstm_with_projection = use_lstm_with_projection

View File

@ -0,0 +1,190 @@
import torch
import numpy as np
import torch.nn as nn
class SELayer(nn.Module):
def __init__(self, channel, reduction=8):
super(SELayer, self).__init__()
self.avg_pool = nn.AdaptiveAvgPool2d(1)
self.fc = nn.Sequential(
nn.Linear(channel, channel // reduction),
nn.ReLU(inplace=True),
nn.Linear(channel // reduction, channel),
nn.Sigmoid()
)
def forward(self, x):
b, c, _, _ = x.size()
y = self.avg_pool(x).view(b, c)
y = self.fc(y).view(b, c, 1, 1)
return x * y
class SEBasicBlock(nn.Module):
expansion = 1
def __init__(self, inplanes, planes, stride=1, downsample=None, reduction=8):
super(SEBasicBlock, self).__init__()
self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
self.bn1 = nn.BatchNorm2d(planes)
self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, padding=1, bias=False)
self.bn2 = nn.BatchNorm2d(planes)
self.relu = nn.ReLU(inplace=True)
self.se = SELayer(planes, reduction)
self.downsample = downsample
self.stride = stride
def forward(self, x):
residual = x
out = self.conv1(x)
out = self.relu(out)
out = self.bn1(out)
out = self.conv2(out)
out = self.bn2(out)
out = self.se(out)
if self.downsample is not None:
residual = self.downsample(x)
out += residual
out = self.relu(out)
return out
class ResNetSpeakerEncoder(nn.Module):
"""Implementation of the model H/ASP without batch normalization in speaker embedding. This model was proposed in: https://arxiv.org/abs/2009.14153
Adapted from: https://github.com/clovaai/voxceleb_trainer
"""
# pylint: disable=W0102
def __init__(self, input_dim=64, proj_dim=512, layers=[3, 4, 6, 3], num_filters=[32, 64, 128, 256], encoder_type='ASP', log_input=False):
super(ResNetSpeakerEncoder, self).__init__()
self.encoder_type = encoder_type
self.input_dim = input_dim
self.log_input = log_input
self.conv1 = nn.Conv2d(1, num_filters[0], kernel_size=3, stride=1, padding=1)
self.relu = nn.ReLU(inplace=True)
self.bn1 = nn.BatchNorm2d(num_filters[0])
self.inplanes = num_filters[0]
self.layer1 = self.create_layer(SEBasicBlock, num_filters[0], layers[0])
self.layer2 = self.create_layer(SEBasicBlock, num_filters[1], layers[1], stride=(2, 2))
self.layer3 = self.create_layer(SEBasicBlock, num_filters[2], layers[2], stride=(2, 2))
self.layer4 = self.create_layer(SEBasicBlock, num_filters[3], layers[3], stride=(2, 2))
self.instancenorm = nn.InstanceNorm1d(input_dim)
outmap_size = int(self.input_dim/8)
self.attention = nn.Sequential(
nn.Conv1d(num_filters[3] * outmap_size, 128, kernel_size=1),
nn.ReLU(),
nn.BatchNorm1d(128),
nn.Conv1d(128, num_filters[3] * outmap_size, kernel_size=1),
nn.Softmax(dim=2),
)
if self.encoder_type == "SAP":
out_dim = num_filters[3] * outmap_size
elif self.encoder_type == "ASP":
out_dim = num_filters[3] * outmap_size * 2
else:
raise ValueError('Undefined encoder')
self.fc = nn.Linear(out_dim, proj_dim)
self._init_layers()
def _init_layers(self):
for m in self.modules():
if isinstance(m, nn.Conv2d):
nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
elif isinstance(m, nn.BatchNorm2d):
nn.init.constant_(m.weight, 1)
nn.init.constant_(m.bias, 0)
def create_layer(self, block, planes, blocks, stride=1):
downsample = None
if stride != 1 or self.inplanes != planes * block.expansion:
downsample = nn.Sequential(
nn.Conv2d(self.inplanes, planes * block.expansion,
kernel_size=1, stride=stride, bias=False),
nn.BatchNorm2d(planes * block.expansion),
)
layers = []
layers.append(block(self.inplanes, planes, stride, downsample))
self.inplanes = planes * block.expansion
for _ in range(1, blocks):
layers.append(block(self.inplanes, planes))
return nn.Sequential(*layers)
# pylint: disable=R0201
def new_parameter(self, *size):
out = nn.Parameter(torch.FloatTensor(*size))
nn.init.xavier_normal_(out)
return out
def forward(self, x, l2_norm=False):
x = x.transpose(1, 2)
with torch.no_grad():
with torch.cuda.amp.autocast(enabled=False):
if self.log_input:
x = (x+1e-6).log()
x = self.instancenorm(x).unsqueeze(1)
x = self.conv1(x)
x = self.relu(x)
x = self.bn1(x)
x = self.layer1(x)
x = self.layer2(x)
x = self.layer3(x)
x = self.layer4(x)
x = x.reshape(x.size()[0], -1, x.size()[-1])
w = self.attention(x)
if self.encoder_type == "SAP":
x = torch.sum(x * w, dim=2)
elif self.encoder_type == "ASP":
mu = torch.sum(x * w, dim=2)
sg = torch.sqrt((torch.sum((x**2) * w, dim=2) - mu ** 2).clamp(min=1e-5))
x = torch.cat((mu, sg), 1)
x = x.view(x.size()[0], -1)
x = self.fc(x)
if l2_norm:
x = torch.nn.functional.normalize(x, p=2, dim=1)
return x
@torch.no_grad()
def compute_embedding(self, x, num_frames=250, num_eval=10, return_mean=True):
"""
Generate embeddings for a batch of utterances
x: 1xTxD
"""
max_len = x.shape[1]
if max_len < num_frames:
num_frames = max_len
offsets = np.linspace(0, max_len-num_frames, num=num_eval)
frames_batch = []
for offset in offsets:
offset = int(offset)
end_offset = int(offset+num_frames)
frames = x[:, offset:end_offset]
frames_batch.append(frames)
frames_batch = torch.cat(frames_batch, dim=0)
embeddings = self.forward(frames_batch, l2_norm=True)
if return_mean:
embeddings = torch.mean(embeddings, dim=0, keepdim=True)
return embeddings
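A quick usage sketch for the new ResNet encoder. Inputs follow the (batch, frames, input_dim) layout used by forward(); the shapes and sizes below are placeholders:

import torch

model = ResNetSpeakerEncoder(input_dim=64, proj_dim=512, encoder_type="ASP")
model.eval()

# training-style forward pass on a batch of fixed-length mel windows
mels = torch.rand(8, 200, 64)            # (batch, frames, input_dim)
d_vectors = model(mels, l2_norm=True)    # -> (8, 512)

# inference-style embedding of a single utterance, averaged over num_eval windows
utterance = torch.rand(1, 1200, 64)      # (1, frames, input_dim)
embedding = model.compute_embedding(utterance, num_frames=250, num_eval=10)
print(d_vectors.shape, embedding.shape)  # torch.Size([8, 512]) torch.Size([1, 512])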

View File

@ -13,11 +13,11 @@ class SpeakerEncoderConfig(BaseTrainingConfig):
model: str = "speaker_encoder"
audio: BaseAudioConfig = field(default_factory=BaseAudioConfig)
datasets: List[BaseDatasetConfig] = field(default_factory=lambda: [BaseDatasetConfig()])
# model params
model_params: dict = field(
default_factory=lambda: {
"input_dim": 40,
"model_name": "lstm",
"input_dim": 80,
"proj_dim": 256,
"lstm_dim": 768,
"num_lstm_layers": 3,
@ -25,16 +25,20 @@ class SpeakerEncoderConfig(BaseTrainingConfig):
}
)
audio_augmentation : dict = field(
default_factory=lambda: {
}
)
storage: dict = field(
default_factory=lambda: {
"sample_from_storage_p": 0.66, # the probability with which we'll sample from the DataSet in-memory storage
"storage_size": 15, # the size of the in-memory storage with respect to a single batch
"additive_noise": 1e-5, # add very small gaussian noise to the data in order to increase robustness
}
)
# training params
max_train_step: int = 1000 # end training when number of training steps reaches this value.
max_train_step: int = 1000000 # end training when number of training steps reaches this value.
loss: str = "angleproto"
grad_clip: float = 3.0
lr: float = 0.0001
@ -53,6 +57,8 @@ class SpeakerEncoderConfig(BaseTrainingConfig):
num_speakers_in_batch: int = MISSING
num_utters_per_speaker: int = MISSING
num_loader_workers: int = MISSING
skip_speakers: bool = False
voice_len: float = 1.6
def check_values(self):
super().check_values()
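A minimal sketch of filling in the fields this change adds to SpeakerEncoderConfig (model_name inside model_params, audio_augmentation, skip_speakers, voice_len). The values are illustrative only, not recommendations:

from TTS.speaker_encoder.speaker_encoder_config import SpeakerEncoderConfig

config = SpeakerEncoderConfig(
    model_params={
        "model_name": "resnet",   # or "lstm" for the previous architecture
        "input_dim": 80,
        "proj_dim": 512,
    },
    loss="angleproto",
    num_speakers_in_batch=64,
    num_utters_per_speaker=10,
    num_loader_workers=8,
    skip_speakers=True,
    voice_len=1.6,
)
config.check_values()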

View File

@ -1,18 +1,201 @@
import re
import os
from TTS.speaker_encoder.model import SpeakerEncoder
import numpy as np
import torch
import glob
import random
import datetime
from scipy import signal
from multiprocessing import Manager
from TTS.speaker_encoder.models.lstm import LSTMSpeakerEncoder
from TTS.speaker_encoder.models.resnet import ResNetSpeakerEncoder
class Storage(object):
def __init__(self, maxsize, storage_batchs, num_speakers_in_batch, num_threads=8):
# use a multiprocessing Manager list for thread safety
self.storage = Manager().list()
self.maxsize = maxsize
self.num_speakers_in_batch = num_speakers_in_batch
self.num_threads = num_threads
self.ignore_last_batch = False
if storage_batchs >= 3:
self.ignore_last_batch = True
# used for fast random sample
self.safe_storage_size = self.maxsize - self.num_threads
if self.ignore_last_batch:
self.safe_storage_size -= self.num_speakers_in_batch
def __len__(self):
return len(self.storage)
def full(self):
return len(self.storage) >= self.maxsize
def append(self, item):
# if storage is full, remove an item
if self.full():
self.storage.pop(0)
self.storage.append(item)
def get_random_sample(self):
# safe storage size, in case every worker thread removes one item from storage at the same time
storage_size = len(self.storage) - self.num_threads
if self.ignore_last_batch:
storage_size -= self.num_speakers_in_batch
return self.storage[random.randint(0, storage_size)]
def get_random_sample_fast(self):
'''Call this method only when storage is full'''
return self.storage[random.randint(0, self.safe_storage_size)]
class AugmentWAV(object):
def __init__(self, ap, augmentation_config):
self.ap = ap
self.use_additive_noise = False
if 'additive' in augmentation_config.keys():
self.additive_noise_config = augmentation_config['additive']
additive_path = self.additive_noise_config['sounds_path']
if additive_path:
self.use_additive_noise = True
# get noise types
self.additive_noise_types = []
for key in self.additive_noise_config.keys():
if isinstance(self.additive_noise_config[key], dict):
self.additive_noise_types.append(key)
additive_files = glob.glob(os.path.join(additive_path, '**/*.wav'), recursive=True)
self.noise_list = {}
for wav_file in additive_files:
noise_dir = wav_file.replace(additive_path, '').split(os.sep)[0]
# ignore directories not listed as noise types
if noise_dir not in self.additive_noise_types:
continue
if noise_dir not in self.noise_list:
self.noise_list[noise_dir] = []
self.noise_list[noise_dir].append(wav_file)
print(f" | > Using Additive Noise Augmentation: with {len(additive_files)} audios instances from {self.additive_noise_types}")
self.use_rir = False
if 'rir' in augmentation_config.keys():
self.rir_config = augmentation_config['rir']
if self.rir_config['rir_path']:
self.rir_files = glob.glob(os.path.join(self.rir_config['rir_path'], '**/*.wav'), recursive=True)
self.use_rir = True
print(f" | > Using RIR Noise Augmentation: with {len(self.rir_files)} audios instances")
self.create_augmentation_global_list()
def create_augmentation_global_list(self):
if self.use_additive_noise:
self.global_noise_list = self.additive_noise_types
else:
self.global_noise_list = []
if self.use_rir:
self.global_noise_list.append("RIR_AUG")
def additive_noise(self, noise_type, audio):
clean_db = 10 * np.log10(np.mean(audio**2) + 1e-4)
noise_list = random.sample(self.noise_list[noise_type], random.randint(self.additive_noise_config[noise_type]['min_num_noises'], self.additive_noise_config[noise_type]['max_num_noises']))
audio_len = audio.shape[0]
noises_wav = None
for noise in noise_list:
noiseaudio = self.ap.load_wav(noise, sr=self.ap.sample_rate)[:audio_len]
if noiseaudio.shape[0] < audio_len:
continue
noise_snr = random.uniform(self.additive_noise_config[noise_type]['min_snr_in_db'], self.additive_noise_config[noise_type]['max_snr_in_db'])
noise_db = 10 * np.log10(np.mean(noiseaudio ** 2) + 1e-4)
noise_wav = np.sqrt(10 ** ((clean_db - noise_db - noise_snr) / 10)) * noiseaudio
if noises_wav is None:
noises_wav = noise_wav
else:
noises_wav += noise_wav
# if every sampled noise file was shorter than the audio, sample again
if noises_wav is None:
return self.additive_noise(noise_type, audio)
return audio + noises_wav
def reverberate(self, audio):
audio_len = audio.shape[0]
rir_file = random.choice(self.rir_files)
rir = self.ap.load_wav(rir_file, sr=self.ap.sample_rate)
rir = rir / np.sqrt(np.sum(rir ** 2))
return signal.convolve(audio, rir, mode=self.rir_config['conv_mode'])[:audio_len]
def apply_one(self, audio):
noise_type = random.choice(self.global_noise_list)
if noise_type == "RIR_AUG":
return self.reverberate(audio)
return self.additive_noise(noise_type, audio)
def to_camel(text):
text = text.capitalize()
return re.sub(r"(?!^)_([a-zA-Z])", lambda m: m.group(1).upper(), text)
def setup_model(c):
model = SpeakerEncoder(
c.model_params["input_dim"],
c.model_params["proj_dim"],
c.model_params["lstm_dim"],
c.model_params["num_lstm_layers"],
)
if c.model_params['model_name'].lower() == 'lstm':
model = LSTMSpeakerEncoder(c.model_params["input_dim"], c.model_params["proj_dim"], c.model_params["lstm_dim"], c.model_params["num_lstm_layers"])
elif c.model_params['model_name'].lower() == 'resnet':
model = ResNetSpeakerEncoder(input_dim=c.model_params["input_dim"], proj_dim=c.model_params["proj_dim"])
return model
def save_checkpoint(model, optimizer, criterion, model_loss, out_path, current_step, epoch):
checkpoint_path = "checkpoint_{}.pth.tar".format(current_step)
checkpoint_path = os.path.join(out_path, checkpoint_path)
print(" | | > Checkpoint saving : {}".format(checkpoint_path))
new_state_dict = model.state_dict()
state = {
"model": new_state_dict,
"optimizer": optimizer.state_dict() if optimizer is not None else None,
"criterion": criterion.state_dict(),
"step": current_step,
"epoch": epoch,
"loss": model_loss,
"date": datetime.date.today().strftime("%B %d, %Y"),
}
torch.save(state, checkpoint_path)
def save_best_model(model, optimizer, criterion, model_loss, best_loss, out_path, current_step):
if model_loss < best_loss:
new_state_dict = model.state_dict()
state = {
"model": new_state_dict,
"optimizer": optimizer.state_dict(),
"criterion": criterion.state_dict(),
"step": current_step,
"loss": model_loss,
"date": datetime.date.today().strftime("%B %d, %Y"),
}
best_loss = model_loss
bestmodel_path = "best_model.pth.tar"
bestmodel_path = os.path.join(out_path, bestmodel_path)
print("\n > BEST MODEL ({0:.5f}) : {1:}".format(model_loss, bestmodel_path))
torch.save(state, bestmodel_path)
return best_loss
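The augmentation_config dict consumed by AugmentWAV (and forwarded by the dataset) is not documented in this diff; the structure below is inferred from the keys the code reads. Paths are placeholders and the numeric values are illustrative only:

audio_augmentation = {
    # probability of applying additive-noise/RIR augmentation to a sampled utterance
    "p": 0.5,
    "rir": {
        "rir_path": "/data/RIRS_NOISES/simulated_rirs",   # placeholder path
        "conv_mode": "full",
    },
    "additive": {
        "sounds_path": "/data/musan",                     # placeholder path
        # one sub-dict per noise directory found under sounds_path
        "speech": {"min_snr_in_db": 13, "max_snr_in_db": 20, "min_num_noises": 3, "max_num_noises": 7},
        "noise": {"min_snr_in_db": 0, "max_snr_in_db": 15, "min_num_noises": 1, "max_num_noises": 1},
        "music": {"min_snr_in_db": 5, "max_snr_in_db": 15, "min_num_noises": 1, "max_num_noises": 1},
    },
    # optional gaussian noise, applied inside the dataset rather than by AugmentWAV
    "gaussian": {"p": 0.3, "min_amplitude": 0.0, "max_amplitude": 1e-5},
}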

View File

@ -31,23 +31,23 @@ from absl import logging
SUBSETS = {
"vox1_dev_wav": [
"http://www.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox1_dev_wav_partaa",
"http://www.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox1_dev_wav_partab",
"http://www.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox1_dev_wav_partac",
"http://www.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox1_dev_wav_partad",
"https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox1_dev_wav_partaa",
"https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox1_dev_wav_partab",
"https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox1_dev_wav_partac",
"https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox1_dev_wav_partad",
],
"vox1_test_wav": ["http://www.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox1_test_wav.zip"],
"vox1_test_wav": ["https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox1_test_wav.zip"],
"vox2_dev_aac": [
"http://www.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox2_dev_aac_partaa",
"http://www.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox2_dev_aac_partab",
"http://www.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox2_dev_aac_partac",
"http://www.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox2_dev_aac_partad",
"http://www.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox2_dev_aac_partae",
"http://www.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox2_dev_aac_partaf",
"http://www.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox2_dev_aac_partag",
"http://www.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox2_dev_aac_partah",
"https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox2_dev_aac_partaa",
"https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox2_dev_aac_partab",
"https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox2_dev_aac_partac",
"https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox2_dev_aac_partad",
"https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox2_dev_aac_partae",
"https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox2_dev_aac_partaf",
"https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox2_dev_aac_partag",
"https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox2_dev_aac_partah",
],
"vox2_test_aac": ["http://www.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox2_test_aac.zip"],
"vox2_test_aac": ["https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox2_test_aac.zip"],
}
MD5SUM = {

View File

@ -80,12 +80,12 @@ class CharactersConfig(Coqpit):
):
"""Check config fields"""
c = asdict(self)
check_argument("pad", c, "characters", restricted=True)
check_argument("eos", c, "characters", restricted=True)
check_argument("bos", c, "characters", restricted=True)
check_argument("characters", c, "characters", restricted=True)
check_argument("pad", c, prerequest="characters", restricted=True)
check_argument("eos", c, prerequest="characters", restricted=True)
check_argument("bos", c, prerequest="characters", restricted=True)
check_argument("characters", c, prerequest="characters", restricted=True)
check_argument("phonemes", c, restricted=True)
check_argument("punctuations", c, "characters", restricted=True)
check_argument("punctuations", c, prerequest="characters", restricted=True)
@dataclass

View File

@ -1,163 +0,0 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"This is a noteboook used to generate the speaker embeddings with the AngleProto speaker encoder model for multi-speaker training.\n",
"\n",
"Before running this script please DON'T FORGET: \n",
"- to set file paths.\n",
"- to download related model files from TTS.\n",
"- download or clone related repos, linked below.\n",
"- setup the repositories. ```python setup.py install```\n",
"- to checkout right commit versions (given next to the model) of TTS.\n",
"- to set the right paths in the cell below.\n",
"\n",
"Repository:\n",
"- TTS: https://github.com/mozilla/TTS"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%load_ext autoreload\n",
"%autoreload 2\n",
"import os\n",
"import importlib\n",
"import random\n",
"import librosa\n",
"import torch\n",
"\n",
"import numpy as np\n",
"from tqdm import tqdm\n",
"from TTS.tts.utils.speakers import save_speaker_mapping, load_speaker_mapping\n",
"\n",
"# you may need to change this depending on your system\n",
"os.environ['CUDA_VISIBLE_DEVICES']='0'\n",
"\n",
"\n",
"from TTS.tts.utils.speakers import save_speaker_mapping, load_speaker_mapping\n",
"from TTS.utils.audio import AudioProcessor\n",
"from TTS.utils.io import load_config"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"You should also adjust all the path constants to point at the relevant locations for you locally"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"MODEL_RUN_PATH = \"../../Mozilla-TTS/checkpoints/libritts_100+360-angleproto-June-06-2020_04+12PM-9c04d1f/\"\n",
"MODEL_PATH = MODEL_RUN_PATH + \"best_model.pth.tar\"\n",
"CONFIG_PATH = MODEL_RUN_PATH + \"config.json\"\n",
"\n",
"\n",
"DATASETS_NAME = ['vctk'] # list the datasets\n",
"DATASETS_PATH = ['../../../datasets/VCTK/']\n",
"DATASETS_METAFILE = ['']\n",
"\n",
"USE_CUDA = True"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#Preprocess dataset\n",
"meta_data = []\n",
"for i in range(len(DATASETS_NAME)):\n",
" preprocessor = importlib.import_module('TTS.tts.datasets.preprocess')\n",
" preprocessor = getattr(preprocessor, DATASETS_NAME[i].lower())\n",
" meta_data += preprocessor(DATASETS_PATH[i],DATASETS_METAFILE[i])\n",
" \n",
"meta_data= list(meta_data)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"c = load_config(CONFIG_PATH)\n",
"ap = AudioProcessor(**c['audio'])\n",
"\n",
"model = SpeakerEncoder(**c.model)\n",
"model.load_state_dict(torch.load(MODEL_PATH)['model'])\n",
"model.eval()\n",
"if USE_CUDA:\n",
" model.cuda()\n",
"\n",
"embeddings_dict = {}\n",
"len_meta_data= len(meta_data)\n",
"\n",
"for i in tqdm(range(len_meta_data)):\n",
" _, wav_file, speaker_id = meta_data[i]\n",
" wav_file_name = os.path.basename(wav_file)\n",
" mel_spec = ap.melspectrogram(ap.load_wav(wav_file)).T\n",
" mel_spec = torch.FloatTensor(mel_spec[None, :, :])\n",
" if USE_CUDA:\n",
" mel_spec = mel_spec.cuda()\n",
" embedd = model.compute_embedding(mel_spec).cpu().detach().numpy().reshape(-1)\n",
" embeddings_dict[wav_file_name] = [embedd,speaker_id]\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# create and export speakers.json\n",
"speaker_mapping = {sample: {'name': embeddings_dict[sample][1], 'embedding':embeddings_dict[sample][0].reshape(-1).tolist()} for i, sample in enumerate(embeddings_dict.keys())}\n",
"save_speaker_mapping(MODEL_RUN_PATH, speaker_mapping)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#test load integrity\n",
"speaker_mapping_load = load_speaker_mapping(MODEL_RUN_PATH)\n",
"assert speaker_mapping == speaker_mapping_load\n",
"print(\"The file speakers.json has been exported to \",MODEL_RUN_PATH, ' with ', len(embeddings_dict.keys()), ' speakers')"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 4
}

View File

@ -1,212 +0,0 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"This is a noteboook used to generate the speaker embeddings with the CorentinJ GE2E model trained with Angular Prototypical loss for multi-speaker training.\n",
"\n",
"Before running this script please DON'T FORGET:\n",
"- to set the right paths in the cell below.\n",
"\n",
"Repositories:\n",
"- TTS: https://github.com/coqui/TTS\n",
"- CorentinJ GE2E: https://github.com/Edresson/GE2E-Speaker-Encoder"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"import os\n",
"import importlib\n",
"import random\n",
"import librosa\n",
"import torch\n",
"\n",
"import numpy as np\n",
"from TTS.utils.io import load_config\n",
"from tqdm import tqdm\n",
"from TTS.tts.utils.speakers import save_speaker_mapping, load_speaker_mapping\n",
"\n",
"# you may need to change this depending on your system\n",
"os.environ['CUDA_VISIBLE_DEVICES']='0'"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Clone encoder \n",
"!git clone https://github.com/CorentinJ/Real-Time-Voice-Cloning.git\n",
"os.chdir('Real-Time-Voice-Cloning/')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#Install voxceleb_trainer Requeriments\n",
"!python -m pip install umap-learn visdom webrtcvad librosa>=0.5.1 matplotlib>=2.0.2 numpy>=1.14.0 scipy>=1.0.0 tqdm sounddevice Unidecode inflect multiprocess numba"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#Download encoder Checkpoint\n",
"!wget https://github.com/Edresson/Real-Time-Voice-Cloning/releases/download/checkpoints/pretrained.zip\n",
"!unzip pretrained.zip"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from encoder import inference as encoder\n",
"from encoder.params_model import model_embedding_size as speaker_embedding_size\n",
"from pathlib import Path"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(\"Preparing the encoder, the synthesizer and the vocoder...\")\n",
"encoder.load_model(Path('encoder/saved_models/pretrained.pt'))\n",
"print(\"Testing your configuration with small inputs.\")\n",
"# Forward an audio waveform of zeroes that lasts 1 second. Notice how we can get the encoder's\n",
"# sampling rate, which may differ.\n",
"# If you're unfamiliar with digital audio, know that it is encoded as an array of floats \n",
"# (or sometimes integers, but mostly floats in this projects) ranging from -1 to 1.\n",
"# The sampling rate is the number of values (samples) recorded per second, it is set to\n",
"# 16000 for the encoder. Creating an array of length <sampling_rate> will always correspond \n",
"# to an audio of 1 second.\n",
"print(\"\\tTesting the encoder...\")\n",
"\n",
"wav = np.zeros(encoder.sampling_rate) \n",
"embed = encoder.embed_utterance(wav)\n",
"print(embed.shape)\n",
"\n",
"# Embeddings are L2-normalized (this isn't important here, but if you want to make your own \n",
"# embeddings it will be).\n",
"#embed /= np.linalg.norm(embed) # for random embedding\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"SAVE_PATH = '../'"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Set constants\n",
"DATASETS_NAME = ['vctk'] # list the datasets\n",
"DATASETS_PATH = ['../../../../../datasets/VCTK-Corpus-removed-silence/']\n",
"DATASETS_METAFILE = ['']\n",
"USE_CUDA = True"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#Preprocess dataset\n",
"meta_data = []\n",
"for i in range(len(DATASETS_NAME)):\n",
" preprocessor = importlib.import_module('TTS.tts.datasets.preprocess')\n",
" preprocessor = getattr(preprocessor, DATASETS_NAME[i].lower())\n",
" meta_data += preprocessor(DATASETS_PATH[i],DATASETS_METAFILE[i])\n",
" \n",
"meta_data= list(meta_data)\n",
"\n",
"meta_data = meta_data\n",
"embeddings_dict = {}\n",
"len_meta_data= len(meta_data)\n",
"for i in tqdm(range(len_meta_data)):\n",
" _, wave_file_path, speaker_id = meta_data[i]\n",
" wav_file_name = os.path.basename(wave_file_path)\n",
" # Extract Embedding\n",
" preprocessed_wav = encoder.preprocess_wav(wave_file_path)\n",
" file_embedding = encoder.embed_utterance(preprocessed_wav)\n",
" embeddings_dict[wav_file_name] = [file_embedding.reshape(-1).tolist(), speaker_id]\n",
" del file_embedding"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# create and export speakers.json and aplly a L2_norm in embedding\n",
"speaker_mapping = {sample: {'name': embeddings_dict[sample][1], 'embedding':embeddings_dict[sample][0]} for i, sample in enumerate(embeddings_dict.keys())}\n",
"save_speaker_mapping(SAVE_PATH, speaker_mapping)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#test load integrity\n",
"speaker_mapping_load = load_speaker_mapping(SAVE_PATH)\n",
"assert speaker_mapping == speaker_mapping_load\n",
"print(\"The file speakers.json has been exported to \",SAVE_PATH, ' with ', len(embeddings_dict.keys()), ' samples')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.5"
}
},
"nbformat": 4,
"nbformat_minor": 4
}

View File

@ -1,163 +0,0 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"This is a noteboook used to generate the speaker embeddings with the GE2E speaker encoder model for multi-speaker training.\n",
"\n",
"Before running this script please DON'T FORGET: \n",
"- to set file paths.\n",
"- to download related model files from TTS.\n",
"- download or clone related repos, linked below.\n",
"- setup the repositories. ```python setup.py install```\n",
"- to checkout right commit versions (given next to the model) of TTS.\n",
"- to set the right paths in the cell below.\n",
"\n",
"Repository:\n",
"- TTS: https://github.com/coqui/TTS"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%load_ext autoreload\n",
"%autoreload 2\n",
"import os\n",
"import importlib\n",
"import random\n",
"import librosa\n",
"import torch\n",
"\n",
"import numpy as np\n",
"from tqdm import tqdm\n",
"from TTS.tts.utils.speakers import save_speaker_mapping, load_speaker_mapping\n",
"\n",
"# you may need to change this depending on your system\n",
"os.environ['CUDA_VISIBLE_DEVICES']='0'\n",
"\n",
"\n",
"from TTS.tts.utils.speakers import save_speaker_mapping, load_speaker_mapping\n",
"from TTS.utils.audio import AudioProcessor\n",
"from TTS.utils.io import load_config"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"You should also adjust all the path constants to point at the relevant locations for you locally"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"MODEL_RUN_PATH = \"../../Coqui-TTS/checkpoints/libritts_360-half-September-28-2019_10+46AM-8565c50-20200323T115637Z-001/\"\n",
"MODEL_PATH = MODEL_RUN_PATH + \"best_model.pth.tar\"\n",
"CONFIG_PATH = MODEL_RUN_PATH + \"config.json\"\n",
"\n",
"\n",
"DATASETS_NAME = ['vctk'] # list the datasets\n",
"DATASETS_PATH = ['../../../datasets/VCTK/']\n",
"DATASETS_METAFILE = ['']\n",
"\n",
"USE_CUDA = True"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#Preprocess dataset\n",
"meta_data = []\n",
"for i in range(len(DATASETS_NAME)):\n",
" preprocessor = importlib.import_module('TTS.datasets.preprocess')\n",
" preprocessor = getattr(preprocessor, DATASETS_NAME[i].lower())\n",
" meta_data += preprocessor(DATASETS_PATH[i],DATASETS_METAFILE[i])\n",
" \n",
"meta_data= list(meta_data)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"c = load_config(CONFIG_PATH)\n",
"ap = AudioProcessor(**c['audio'])\n",
"\n",
"model = SpeakerEncoder(**c.model)\n",
"model.load_state_dict(torch.load(MODEL_PATH)['model'])\n",
"model.eval()\n",
"if USE_CUDA:\n",
" model.cuda()\n",
"\n",
"embeddings_dict = {}\n",
"len_meta_data= len(meta_data)\n",
"\n",
"for i in tqdm(range(len_meta_data)):\n",
" _, wav_file, speaker_id = meta_data[i]\n",
" wav_file_name = os.path.basename(wav_file)\n",
" mel_spec = ap.melspectrogram(ap.load_wav(wav_file)).T\n",
" mel_spec = torch.FloatTensor(mel_spec[None, :, :])\n",
" if USE_CUDA:\n",
" mel_spec = mel_spec.cuda()\n",
" embedd = model.compute_embedding(mel_spec).cpu().detach().numpy().reshape(-1)\n",
" embeddings_dict[wav_file_name] = [embedd,speaker_id]\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# create and export speakers.json\n",
"speaker_mapping = {sample: {'name': embeddings_dict[sample][1], 'embedding':embeddings_dict[sample][0].reshape(-1).tolist()} for i, sample in enumerate(embeddings_dict.keys())}\n",
"save_speaker_mapping(MODEL_RUN_PATH, speaker_mapping)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#test load integrity\n",
"speaker_mapping_load = load_speaker_mapping(MODEL_RUN_PATH)\n",
"assert speaker_mapping == speaker_mapping_load\n",
"print(\"The file speakers.json has been exported to \",MODEL_RUN_PATH, ' with ', len(embeddings_dict.keys()), ' speakers')"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.5"
}
},
"nbformat": 4,
"nbformat_minor": 4
}

View File

@ -46,6 +46,7 @@
"batch_size": 32,
"output_path": "", // DATASET-RELATED: output path for all training outputs.
"model_params": {
"model_name": "lstm",
"input_dim": 40,
"proj_dim": 256,
"lstm_dim": 768,
@ -54,8 +55,7 @@
},
"storage": {
"sample_from_storage_p": 0.66, // the probability with which we'll sample from the DataSet in-memory storage
"storage_size": 15, // the size of the in-memory storage with respect to a single batch
"additive_noise": 1e-5 // add very small gaussian noise to the data in order to increase robustness
"storage_size": 15 // the size of the in-memory storage with respect to a single batch
},
"datasets":null
}

View File

@ -3,18 +3,18 @@ import unittest
import torch as T
from tests import get_tests_input_path
from TTS.speaker_encoder.losses import AngleProtoLoss, GE2ELoss
from TTS.speaker_encoder.model import SpeakerEncoder
from TTS.speaker_encoder.losses import AngleProtoLoss, GE2ELoss, SoftmaxAngleProtoLoss
from TTS.speaker_encoder.models.lstm import LSTMSpeakerEncoder
from TTS.speaker_encoder.models.resnet import ResNetSpeakerEncoder
file_path = get_tests_input_path()
class SpeakerEncoderTests(unittest.TestCase):
class LSTMSpeakerEncoderTests(unittest.TestCase):
# pylint: disable=R0201
def test_in_out(self):
dummy_input = T.rand(4, 20, 80) # B x T x D
dummy_hidden = [T.rand(2, 4, 128), T.rand(2, 4, 128)]
model = SpeakerEncoder(input_dim=80, proj_dim=256, lstm_dim=768, num_lstm_layers=3)
model = LSTMSpeakerEncoder(input_dim=80, proj_dim=256, lstm_dim=768, num_lstm_layers=3)
# computing d vectors
output = model.forward(dummy_input)
assert output.shape[0] == 4
@ -39,6 +39,31 @@ class SpeakerEncoderTests(unittest.TestCase):
assert output.shape[1] == 256
assert len(output.shape) == 2
class ResNetSpeakerEncoderTests(unittest.TestCase):
# pylint: disable=R0201
def test_in_out(self):
dummy_input = T.rand(4, 20, 80) # B x T x D
dummy_hidden = [T.rand(2, 4, 128), T.rand(2, 4, 128)]
model = ResNetSpeakerEncoder(input_dim=80, proj_dim=256)
# computing d vectors
output = model.forward(dummy_input)
assert output.shape[0] == 4
assert output.shape[1] == 256
output = model.forward(dummy_input, l2_norm=True)
assert output.shape[0] == 4
assert output.shape[1] == 256
# check normalization
output_norm = T.nn.functional.normalize(output, dim=1, p=2)
assert_diff = (output_norm - output).sum().item()
assert output.type() == "torch.FloatTensor"
assert abs(assert_diff) < 1e-4, f" [!] output_norm has wrong values - {assert_diff}"
# compute d for a given batch
dummy_input = T.rand(1, 240, 80) # B x T x D
output = model.compute_embedding(dummy_input, num_frames=160, num_eval=10)
assert output.shape[0] == 1
assert output.shape[1] == 256
assert len(output.shape) == 2
class GE2ELossTests(unittest.TestCase):
# pylint: disable=R0201
@ -67,7 +92,6 @@ class GE2ELossTests(unittest.TestCase):
output = loss.forward(dummy_input)
assert output.item() < 0.005
class AngleProtoLossTests(unittest.TestCase):
# pylint: disable=R0201
def test_in_out(self):
@ -97,16 +121,23 @@ class AngleProtoLossTests(unittest.TestCase):
output = loss.forward(dummy_input)
assert output.item() < 0.005
class SoftmaxAngleProtoLossTests(unittest.TestCase):
# pylint: disable=R0201
def test_in_out(self):
# class LoaderTest(unittest.TestCase):
# def test_output(self):
# items = libri_tts("/home/erogol/Data/Libri-TTS/train-clean-360/")
# ap = AudioProcessor(**c['audio'])
# dataset = MyDataset(ap, items, 1.6, 64, 10)
# loader = DataLoader(dataset, batch_size=32, shuffle=False, num_workers=0, collate_fn=dataset.collate_fn)
# count = 0
# for mel, spk in loader:
# print(mel.shape)
# if count == 4:
# break
# count += 1
embedding_dim = 64
num_speakers = 5
batch_size = 4
dummy_label = T.randint(low=0, high=num_speakers, size=(batch_size, num_speakers))
# check random input
dummy_input = T.rand(batch_size, num_speakers, embedding_dim) # num_speaker x num_utterance x dim
loss = SoftmaxAngleProtoLoss(embedding_dim=embedding_dim, n_speakers=num_speakers)
output = loss.forward(dummy_input, dummy_label)
assert output.item() >= 0.0
# check all zeros
dummy_input = T.ones(batch_size, num_speakers, embedding_dim) # num_speaker x num_utterance x dim
loss = SoftmaxAngleProtoLoss(embedding_dim=embedding_dim, n_speakers=num_speakers)
output = loss.forward(dummy_input, dummy_label)
assert output.item() >= 0.0

View File

@ -9,7 +9,6 @@ from TTS.speaker_encoder.speaker_encoder_config import SpeakerEncoderConfig
config_path = os.path.join(get_tests_output_path(), "test_model_config.json")
output_path = os.path.join(get_tests_output_path(), "train_outputs")
config = SpeakerEncoderConfig(
batch_size=4,
num_speakers_in_batch=1,
@ -19,7 +18,7 @@ config = SpeakerEncoderConfig(
print_step=1,
save_step=1,
print_eval=True,
audio=BaseAudioConfig(num_mels=40),
audio=BaseAudioConfig(num_mels=80),
)
config.audio.do_trim_silence = True
config.audio.trim_db = 60
@ -45,3 +44,28 @@ command_train = (
)
run_cli(command_train)
shutil.rmtree(continue_path)
# test resnet speaker encoder
config.model_params['model_name'] = "resnet"
config.save_json(config_path)
# train the model for one epoch
command_train = (
f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_encoder.py --config_path {config_path} "
f"--coqpit.output_path {output_path} "
"--coqpit.datasets.0.name ljspeech "
"--coqpit.datasets.0.meta_file_train metadata.csv "
"--coqpit.datasets.0.meta_file_val metadata.csv "
"--coqpit.datasets.0.path tests/data/ljspeech "
)
run_cli(command_train)
# Find latest folder
continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime)
# restore the model and continue training for one more epoch
command_train = (
f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_encoder.py --continue_path {continue_path} "
)
run_cli(command_train)
shutil.rmtree(continue_path)

View File

@ -6,7 +6,7 @@ import torch
from tests import get_tests_input_path
from TTS.config import load_config
from TTS.speaker_encoder.model import SpeakerEncoder
from TTS.speaker_encoder.utils.generic_utils import setup_model
from TTS.speaker_encoder.utils.io import save_checkpoint
from TTS.tts.utils.speakers import SpeakerManager
from TTS.utils.audio import AudioProcessor
@ -28,7 +28,7 @@ class SpeakerManagerTest(unittest.TestCase):
config.audio.resample = True
# create a dummy speaker encoder
model = SpeakerEncoder(**config.model_params)
model = setup_model(config)
save_checkpoint(model, None, None, get_tests_input_path(), 0)
# load audio processor and speaker encoder