diff --git a/TTS/bin/train_encoder.py b/TTS/bin/train_encoder.py
index 56a2b954..dae3ebac 100644
--- a/TTS/bin/train_encoder.py
+++ b/TTS/bin/train_encoder.py
@@ -46,6 +46,7 @@ def setup_loader(ap, is_val=False, verbose=False):
                             skip_speakers=False,
                             storage_size=c.storage["storage_size"],
                             sample_from_storage_p=c.storage["sample_from_storage_p"],
+                            additive_noise=c.storage["additive_noise"],
                             verbose=verbose)
         # sampler = DistributedSampler(dataset) if num_gpus > 1 else None
         loader = DataLoader(dataset,
diff --git a/TTS/speaker_encoder/config.json b/TTS/speaker_encoder/config.json
index f350779d..d7c959cf 100644
--- a/TTS/speaker_encoder/config.json
+++ b/TTS/speaker_encoder/config.json
@@ -27,7 +27,7 @@
         "trim_db": 60 // threshold for timming silence. Set this according to your dataset.
     },
     "reinit_layers": [],
-    "loss": "ge2e", // "ge2e" to use Generalized End-to-End loss and "angleproto" to use Angular Prototypical loss (new SOTA)
+    "loss": "angleproto", // "ge2e" to use Generalized End-to-End loss and "angleproto" to use Angular Prototypical loss (new SOTA)
     "grad_clip": 3.0, // upper limit for gradients for clipping.
     "epochs": 1000, // total number of epochs to train.
     "lr": 0.0001, // Initial learning rate. If Noam decay is active, maximum learning rate.
@@ -35,12 +35,12 @@
     "warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr"
     "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging.
     "steps_plot_stats": 10, // number of steps to plot embeddings.
-    "num_speakers_in_batch": 32, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'.
+    "num_speakers_in_batch": 64, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'.
     "num_loader_workers": 4, // number of training data loader processes. Don't set it too big. 4-8 are good values.
     "wd": 0.000001, // Weight decay weight.
     "checkpoint": true, // If true, it saves checkpoints per "save_step"
     "save_step": 1000, // Number of training steps expected to save traning stats and checkpoints.
-    "print_step": 1, // Number of steps to log traning on console.
+    "print_step": 20, // Number of steps to log traning on console.
     "output_path": "../../MozillaTTSOutput/checkpoints/voxceleb_librispeech/speaker_encoder/", // DATASET-RELATED: output path for all training outputs.
     "model": {
         "input_dim": 40,
@@ -51,7 +51,8 @@
     },
     "storage": {
         "sample_from_storage_p": 0.42, // the probability with which we'll sample from the DataSet in-memory storage
-        "storage_size": 5 // the size of the in-memory storage with respect to a single batch
+        "storage_size": 5, // the size of the in-memory storage with respect to a single batch
+        "additive_noise": 1e-5 // add very small gaussian noise to the data in order to increase robustness
     },

     "datasets": [
@@ -60,42 +61,42 @@
             "path": "../../audio-datasets/en/VCTK-Corpus/",
             "meta_file_train": null,
             "meta_file_val": null
+        },
+        {
+            "name": "libri_tts",
+            "path": "../../audio-datasets/en/LibriTTS/train-clean-100",
+            "meta_file_train": null,
+            "meta_file_val": null
+        },
+        {
+            "name": "libri_tts",
+            "path": "../../audio-datasets/en/LibriTTS/train-clean-360",
+            "meta_file_train": null,
+            "meta_file_val": null
+        },
+        {
+            "name": "libri_tts",
+            "path": "../../audio-datasets/en/LibriTTS/train-other-500",
+            "meta_file_train": null,
+            "meta_file_val": null
+        },
+        {
+            "name": "voxceleb1",
+            "path": "../../audio-datasets/en/voxceleb1/",
+            "meta_file_train": null,
+            "meta_file_val": null
+        },
+        {
+            "name": "voxceleb2",
+            "path": "../../audio-datasets/en/voxceleb2/",
+            "meta_file_train": null,
+            "meta_file_val": null
+        },
+        {
+            "name": "common_voice_wav",
+            "path": "../../audio-datasets/en/MozillaCommonVoice",
+            "meta_file_train": "train.tsv",
+            "meta_file_val": "test.tsv"
+        }
-//        {
-//            "name": "libri_tts",
-//            "path": "../../audio-datasets/en/LibriTTS/train-clean-100",
-//            "meta_file_train": null,
-//            "meta_file_val": null
-//        },
-//        {
-//            "name": "libri_tts",
-//            "path": "../../audio-datasets/en/LibriTTS/train-clean-360",
-//            "meta_file_train": null,
-//            "meta_file_val": null
-//        },
-//        {
-//            "name": "libri_tts",
-//            "path": "../../audio-datasets/en/LibriTTS/train-other-500",
-//            "meta_file_train": null,
-//            "meta_file_val": null
-//        },
-//        {
-//            "name": "voxceleb1",
-//            "path": "../../audio-datasets/en/voxceleb1/",
-//            "meta_file_train": null,
-//            "meta_file_val": null
-//        },
-//        {
-//            "name": "voxceleb2",
-//            "path": "../../audio-datasets/en/voxceleb2/",
-//            "meta_file_train": null,
-//            "meta_file_val": null
-//        },
-//        {
-//            "name": "common_voice_wav",
-//            "path": "../../audio-datasets/en/MozillaCommonVoice",
-//            "meta_file_train": "train.tsv",
-//            "meta_file_val": "test.tsv"
-//        }
     ]
 }
\ No newline at end of file
diff --git a/TTS/speaker_encoder/dataset.py b/TTS/speaker_encoder/dataset.py
index 3f3db88d..05709080 100644
--- a/TTS/speaker_encoder/dataset.py
+++ b/TTS/speaker_encoder/dataset.py
@@ -1,3 +1,4 @@
+import numpy
 import numpy as np
 import queue
 import torch
@@ -8,7 +9,7 @@ from tqdm import tqdm

 class MyDataset(Dataset):
     def __init__(self, ap, meta_data, voice_len=1.6, num_speakers_in_batch=64,
-                 storage_size=1, sample_from_storage_p=0.5,
+                 storage_size=1, sample_from_storage_p=0.5, additive_noise=0,
                  num_utter_per_speaker=10, skip_speakers=False, verbose=False):
         """
         Args:
@@ -29,6 +30,7 @@ class MyDataset(Dataset):
         self.__parse_items()
         self.storage = queue.Queue(maxsize=storage_size*num_speakers_in_batch)
         self.sample_from_storage_p = float(sample_from_storage_p)
+        self.additive_noise = float(additive_noise)
         if self.verbose:
             print("\n > DataLoader initialization")
             print(f" | > Storage Size: {self.storage.maxsize} speakers, each with {num_utter_per_speaker} utters")
@@ -150,6 +152,11 @@ class MyDataset(Dataset):
                 # put the newly loaded item into storage
                 self.storage.put_nowait((wavs_, labels_))

+            # add random gaussian noise
+            if self.additive_noise > 0:
+                noises_ = [numpy.random.normal(0, self.additive_noise, size=len(w)) for w in wavs_]
+                wavs_ = [wavs_[i] + noises_[i] for i in range(len(wavs_))]
+
             # get a random subset of each of the wavs and convert to MFCC.
             offsets_ = [random.randint(0, wav.shape[0] - self.seq_len) for wav in wavs_]
             mels_ = [self.ap.melspectrogram(wavs_[i][offsets_[i]: offsets_[i] + self.seq_len]) for i in range(len(wavs_))]
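# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the patch): what the new
# "storage.additive_noise" option does. Zero-mean Gaussian noise with a very
# small standard deviation (1e-5 in the config above) is added to each raw
# waveform before mel spectrogram extraction, as a cheap robustness
# augmentation. The function name and example data below are hypothetical.
# ---------------------------------------------------------------------------
import numpy as np

def apply_additive_noise(wavs, additive_noise=1e-5):
    """Add zero-mean Gaussian noise to each waveform, as in the diff above."""
    if additive_noise <= 0:
        return wavs
    return [w + np.random.normal(0.0, additive_noise, size=len(w)) for w in wavs]

# Example usage with three fake 1-second waveforms at 16 kHz.
wavs = [np.random.uniform(-0.1, 0.1, size=16000) for _ in range(3)]
noisy_wavs = apply_additive_noise(wavs, additive_noise=1e-5)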
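# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the patch): how the in-memory storage that
# "sample_from_storage_p" and "storage_size" configure behaves. The dataset
# itself uses a queue.Queue sized storage_size * num_speakers_in_batch (see
# the dataset.py hunk above); the deque-based helper below is a simplified,
# hypothetical stand-in, not the project's exact implementation.
# ---------------------------------------------------------------------------
import random
from collections import deque

storage_size = 5               # "storage_size" from the config above
num_speakers_in_batch = 64     # "num_speakers_in_batch" from the config above
sample_from_storage_p = 0.42   # "sample_from_storage_p" from the config above

storage = deque(maxlen=storage_size * num_speakers_in_batch)

def get_utterances(load_from_disk):
    """Return one speaker's utterances, reusing stored ones with some probability."""
    if len(storage) == storage.maxlen and random.random() < sample_from_storage_p:
        # reuse utterances loaded earlier instead of reading from disk again
        return random.choice(storage)
    item = load_from_disk()
    storage.append(item)       # when full, the oldest entry is dropped automatically
    return item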