diff --git a/TTS/bin/train_encoder.py b/TTS/bin/train_encoder.py
index dae3ebac..3222c278 100644
--- a/TTS/bin/train_encoder.py
+++ b/TTS/bin/train_encoder.py
@@ -42,7 +42,8 @@ def setup_loader(ap, is_val=False, verbose=False):
         dataset = MyDataset(ap,
                             meta_data_eval if is_val else meta_data_train,
                             voice_len=1.6,
-                            num_utter_per_speaker=10,
+                            num_utter_per_speaker=c.num_utters_per_speaker,
+                            num_speakers_in_batch=c.num_speakers_in_batch,
                             skip_speakers=False,
                             storage_size=c.storage["storage_size"],
                             sample_from_storage_p=c.storage["sample_from_storage_p"],
@@ -98,11 +99,10 @@ def train(model, criterion, optimizer, scheduler, ap, global_step):
         epoch_time += step_time

         # Averaged Loss and Averaged Loader Time
-        dataset_number_prefetched = 2 * c.num_loader_workers  # this is hardcoded in pytorch
         avg_loss = 0.01 * loss.item() \
             + 0.99 * avg_loss if avg_loss != 0 else loss.item()
-        avg_loader_time = 1/dataset_number_prefetched * loader_time\
-            + (dataset_number_prefetched-1) / dataset_number_prefetched * avg_loader_time if avg_loader_time != 0 else loader_time
+        avg_loader_time = 1/c.num_loader_workers * loader_time + \
+            (c.num_loader_workers-1) / c.num_loader_workers * avg_loader_time if avg_loader_time != 0 else loader_time
         current_lr = optimizer.param_groups[0]['lr']

         if global_step % c.steps_plot_stats == 0:
diff --git a/TTS/speaker_encoder/config.json b/TTS/speaker_encoder/config.json
index d7c959cf..332f58bb 100644
--- a/TTS/speaker_encoder/config.json
+++ b/TTS/speaker_encoder/config.json
@@ -36,7 +36,8 @@
     "tb_model_param_stats": false,     // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging.
     "steps_plot_stats": 10,    // number of steps to plot embeddings.
     "num_speakers_in_batch": 64,    // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'.
-    "num_loader_workers": 4,    // number of training data loader processes. Don't set it too big. 4-8 are good values.
+    "num_utters_per_speaker": 10,    // number of utterances sampled per speaker in a batch.
+    "num_loader_workers": 8,    // number of training data loader processes. Don't set it too big. 4-8 are good values.
     "wd": 0.000001,    // Weight decay weight.
     "checkpoint": true,    // If true, it saves checkpoints per "save_step"
     "save_step": 1000,    // Number of training steps expected to save training stats and checkpoints.
@@ -50,8 +51,8 @@
         "use_lstm_with_projection": true
     },
     "storage": {
-        "sample_from_storage_p": 0.42, // the probability with which we'll sample from the DataSet in-memory storage
-        "storage_size": 5, // the size of the in-memory storage with respect to a single batch
+        "sample_from_storage_p": 0.66, // the probability with which we'll sample from the DataSet in-memory storage
+        "storage_size": 15, // the size of the in-memory storage with respect to a single batch
         "additive_noise": 1e-5 // add very small gaussian noise to the data in order to increase robustness
     },
     "datasets":
diff --git a/TTS/speaker_encoder/dataset.py b/TTS/speaker_encoder/dataset.py
index 05709080..38757ce9 100644
--- a/TTS/speaker_encoder/dataset.py
+++ b/TTS/speaker_encoder/dataset.py
@@ -33,8 +33,10 @@ class MyDataset(Dataset):
         self.additive_noise = float(additive_noise)
         if self.verbose:
             print("\n > DataLoader initialization")
+            print(f" | > Speakers per Batch: {num_speakers_in_batch}")
             print(f" | > Storage Size: {self.storage.maxsize} speakers, each with {num_utter_per_speaker} utters")
             print(f" | > Sample_from_storage_p : {self.sample_from_storage_p}")
+            print(f" | > Noise added : {self.additive_noise}")
             print(f" | > Number of instances : {len(self.items)}")
             print(f" | > Sequence length: {self.seq_len}")
             print(f" | > Num speakers: {len(self.speakers)}")
diff --git a/TTS/tts/utils/generic_utils.py b/TTS/tts/utils/generic_utils.py
index 6eaa2358..6358e5a9 100644
--- a/TTS/tts/utils/generic_utils.py
+++ b/TTS/tts/utils/generic_utils.py
@@ -7,11 +7,9 @@ from TTS.utils.generic_utils import check_argument


 def split_dataset(items):
-    is_multi_speaker = False
     speakers = [item[-1] for item in items]
     is_multi_speaker = len(set(speakers)) > 1
-    eval_split_size = 500 if len(items) * 0.01 > 500 else int(
-        len(items) * 0.01)
+    eval_split_size = min(500, int(len(items) * 0.01))
     assert eval_split_size > 0, " [!] You do not have enough samples to train. You need at least 100 samples."
     np.random.seed(0)
     np.random.shuffle(items)
@@ -142,6 +140,11 @@ def check_config(c):
     check_argument('do_trim_silence', c['audio'], restricted=True, val_type=bool)
     check_argument('trim_db', c['audio'], restricted=True, val_type=int)

+    # storage parameters
+    check_argument('sample_from_storage_p', c['storage'], restricted=True, val_type=float, min_val=0.0, max_val=1.0)
+    check_argument('storage_size', c['storage'], restricted=True, val_type=int, min_val=1, max_val=100)
+    check_argument('additive_noise', c['storage'], restricted=True, val_type=float, min_val=0.0, max_val=1.0)
+
     # training parameters
     check_argument('batch_size', c, restricted=True, val_type=int, min_val=1)
     check_argument('eval_batch_size', c, restricted=True, val_type=int, min_val=1)
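
Note on the running-average changes in train_encoder.py: both avg_loss and avg_loader_time are exponential moving averages seeded by the first observed value, and the patch ties the loader-time window to num_loader_workers instead of the previously assumed prefetch depth of 2 * num_loader_workers. A minimal standalone sketch of the shared update rule (the helper name update_running_average is hypothetical, not part of the patch):

    def update_running_average(avg, new_value, window):
        # Exponential moving average over an effective window of `window`
        # samples; the first observation seeds the average directly.
        if avg == 0:
            return new_value
        return new_value / window + avg * (window - 1) / window

With window=100 this reproduces the 0.01/0.99 weighting used for avg_loss; with window=c.num_loader_workers it matches the new avg_loader_time update.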
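Similarly, the simplified eval split in split_dataset() takes 1% of the dataset, capped at 500 items; the assert rejects datasets under 100 items, where int(len(items) * 0.01) truncates to zero. A quick check of the boundary behaviour, assuming nothing beyond the expression itself:

    for n in (99, 100, 20000, 80000):
        print(n, min(500, int(n * 0.01)))  # -> 0, 1, 200, 500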