diff --git a/TTS/bin/train_encoder.py b/TTS/bin/train_encoder.py
index 7e9f662f..a4191dfb 100644
--- a/TTS/bin/train_encoder.py
+++ b/TTS/bin/train_encoder.py
@@ -12,7 +12,7 @@ from torch.utils.data import DataLoader
 
 from TTS.speaker_encoder.dataset import MyDataset
 from TTS.speaker_encoder.losses import AngleProtoLoss, GE2ELoss, SoftmaxAngleProtoLoss
-from TTS.speaker_encoder.utils.generic_utils import check_config_speaker_encoder, save_best_model, setup_model
+from TTS.speaker_encoder.utils.generic_utils import save_best_model, setup_model
 from TTS.speaker_encoder.utils.visual import plot_embeddings
 from TTS.tts.datasets.preprocess import load_meta_data
 
@@ -38,15 +38,16 @@ def setup_loader(ap: AudioProcessor, is_val: bool = False, verbose: bool = False
     dataset = MyDataset(
         ap,
         meta_data_eval if is_val else meta_data_train,
-        voice_len=getattr(c, "voice_len", 1.6),
+        voice_len=c.voice_len,
         num_utter_per_speaker=c.num_utters_per_speaker,
         num_speakers_in_batch=c.num_speakers_in_batch,
-        skip_speakers=getattr(c, "skip_speakers", False),
+        skip_speakers=c.skip_speakers,
         storage_size=c.storage["storage_size"],
         sample_from_storage_p=c.storage["sample_from_storage_p"],
         verbose=verbose,
-        augmentation_config=getattr(c, "audio_augmentation", None)
+        augmentation_config=c.audio_augmentation
     )
+    # sampler = DistributedSampler(dataset) if num_gpus > 1 else None
 
     loader = DataLoader(
         dataset,
@@ -133,17 +134,15 @@ def train(model, optimizer, scheduler, criterion, data_loader, global_step):
             )
         avg_loss_all += avg_loss
 
-        if global_step % c.save_step == 0:
-            # save best model
+        if global_step >= c.max_train_step or global_step % c.save_step == 0:
+            # save best model only
             best_loss = save_best_model(model, optimizer, criterion, avg_loss, best_loss, OUT_PATH, global_step)
             avg_loss_all = 0
 
-        end_time = time.time()
-
-        # checkpoint and check stop train cond.
-        if global_step >= c.max_train_step or global_step % c.save_step == 0:
-            save_checkpoint(model, optimizer, avg_loss, OUT_PATH, global_step)
             if global_step >= c.max_train_step:
                 break
+
+        end_time = time.time()
+
 
     return avg_loss, global_step
diff --git a/TTS/config/shared_configs.py b/TTS/config/shared_configs.py
index 69f1ee31..a7976db7 100644
--- a/TTS/config/shared_configs.py
+++ b/TTS/config/shared_configs.py
@@ -226,7 +226,7 @@ class BaseTrainingConfig(Coqpit):
     run_description: str = ""
     # training params
     epochs: int = 10000
-    batch_size: int = MISSING
+    batch_size: int = None
     eval_batch_size: int = None
     mixed_precision: bool = False
     # eval params
diff --git a/TTS/speaker_encoder/configs/config_resnet_angleproto.json b/TTS/speaker_encoder/configs/config_resnet_angleproto.json
index 7cae1b25..95cf5ccf 100644
--- a/TTS/speaker_encoder/configs/config_resnet_angleproto.json
+++ b/TTS/speaker_encoder/configs/config_resnet_angleproto.json
@@ -1,5 +1,5 @@
 {
-    "model_name": "resnet",
+    "model": "speaker_encoder",
     "run_name": "speaker_encoder",
     "run_description": "resnet speaker encoder trained with commonvoice all languages dev and train, Voxceleb 1 dev and Voxceleb 2 dev",
     // AUDIO PARAMETERS
@@ -34,7 +34,7 @@
     "loss": "angleproto", // "ge2e" to use Generalized End-to-End loss, "angleproto" to use Angular Prototypical loss and "softmaxproto" to use Softmax with Angular Prototypical loss
     "grad_clip": 3.0, // upper limit for gradients for clipping.
-    "epochs": 1000, // total number of epochs to train.
+    "max_train_step": 1000000, // total number of steps to train.
     "lr": 0.0001, // Initial learning rate. If Noam decay is active, maximum learning rate.
"lr_decay": false, // if true, Noam learning rate decaying is applied through training. "warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr" @@ -45,15 +45,14 @@ "num_speakers_in_batch": 200, // Batch size for training. "num_utters_per_speaker": 2, // "skip_speakers": true, // skip speakers with samples less than "num_utters_per_speaker" - "voice_len": 2, // number of seconds for each training instance - "num_loader_workers": 8, // number of training data loader processes. Don't set it too big. 4-8 are good values. + "num_loader_workers": 4, // number of training data loader processes. Don't set it too big. 4-8 are good values. "wd": 0.000001, // Weight decay weight. "checkpoint": true, // If true, it saves checkpoints per "save_step" "save_step": 1000, // Number of training steps expected to save the best checkpoints in training. "print_step": 50, // Number of steps to log traning on console. - "output_path": "../../../checkpoints/speaker_encoder/resnet_voxceleb1_and_voxceleb2-and-common-voice-all-using-angleproto/", // DATASET-RELATED: output path for all training outputs. + "output_path": "../checkpoints/speaker_encoder/resnet_voxceleb1_and_voxceleb2-and-common-voice-all-using-angleproto-continue/", // DATASET-RELATED: output path for all training outputs. "audio_augmentation": { "p": 0.5, // propability of apply this method, 0 is disable rir and additive noise augmentation @@ -90,12 +89,13 @@ "max_amplitude": 1e-5 } }, - "model": { + "model_params": { + "model_name": "resnet", "input_dim": 80, "proj_dim": 512 }, "storage": { - "sample_from_storage_p": 0.66, // the probability with which we'll sample from the DataSet in-memory storage + "sample_from_storage_p": 0.5, // the probability with which we'll sample from the DataSet in-memory storage "storage_size": 35 // the size of the in-memory storage with respect to a single batch }, "datasets": diff --git a/TTS/speaker_encoder/configs/config_resnet_softmax_angleproto.json b/TTS/speaker_encoder/configs/config_resnet_softmax_angleproto.json index 4baca49a..ccbd751a 100644 --- a/TTS/speaker_encoder/configs/config_resnet_softmax_angleproto.json +++ b/TTS/speaker_encoder/configs/config_resnet_softmax_angleproto.json @@ -1,6 +1,6 @@ { - "model_name": "resnet", + "model": "speaker_encoder", "run_name": "speaker_encoder", "run_description": "resnet speaker encoder trained with commonvoice all languages dev and train, Voxceleb 1 dev and Voxceleb 2 dev", // AUDIO PARAMETERS @@ -35,7 +35,7 @@ "loss": "softmaxproto", // "ge2e" to use Generalized End-to-End loss, "angleproto" to use Angular Prototypical loss and "softmaxproto" to use Softmax with Angular Prototypical loss "grad_clip": 3.0, // upper limit for gradients for clipping. - "epochs": 1000, // total number of epochs to train. + "max_train_step": 1000000, // total number of steps to train. "lr": 0.0001, // Initial learning rate. If Noam decay is active, maximum learning rate. "lr_decay": false, // if true, Noam learning rate decaying is applied through training. "warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr" @@ -46,7 +46,6 @@ "num_speakers_in_batch": 200, // Batch size for training. "num_utters_per_speaker": 2, // "skip_speakers": true, // skip speakers with samples less than "num_utters_per_speaker" - "voice_len": 2, // number of seconds for each training instance "num_loader_workers": 8, // number of training data loader processes. Don't set it too big. 4-8 are good values. 
@@ -91,7 +90,8 @@
             "max_amplitude": 1e-5
         }
     },
-    "model": {
+    "model_params": {
+        "model_name": "resnet",
         "input_dim": 80,
         "proj_dim": 512
     },
diff --git a/TTS/speaker_encoder/dataset.py b/TTS/speaker_encoder/dataset.py
index 0b673753..45a7bc12 100644
--- a/TTS/speaker_encoder/dataset.py
+++ b/TTS/speaker_encoder/dataset.py
@@ -240,9 +240,6 @@ class MyDataset(Dataset):
             labels.append(torch.LongTensor(labels_))
             feats.extend(feats_)
 
-        if self.num_speakers_in_batch != len(speakers):
-            raise ValueError('Error: Speakers appear more than once on the Batch. This cannot happen because the loss functions AngleProto and GE2E consider these samples to be from another speaker.')
-
         feats = torch.stack(feats)
         labels = torch.stack(labels)
 
diff --git a/TTS/speaker_encoder/losses.py b/TTS/speaker_encoder/losses.py
index 64f0773d..9b573b6d 100644
--- a/TTS/speaker_encoder/losses.py
+++ b/TTS/speaker_encoder/losses.py
@@ -103,7 +103,7 @@ class GE2ELoss(nn.Module):
             L.append(L_row)
         return torch.stack(L)
 
-    def forward(self, x, label=None):
+    def forward(self, x, _label=None):
         """
         Calculates the GE2E loss for an input of dimensions (num_speakers, num_utts_per_speaker, dvec_feats)
         """
@@ -141,8 +141,7 @@ class AngleProtoLoss(nn.Module):
 
         print(" > Initialized Angular Prototypical loss")
 
-    # pylint: disable=W0613
-    def forward(self, x, label=None):
+    def forward(self, x, _label=None):
         """
         Calculates the AngleProto loss for an input of dimensions (num_speakers, num_utts_per_speaker, dvec_feats)
         """
diff --git a/TTS/speaker_encoder/speaker_encoder_config.py b/TTS/speaker_encoder/speaker_encoder_config.py
index dcba3b6c..31149822 100644
--- a/TTS/speaker_encoder/speaker_encoder_config.py
+++ b/TTS/speaker_encoder/speaker_encoder_config.py
@@ -13,11 +13,11 @@ class SpeakerEncoderConfig(BaseTrainingConfig):
     model: str = "speaker_encoder"
     audio: BaseAudioConfig = field(default_factory=BaseAudioConfig)
     datasets: List[BaseDatasetConfig] = field(default_factory=lambda: [BaseDatasetConfig()])
-
     # model params
     model_params: dict = field(
         default_factory=lambda: {
-            "input_dim": 40,
+            "model_name": "lstm",
+            "input_dim": 80,
             "proj_dim": 256,
             "lstm_dim": 768,
             "num_lstm_layers": 3,
@@ -25,16 +25,20 @@
         }
     )
 
+    audio_augmentation : dict = field(
+        default_factory=lambda: {
+        }
+    )
+
     storage: dict = field(
         default_factory=lambda: {
             "sample_from_storage_p": 0.66,  # the probability with which we'll sample from the DataSet in-memory storage
             "storage_size": 15,  # the size of the in-memory storage with respect to a single batch
-            "additive_noise": 1e-5,  # add very small gaussian noise to the data in order to increase robustness
         }
     )
 
     # training params
-    max_train_step: int = 1000  # end training when number of training steps reaches this value.
+    max_train_step: int = 1000000  # end training when number of training steps reaches this value.
loss: str = "angleproto" grad_clip: float = 3.0 lr: float = 0.0001 @@ -53,6 +57,8 @@ class SpeakerEncoderConfig(BaseTrainingConfig): num_speakers_in_batch: int = MISSING num_utters_per_speaker: int = MISSING num_loader_workers: int = MISSING + skip_speakers: bool = False + voice_len: float = 1.6 def check_values(self): super().check_values() diff --git a/TTS/speaker_encoder/utils/generic_utils.py b/TTS/speaker_encoder/utils/generic_utils.py index f3c4b8f8..3299f75a 100644 --- a/TTS/speaker_encoder/utils/generic_utils.py +++ b/TTS/speaker_encoder/utils/generic_utils.py @@ -1,16 +1,17 @@ import re +import os import numpy as np import torch import glob import random +import datetime from scipy import signal from multiprocessing import Manager from TTS.speaker_encoder.models.lstm import LSTMSpeakerEncoder from TTS.speaker_encoder.models.resnet import ResNetSpeakerEncoder -from TTS.utils.generic_utils import check_argument class Storage(object): def __init__(self, maxsize, storage_batchs, num_speakers_in_batch, num_threads=8): @@ -157,10 +158,10 @@ def to_camel(text): return re.sub(r"(?!^)_([a-zA-Z])", lambda m: m.group(1).upper(), text) def setup_model(c): - if c.model_name.lower() == 'lstm': - model = LSTMSpeakerEncoder(c.model["input_dim"], c.model["proj_dim"], c.model["lstm_dim"], c.model["num_lstm_layers"]) - elif c.model_name.lower() == 'resnet': - model = ResNetSpeakerEncoder(input_dim=c.model["input_dim"], proj_dim=c.model["proj_dim"]) + if c.model_params['model_name'].lower() == 'lstm': + model = LSTMSpeakerEncoder(c.model_params["input_dim"], c.model_params["proj_dim"], c.model_params["lstm_dim"], c.model_params["num_lstm_layers"]) + elif c.model_params['model_name'].lower() == 'resnet': + model = ResNetSpeakerEncoder(input_dim=c.model_params["input_dim"], proj_dim=c.model_params["proj_dim"]) return model def save_checkpoint(model, optimizer, criterion, model_loss, out_path, current_step, epoch): @@ -198,75 +199,3 @@ def save_best_model(model, optimizer, criterion, model_loss, best_loss, out_path print("\n > BEST MODEL ({0:.5f}) : {1:}".format(model_loss, bestmodel_path)) torch.save(state, bestmodel_path) return best_loss - - -def check_config_speaker_encoder(c): - """Check the config.json file of the speaker encoder""" - check_argument("run_name", c, restricted=True, val_type=str) - check_argument("run_description", c, val_type=str) - - # audio processing parameters - check_argument("audio", c, restricted=True, val_type=dict) - check_argument("num_mels", c["audio"], restricted=True, val_type=int, min_val=10, max_val=2056) - check_argument("fft_size", c["audio"], restricted=True, val_type=int, min_val=128, max_val=4058) - check_argument("sample_rate", c["audio"], restricted=True, val_type=int, min_val=512, max_val=100000) - check_argument( - "frame_length_ms", - c["audio"], - restricted=True, - val_type=float, - min_val=10, - max_val=1000, - alternative="win_length", - ) - check_argument( - "frame_shift_ms", c["audio"], restricted=True, val_type=float, min_val=1, max_val=1000, alternative="hop_length" - ) - check_argument("preemphasis", c["audio"], restricted=True, val_type=float, min_val=0, max_val=1) - check_argument("min_level_db", c["audio"], restricted=True, val_type=int, min_val=-1000, max_val=10) - check_argument("ref_level_db", c["audio"], restricted=True, val_type=int, min_val=0, max_val=1000) - check_argument("power", c["audio"], restricted=True, val_type=float, min_val=1, max_val=5) - check_argument("griffin_lim_iters", c["audio"], restricted=True, val_type=int, min_val=10, 
-
-    # training parameters
-    check_argument("loss", c, enum_list=["ge2e", "angleproto", "softmaxproto"], restricted=True, val_type=str)
-    check_argument("grad_clip", c, restricted=True, val_type=float)
-    check_argument("epochs", c, restricted=True, val_type=int, min_val=1)
-    check_argument("lr", c, restricted=True, val_type=float, min_val=0)
-    check_argument("lr_decay", c, restricted=True, val_type=bool)
-    check_argument("warmup_steps", c, restricted=True, val_type=int, min_val=0)
-    check_argument("tb_model_param_stats", c, restricted=True, val_type=bool)
-    check_argument("num_speakers_in_batch", c, restricted=True, val_type=int)
-    check_argument("num_loader_workers", c, restricted=True, val_type=int)
-    check_argument("wd", c, restricted=True, val_type=float, min_val=0.0, max_val=1.0)
-
-    # checkpoint and output parameters
-    check_argument("steps_plot_stats", c, restricted=True, val_type=int)
-    check_argument("checkpoint", c, restricted=True, val_type=bool)
-    check_argument("save_step", c, restricted=True, val_type=int)
-    check_argument("print_step", c, restricted=True, val_type=int)
-    check_argument("output_path", c, restricted=True, val_type=str)
-
-    # model parameters
-    check_argument("model", c, restricted=True, val_type=dict)
-    check_argument("model_name", c, restricted=True, val_type=str)
-    check_argument("input_dim", c["model"], restricted=True, val_type=int)
-    check_argument("proj_dim", c["model"], restricted=True, val_type=int)
-
-    if c.model_name.lower() == 'lstm':
-        check_argument("lstm_dim", c["model"], restricted=True, val_type=int)
-        check_argument("num_lstm_layers", c["model"], restricted=True, val_type=int)
-        check_argument("use_lstm_with_projection", c["model"], restricted=True, val_type=bool)
-
-    # in-memory storage parameters
-    check_argument("storage", c, restricted=True, val_type=dict)
-    check_argument("sample_from_storage_p", c["storage"], restricted=True, val_type=float, min_val=0.0, max_val=1.0)
-    check_argument("storage_size", c["storage"], restricted=True, val_type=int, min_val=1, max_val=100)
-
-    # datasets - checking only the first entry
-    check_argument("datasets", c, restricted=True, val_type=list)
-    for dataset_entry in c["datasets"]:
-        check_argument("name", dataset_entry, restricted=True, val_type=str)
-        check_argument("path", dataset_entry, restricted=True, val_type=str)
-        check_argument("meta_file_train", dataset_entry, restricted=True, val_type=[str, list])
-        check_argument("meta_file_val", dataset_entry, restricted=True, val_type=str)