diff --git a/TTS/bin/train_encoder.py b/TTS/bin/train_encoder.py
index f26ef6f7..95030309 100644
--- a/TTS/bin/train_encoder.py
+++ b/TTS/bin/train_encoder.py
@@ -11,11 +11,12 @@ import torch
 from torch.utils.data import DataLoader
 
 from TTS.speaker_encoder.dataset import MyDataset
-from TTS.speaker_encoder.utils.generic_utils import save_best_model
 from TTS.speaker_encoder.losses import GE2ELoss, AngleProtoLoss
 from TTS.speaker_encoder.model import SpeakerEncoder
-from TTS.speaker_encoder.utils.visual import plot_embeddings
+from TTS.speaker_encoder.utils import check_config_speaker_encoder
+from TTS.speaker_encoder.visuals import plot_embeddings
 from TTS.tts.datasets.preprocess import load_meta_data
+from TTS.tts.utils.io import save_best_model
 from TTS.utils.generic_utils import (
     create_experiment_folder, get_git_branch, remove_experiment_folder,
     set_init_dict)
@@ -235,6 +236,7 @@ if __name__ == '__main__':
 
     # setup output paths and read configs
     c = load_config(args.config_path)
+    check_config_speaker_encoder(c)
     _ = os.path.dirname(os.path.realpath(__file__))
     if args.data_path != '':
         c.data_path = args.data_path
diff --git a/TTS/bin/train_tts.py b/TTS/bin/train_tts.py
index f2641f9d..1b7351d4 100644
--- a/TTS/bin/train_tts.py
+++ b/TTS/bin/train_tts.py
@@ -17,7 +17,7 @@ from TTS.tts.layers.losses import TacotronLoss
 from TTS.tts.utils.distribute import (DistributedSampler,
                                       apply_gradient_allreduce,
                                       init_distributed, reduce_tensor)
-from TTS.tts.utils.generic_utils import check_config, setup_model
+from TTS.tts.utils.generic_utils import setup_model, check_config_tts
 from TTS.tts.utils.io import save_best_model, save_checkpoint
 from TTS.tts.utils.measures import alignment_diagonal_score
 from TTS.tts.utils.speakers import (get_speakers, load_speaker_mapping,
@@ -670,7 +670,7 @@ if __name__ == '__main__':
 
     # setup output paths and read configs
     c = load_config(args.config_path)
-    check_config(c)
+    check_config_tts(c)
    _ = os.path.dirname(os.path.realpath(__file__))
 
     if c.apex_amp_level == 'O1':
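For context: both check_config_speaker_encoder and check_config_tts delegate to the existing check_argument helper in TTS.utils.generic_utils. The sketch below illustrates the contract the calls in this diff rely on (restricted keys must exist; values are range-, enum- and type-checked; 'alternative' names another key that may satisfy the requirement). It is an assumed approximation for the reader, not the upstream implementation verbatim:

    def check_argument(name, c, enum_list=None, max_val=None, min_val=None,
                       restricted=False, val_type=None, alternative=None):
        # an 'alternative' key that is set satisfies the requirement
        if alternative is not None and c.get(alternative) is not None:
            return
        if restricted:
            assert name in c, f" [!] '{name}' not defined in config.json"
        if name in c and c[name] is not None:
            if max_val is not None:
                assert c[name] <= max_val, f" [!] '{name}' is larger than {max_val}"
            if min_val is not None:
                assert c[name] >= min_val, f" [!] '{name}' is smaller than {min_val}"
            if enum_list is not None:
                assert c[name].lower() in enum_list, f" [!] '{name}' is not a valid value"
            if val_type is not None:
                # val_type may be a single type or a list of accepted types
                types = val_type if isinstance(val_type, list) else [val_type]
                assert any(isinstance(c[name], t) for t in types), f" [!] '{name}' has the wrong type"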
diff --git a/TTS/speaker_encoder/config.json b/TTS/speaker_encoder/config.json
new file mode 100644
index 00000000..4fbd84cc
--- /dev/null
+++ b/TTS/speaker_encoder/config.json
@@ -0,0 +1,103 @@
+
+{
+    "run_name": "mueller91",
+    "run_description": "train speaker encoder with voxceleb1, voxceleb2 and LibriSpeech",
+    "audio": {
+        // Audio processing parameters
+        "num_mels": 40,           // size of the mel spec frame.
+        "fft_size": 400,          // number of stft frequency levels. Size of the linear spectrogram frame.
+        "sample_rate": 16000,     // DATASET-RELATED: wav sample rate. If different from the original data, it is resampled.
+        "win_length": 400,        // stft window length in samples.
+        "hop_length": 160,        // stft window hop length in samples.
+        "frame_length_ms": null,  // stft window length in ms. If null, 'win_length' is used.
+        "frame_shift_ms": null,   // stft window hop length in ms. If null, 'hop_length' is used.
+        "preemphasis": 0.98,      // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no pre-emphasis is applied.
+        "min_level_db": -100,     // normalization range.
+        "ref_level_db": 20,       // reference level db; theoretically 20 db is the sound of air.
+        "power": 1.5,             // value to sharpen wav signals after the GL algorithm.
+        "griffin_lim_iters": 60,  // number of griffin-lim iterations. 30-60 is a good range. The larger the value, the slower the generation.
+        // Normalization parameters
+        "signal_norm": true,      // normalize the spec values in range [0, 1].
+        "symmetric_norm": true,   // move normalization to range [-1, 1].
+        "max_norm": 4.0,          // scale normalization to range [-max_norm, max_norm] or [0, max_norm].
+        "clip_norm": true,        // clip normalized values into the range.
+        "mel_fmin": 0.0,          // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for your dataset!
+        "mel_fmax": 8000.0,       // maximum freq level for mel-spec. Tune for your dataset!
+        "do_trim_silence": true,  // enable trimming of silence as audio is loaded. LJSpeech (false), TWEB (false), Nancy (true).
+        "trim_db": 60             // threshold for trimming silence. Set this according to your dataset.
+    },
+    "reinit_layers": [],
+    "loss": "angleproto",         // "ge2e" to use Generalized End-to-End loss, "angleproto" to use Angular Prototypical loss (new SOTA).
+    "grad_clip": 3.0,             // upper limit for gradient clipping.
+    "epochs": 1000,               // total number of epochs to train.
+    "lr": 0.0001,                 // initial learning rate. If Noam decay is active, maximum learning rate.
+    "lr_decay": false,            // if true, Noam learning rate decay is applied through training.
+    "warmup_steps": 4000,         // Noam decay steps to increase the learning rate from 0 to "lr".
+    "tb_model_param_stats": false, // if true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging.
+    "steps_plot_stats": 10,       // number of steps between embedding plots.
+    "num_speakers_in_batch": 64,  // number of distinct speakers sampled per training batch.
+    "num_utters_per_speaker": 10, // number of utterances sampled per speaker in a batch.
+    "num_loader_workers": 8,      // number of training data loader processes. Don't set it too big. 4-8 are good values.
+    "wd": 0.000001,               // weight decay weight.
+    "checkpoint": true,           // if true, it saves checkpoints per "save_step".
+    "save_step": 1000,            // number of training steps between saving training stats and checkpoints.
+    "print_step": 20,             // number of steps between logging training on console.
+    "output_path": "../../MozillaTTSOutput/checkpoints/voxceleb_librispeech/speaker_encoder/", // DATASET-RELATED: output path for all training outputs.
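Note that this config uses // line comments, which strict JSON forbids; it only parses because TTS's load_config strips comments before handing the text to json.loads. A minimal sketch of that idea (the helper name load_commented_json is ours; the real load_config also wraps the result for attribute access, and the naive regex would mangle string values containing "//"):

    import json
    import re

    def load_commented_json(path):
        with open(path, "r") as f:
            text = f.read()
        text = re.sub(r'\\\n', '', text)      # join escaped line continuations
        text = re.sub(r'//.*\n', '\n', text)  # drop // line comments
        return json.loads(text)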
+ "model": { + "input_dim": 40, + "proj_dim": 256, + "lstm_dim": 768, + "num_lstm_layers": 3, + "use_lstm_with_projection": true + }, + "storage": { + "sample_from_storage_p": 0.66, // the probability with which we'll sample from the DataSet in-memory storage + "storage_size": 15, // the size of the in-memory storage with respect to a single batch + "additive_noise": 1e-5 // add very small gaussian noise to the data in order to increase robustness + }, + "datasets": + [ + { + "name": "vctk_slim", + "path": "../../../audio-datasets/en/VCTK-Corpus/", + "meta_file_train": null, + "meta_file_val": null + }, + { + "name": "libri_tts", + "path": "../../../audio-datasets/en/LibriTTS/train-clean-100", + "meta_file_train": null, + "meta_file_val": null + }, + { + "name": "libri_tts", + "path": "../../../audio-datasets/en/LibriTTS/train-clean-360", + "meta_file_train": null, + "meta_file_val": null + }, + { + "name": "libri_tts", + "path": "../../../audio-datasets/en/LibriTTS/train-other-500", + "meta_file_train": null, + "meta_file_val": null + }, + { + "name": "voxceleb1", + "path": "../../../audio-datasets/en/voxceleb1/", + "meta_file_train": null, + "meta_file_val": null + }, + { + "name": "voxceleb2", + "path": "../../../audio-datasets/en/voxceleb2/", + "meta_file_train": null, + "meta_file_val": null + }, + { + "name": "common_voice", + "path": "../../../audio-datasets/en/MozillaCommonVoice", + "meta_file_train": "train.tsv", + "meta_file_val": "test.tsv" + } + ] +} \ No newline at end of file diff --git a/TTS/speaker_encoder/utils.py b/TTS/speaker_encoder/utils.py new file mode 100644 index 00000000..95c222f2 --- /dev/null +++ b/TTS/speaker_encoder/utils.py @@ -0,0 +1,61 @@ +from TTS.utils.generic_utils import check_argument + + +def check_config_speaker_encoder(c): + """Check the config.json file of the speaker encoder""" + check_argument('run_name', c, restricted=True, val_type=str) + check_argument('run_description', c, val_type=str) + + # audio processing parameters + check_argument('audio', c, restricted=True, val_type=dict) + check_argument('num_mels', c['audio'], restricted=True, val_type=int, min_val=10, max_val=2056) + check_argument('fft_size', c['audio'], restricted=True, val_type=int, min_val=128, max_val=4058) + check_argument('sample_rate', c['audio'], restricted=True, val_type=int, min_val=512, max_val=100000) + check_argument('frame_length_ms', c['audio'], restricted=True, val_type=float, min_val=10, max_val=1000, alternative='win_length') + check_argument('frame_shift_ms', c['audio'], restricted=True, val_type=float, min_val=1, max_val=1000, alternative='hop_length') + check_argument('preemphasis', c['audio'], restricted=True, val_type=float, min_val=0, max_val=1) + check_argument('min_level_db', c['audio'], restricted=True, val_type=int, min_val=-1000, max_val=10) + check_argument('ref_level_db', c['audio'], restricted=True, val_type=int, min_val=0, max_val=1000) + check_argument('power', c['audio'], restricted=True, val_type=float, min_val=1, max_val=5) + check_argument('griffin_lim_iters', c['audio'], restricted=True, val_type=int, min_val=10, max_val=1000) + + # training parameters + check_argument('loss', c, enum_list=['ge2e', 'angleproto'], restricted=True, val_type=str) + check_argument('grad_clip', c, restricted=True, val_type=float) + check_argument('epochs', c, restricted=True, val_type=int, min_val=1) + check_argument('lr', c, restricted=True, val_type=float, min_val=0) + check_argument('lr_decay', c, restricted=True, val_type=bool) + check_argument('warmup_steps', 
diff --git a/TTS/speaker_encoder/visuals.py b/TTS/speaker_encoder/visuals.py
new file mode 100644
index 00000000..68c48f12
--- /dev/null
+++ b/TTS/speaker_encoder/visuals.py
@@ -0,0 +1,46 @@
+import umap
+import numpy as np
+import matplotlib
+
+matplotlib.use("Agg")  # select the non-interactive backend before pyplot is imported
+
+import matplotlib.pyplot as plt
+
+colormap = (
+    np.array(
+        [
+            [76, 255, 0],
+            [0, 127, 70],
+            [255, 0, 0],
+            [255, 217, 38],
+            [0, 135, 255],
+            [165, 0, 165],
+            [255, 167, 255],
+            [0, 255, 255],
+            [255, 96, 38],
+            [142, 76, 0],
+            [33, 0, 127],
+            [0, 0, 0],
+            [183, 183, 183],
+        ],
+        dtype=float,  # the np.float alias is deprecated; use the builtin float
+    )
+    / 255
+)
+
+
+def plot_embeddings(embeddings, num_utter_per_speaker):
+    embeddings = embeddings[: 10 * num_utter_per_speaker]
+    model = umap.UMAP()
+    projection = model.fit_transform(embeddings)
+    num_speakers = embeddings.shape[0] // num_utter_per_speaker
+    ground_truth = np.repeat(np.arange(num_speakers), num_utter_per_speaker)
+    colors = [colormap[i] for i in ground_truth]
+
+    fig, ax = plt.subplots(figsize=(16, 10))
+    _ = ax.scatter(projection[:, 0], projection[:, 1], c=colors)
+    plt.gca().set_aspect("equal", "datalim")
+    plt.title("UMAP projection")
+    plt.tight_layout()
+    plt.savefig("umap.png")
+    return fig
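A quick smoke test for plot_embeddings with synthetic data; the shapes are illustrative assumptions (5 speakers x 10 utterances of 256-dim embeddings), and note the function also writes umap.png to the working directory:

    import numpy as np
    from TTS.speaker_encoder.visuals import plot_embeddings

    num_speakers, num_utter_per_speaker, dim = 5, 10, 256
    embeddings = np.random.rand(num_speakers * num_utter_per_speaker, dim)
    fig = plot_embeddings(embeddings, num_utter_per_speaker)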
diff --git a/TTS/tts/utils/generic_utils.py b/TTS/tts/utils/generic_utils.py
index 0ff462dd..a0aba29a 100644
--- a/TTS/tts/utils/generic_utils.py
+++ b/TTS/tts/utils/generic_utils.py
@@ -127,7 +127,7 @@ def setup_model(num_chars, num_speakers, c, speaker_embedding_dim=None):
     return model
 
 
-def check_config(c):
+def check_config_tts(c):
     check_argument('model', c, enum_list=['tacotron', 'tacotron2'], restricted=True, val_type=str)
     check_argument('run_name', c, restricted=True, val_type=str)
     check_argument('run_description', c, val_type=str)
@@ -167,11 +167,6 @@ def check_config(c):
     check_argument('do_trim_silence', c['audio'], restricted=True, val_type=bool)
     check_argument('trim_db', c['audio'], restricted=True, val_type=int)
 
-    # storage parameters
-    check_argument('sample_from_storage_p', c['storage'], restricted=True, val_type=float, min_val=0.0, max_val=1.0)
-    check_argument('storage_size', c['storage'], restricted=True, val_type=int, min_val=1, max_val=100)
-    check_argument('additive_noise', c['storage'], restricted=True, val_type=float, min_val=0.0, max_val=1.0)
-
     # training parameters
     check_argument('batch_size', c, restricted=True, val_type=int, min_val=1)
     check_argument('eval_batch_size', c, restricted=True, val_type=int, min_val=1)
diff --git a/requirements.txt b/requirements.txt
index 6301021e..36387e4d 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -21,4 +21,5 @@ nose==1.3.7
 cardboardlint==1.3.0
 pylint==2.5.3
 gdown
+umap-learn
 cython
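One packaging pitfall worth flagging: the module imported as umap in visuals.py is distributed on PyPI as umap-learn, while the package literally named umap is an unrelated project, hence the longer name in the requirement above. A quick guard (assuming, as we believe, that only umap-learn exposes the UMAP estimator class):

    import umap
    assert hasattr(umap, "UMAP"), "wrong package installed: use umap-learn, not umap"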