diff --git a/checkpoints/libri_tts/speaker_encoder/libritts_100+360-angleproto-July-31-2020_12+37AM-debug/config.json b/checkpoints/libri_tts/speaker_encoder/libritts_100+360-angleproto-July-31-2020_12+37AM-debug/config.json
deleted file mode 100644
index dbea7ef2..00000000
--- a/checkpoints/libri_tts/speaker_encoder/libritts_100+360-angleproto-July-31-2020_12+37AM-debug/config.json
+++ /dev/null
@@ -1,60 +0,0 @@
-
-"github_branch":"* dev-gst-embeddings",
-{
-    "run_name": "libritts_100+360-angleproto",
-    "run_description": "train speaker encoder for libritts 100 and 360",
-    "audio":{
-        // Audio processing parameters
-        "num_mels": 80,          // size of the mel spec frame.
-        "num_freq": 1024,        // number of stft frequency levels. Size of the linear spectogram frame.
-        "sample_rate": 22050,    // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled.
-        "win_length": 1024,      // stft window length in ms.
-        "hop_length": 256,       // stft window hop-lengh in ms.
-        "frame_length_ms": null, // stft window length in ms.If null, 'win_length' is used.
-        "frame_shift_ms": null,  // stft window hop-lengh in ms. If null, 'hop_length' is used.
-        "preemphasis": 0.98,     // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis.
-        "min_level_db": -100,    // normalization range
-        "ref_level_db": 20,      // reference level db, theoretically 20db is the sound of air.
-        "power": 1.5,            // value to sharpen wav signals after GL algorithm.
-        "griffin_lim_iters": 60, // #griffin-lim iterations. 30-60 is a good range. Larger the value, slower the generation.
-        // Normalization parameters
-        "signal_norm": true,     // normalize the spec values in range [0, 1]
-        "symmetric_norm": true,  // move normalization to range [-1, 1]
-        "max_norm": 4.0,         // scale normalization to range [-max_norm, max_norm] or [0, max_norm]
-        "clip_norm": true,       // clip normalized values into the range.
-        "mel_fmin": 0.0,         // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!!
-        "mel_fmax": 8000.0,      // maximum freq level for mel-spec. Tune for dataset!!
-        "do_trim_silence": false, // enable trimming of slience of audio as you load it. LJspeech (false), TWEB (false), Nancy (true)
-        "trim_db": 60            // threshold for timming silence. Set this according to your dataset.
-    },
-    "reinit_layers": [],
-    "loss": "angleproto",  // "ge2e" to use Generalized End-to-End loss and "angleproto" to use Angular Prototypical loss (new SOTA)
-    "grad_clip": 3.0,      // upper limit for gradients for clipping.
-    "epochs": 1000,        // total number of epochs to train.
-    "lr": 0.0001,          // Initial learning rate. If Noam decay is active, maximum learning rate.
-    "lr_decay": false,     // if true, Noam learning rate decaying is applied through training.
-    "warmup_steps": 4000,  // Noam decay steps to increase the learning rate from 0 to "lr"
-    "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging.
-    "steps_plot_stats": 10, // number of steps to plot embeddings.
-    "num_speakers_in_batch": 32, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'.
-    "wd": 0.000001,        // Weight decay weight.
-    "checkpoint": true,    // If true, it saves checkpoints per "save_step"
-    "save_step": 1000,     // Number of training steps expected to save traning stats and checkpoints.
-    "print_step": 1,       // Number of steps to log traning on console.
-    "output_path": "../../checkpoints/libri_tts/speaker_encoder/", // DATASET-RELATED: output path for all training outputs.
-    "model": {
-        "input_dim": 80,   // input_dim == num_mels
-        "proj_dim": 128,
-        "lstm_dim": 384,
-        "num_lstm_layers": 3
-    },
-    "datasets":
-        [
-        {
-            "name": "vctk",
-            "path": "../../../datasets/VCTK-Corpus-removed-silence/",
-            "meta_file_train": null,
-            "meta_file_val": null
-        }
-        ]
-}
\ No newline at end of file
diff --git a/checkpoints/libri_tts/speaker_encoder/libritts_100+360-angleproto-July-31-2020_12+38AM-debug/config.json b/checkpoints/libri_tts/speaker_encoder/libritts_100+360-angleproto-July-31-2020_12+38AM-debug/config.json
deleted file mode 100644
index dbea7ef2..00000000
--- a/checkpoints/libri_tts/speaker_encoder/libritts_100+360-angleproto-July-31-2020_12+38AM-debug/config.json
+++ /dev/null
@@ -1,60 +0,0 @@
-
-"github_branch":"* dev-gst-embeddings",
-{
-    "run_name": "libritts_100+360-angleproto",
-    "run_description": "train speaker encoder for libritts 100 and 360",
-    "audio":{
-        // Audio processing parameters
-        "num_mels": 80,          // size of the mel spec frame.
-        "num_freq": 1024,        // number of stft frequency levels. Size of the linear spectogram frame.
-        "sample_rate": 22050,    // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled.
-        "win_length": 1024,      // stft window length in ms.
-        "hop_length": 256,       // stft window hop-lengh in ms.
-        "frame_length_ms": null, // stft window length in ms.If null, 'win_length' is used.
-        "frame_shift_ms": null,  // stft window hop-lengh in ms. If null, 'hop_length' is used.
-        "preemphasis": 0.98,     // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis.
-        "min_level_db": -100,    // normalization range
-        "ref_level_db": 20,      // reference level db, theoretically 20db is the sound of air.
-        "power": 1.5,            // value to sharpen wav signals after GL algorithm.
-        "griffin_lim_iters": 60, // #griffin-lim iterations. 30-60 is a good range. Larger the value, slower the generation.
-        // Normalization parameters
-        "signal_norm": true,     // normalize the spec values in range [0, 1]
-        "symmetric_norm": true,  // move normalization to range [-1, 1]
-        "max_norm": 4.0,         // scale normalization to range [-max_norm, max_norm] or [0, max_norm]
-        "clip_norm": true,       // clip normalized values into the range.
-        "mel_fmin": 0.0,         // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!!
-        "mel_fmax": 8000.0,      // maximum freq level for mel-spec. Tune for dataset!!
-        "do_trim_silence": false, // enable trimming of slience of audio as you load it. LJspeech (false), TWEB (false), Nancy (true)
-        "trim_db": 60            // threshold for timming silence. Set this according to your dataset.
-    },
-    "reinit_layers": [],
-    "loss": "angleproto",  // "ge2e" to use Generalized End-to-End loss and "angleproto" to use Angular Prototypical loss (new SOTA)
-    "grad_clip": 3.0,      // upper limit for gradients for clipping.
-    "epochs": 1000,        // total number of epochs to train.
-    "lr": 0.0001,          // Initial learning rate. If Noam decay is active, maximum learning rate.
-    "lr_decay": false,     // if true, Noam learning rate decaying is applied through training.
-    "warmup_steps": 4000,  // Noam decay steps to increase the learning rate from 0 to "lr"
-    "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging.
-    "steps_plot_stats": 10, // number of steps to plot embeddings.
-    "num_speakers_in_batch": 32, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'.
-    "wd": 0.000001,        // Weight decay weight.
-    "checkpoint": true,    // If true, it saves checkpoints per "save_step"
-    "save_step": 1000,     // Number of training steps expected to save traning stats and checkpoints.
-    "print_step": 1,       // Number of steps to log traning on console.
-    "output_path": "../../checkpoints/libri_tts/speaker_encoder/", // DATASET-RELATED: output path for all training outputs.
-    "model": {
-        "input_dim": 80,   // input_dim == num_mels
-        "proj_dim": 128,
-        "lstm_dim": 384,
-        "num_lstm_layers": 3
-    },
-    "datasets":
-        [
-        {
-            "name": "vctk",
-            "path": "../../../datasets/VCTK-Corpus-removed-silence/",
-            "meta_file_train": null,
-            "meta_file_val": null
-        }
-        ]
-}
\ No newline at end of file
diff --git a/checkpoints/libri_tts/speaker_encoder/libritts_100+360-angleproto-July-31-2020_12+40AM-debug/config.json b/checkpoints/libri_tts/speaker_encoder/libritts_100+360-angleproto-July-31-2020_12+40AM-debug/config.json
deleted file mode 100644
index dbea7ef2..00000000
--- a/checkpoints/libri_tts/speaker_encoder/libritts_100+360-angleproto-July-31-2020_12+40AM-debug/config.json
+++ /dev/null
@@ -1,60 +0,0 @@
-
-"github_branch":"* dev-gst-embeddings",
-{
-    "run_name": "libritts_100+360-angleproto",
-    "run_description": "train speaker encoder for libritts 100 and 360",
-    "audio":{
-        // Audio processing parameters
-        "num_mels": 80,          // size of the mel spec frame.
-        "num_freq": 1024,        // number of stft frequency levels. Size of the linear spectogram frame.
-        "sample_rate": 22050,    // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled.
-        "win_length": 1024,      // stft window length in ms.
-        "hop_length": 256,       // stft window hop-lengh in ms.
-        "frame_length_ms": null, // stft window length in ms.If null, 'win_length' is used.
-        "frame_shift_ms": null,  // stft window hop-lengh in ms. If null, 'hop_length' is used.
-        "preemphasis": 0.98,     // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis.
-        "min_level_db": -100,    // normalization range
-        "ref_level_db": 20,      // reference level db, theoretically 20db is the sound of air.
-        "power": 1.5,            // value to sharpen wav signals after GL algorithm.
-        "griffin_lim_iters": 60, // #griffin-lim iterations. 30-60 is a good range. Larger the value, slower the generation.
-        // Normalization parameters
-        "signal_norm": true,     // normalize the spec values in range [0, 1]
-        "symmetric_norm": true,  // move normalization to range [-1, 1]
-        "max_norm": 4.0,         // scale normalization to range [-max_norm, max_norm] or [0, max_norm]
-        "clip_norm": true,       // clip normalized values into the range.
-        "mel_fmin": 0.0,         // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!!
-        "mel_fmax": 8000.0,      // maximum freq level for mel-spec. Tune for dataset!!
-        "do_trim_silence": false, // enable trimming of slience of audio as you load it. LJspeech (false), TWEB (false), Nancy (true)
-        "trim_db": 60            // threshold for timming silence. Set this according to your dataset.
-    },
-    "reinit_layers": [],
-    "loss": "angleproto",  // "ge2e" to use Generalized End-to-End loss and "angleproto" to use Angular Prototypical loss (new SOTA)
-    "grad_clip": 3.0,      // upper limit for gradients for clipping.
-    "epochs": 1000,        // total number of epochs to train.
-    "lr": 0.0001,          // Initial learning rate. If Noam decay is active, maximum learning rate.
-    "lr_decay": false,     // if true, Noam learning rate decaying is applied through training.
-    "warmup_steps": 4000,  // Noam decay steps to increase the learning rate from 0 to "lr"
-    "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging.
-    "steps_plot_stats": 10, // number of steps to plot embeddings.
-    "num_speakers_in_batch": 32, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'.
-    "wd": 0.000001,        // Weight decay weight.
-    "checkpoint": true,    // If true, it saves checkpoints per "save_step"
-    "save_step": 1000,     // Number of training steps expected to save traning stats and checkpoints.
-    "print_step": 1,       // Number of steps to log traning on console.
-    "output_path": "../../checkpoints/libri_tts/speaker_encoder/", // DATASET-RELATED: output path for all training outputs.
-    "model": {
-        "input_dim": 80,   // input_dim == num_mels
-        "proj_dim": 128,
-        "lstm_dim": 384,
-        "num_lstm_layers": 3
-    },
-    "datasets":
-        [
-        {
-            "name": "vctk",
-            "path": "../../../datasets/VCTK-Corpus-removed-silence/",
-            "meta_file_train": null,
-            "meta_file_val": null
-        }
-        ]
-}
\ No newline at end of file
diff --git a/mozilla_voice_tts/speaker_encoder/losses.py b/mozilla_voice_tts/speaker_encoder/losses.py
index 7feced64..9065ccfd 100644
--- a/mozilla_voice_tts/speaker_encoder/losses.py
+++ b/mozilla_voice_tts/speaker_encoder/losses.py
@@ -23,7 +23,7 @@ class GE2ELoss(nn.Module):
         self.b = nn.Parameter(torch.tensor(init_b))
         self.loss_method = loss_method
 
-        print('Initialised Generalized End-to-End loss')
+        print(' > Initialised Generalized End-to-End loss')
 
         assert self.loss_method in ["softmax", "contrast"]
 
@@ -142,7 +142,7 @@ class AngleProtoLoss(nn.Module):
         self.b = nn.Parameter(torch.tensor(init_b))
         self.criterion = torch.nn.CrossEntropyLoss()
 
-        print('Initialised Angular Prototypical loss')
+        print(' > Initialised Angular Prototypical loss')
 
     def forward(self, x):
         """
diff --git a/notebooks/AngleProto-Speaker_Encoder- ExtractSpeakerEmbeddings-by-sample.ipynb b/notebooks/AngleProto-Speaker_Encoder- ExtractSpeakerEmbeddings-by-sample.ipynb
new file mode 100644
index 00000000..d660a7f5
--- /dev/null
+++ b/notebooks/AngleProto-Speaker_Encoder- ExtractSpeakerEmbeddings-by-sample.ipynb
@@ -0,0 +1,163 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "This notebook generates speaker embeddings with the AngleProto speaker encoder model for multi-speaker training.\n",
+    "\n",
+    "Before running this notebook, don't forget to:\n",
+    "- download the related model files from TTS,\n",
+    "- download or clone the related repos (linked below),\n",
+    "- set up the repositories: `python setup.py install`,\n",
+    "- check out the right commit version of TTS (given next to the model),\n",
+    "- set the right file paths in the cells below.\n",
+    "\n",
+    "Repository:\n",
+    "- TTS: https://github.com/mozilla/TTS"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%load_ext autoreload\n",
+    "%autoreload 2\n",
+    "import os\n",
+    "import importlib\n",
+    "import random\n",
+    "import librosa\n",
+    "import torch\n",
+    "\n",
+    "import numpy as np\n",
+    "from tqdm import tqdm\n",
+    "\n",
+    "# you may need to change this depending on your system\n",
+    "os.environ['CUDA_VISIBLE_DEVICES'] = '0'\n",
+    "\n",
+    "from TTS.speaker_encoder.model import SpeakerEncoder\n",
+    "from TTS.tts.utils.speakers import save_speaker_mapping, load_speaker_mapping\n",
+    "from TTS.utils.audio import AudioProcessor\n",
+    "from TTS.utils.io import load_config"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "You should also adjust all the path constants below to point at the relevant locations on your machine."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "MODEL_RUN_PATH = \"../../Mozilla-TTS/checkpoints/libritts_100+360-angleproto-June-06-2020_04+12PM-9c04d1f/\"\n",
+    "MODEL_PATH = MODEL_RUN_PATH + \"best_model.pth.tar\"\n",
+    "CONFIG_PATH = MODEL_RUN_PATH + \"config.json\"\n",
+    "\n",
+    "\n",
+    "DATASETS_NAME = ['vctk'] # list the datasets\n",
+    "DATASETS_PATH = ['../../../datasets/VCTK/']\n",
+    "DATASETS_METAFILE = ['']\n",
+    "\n",
+    "USE_CUDA = True"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# preprocess the datasets and collect (text, wav_file, speaker_id) samples\n",
+    "meta_data = []\n",
+    "for i in range(len(DATASETS_NAME)):\n",
+    "    preprocessor = importlib.import_module('TTS.datasets.preprocess')\n",
+    "    preprocessor = getattr(preprocessor, DATASETS_NAME[i].lower())\n",
+    "    meta_data += preprocessor(DATASETS_PATH[i], DATASETS_METAFILE[i])\n",
+    "\n",
+    "meta_data = list(meta_data)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "c = load_config(CONFIG_PATH)\n",
+    "ap = AudioProcessor(**c['audio'])\n",
+    "\n",
+    "model = SpeakerEncoder(**c.model)\n",
+    "model.load_state_dict(torch.load(MODEL_PATH)['model'])\n",
+    "model.eval()\n",
+    "if USE_CUDA:\n",
+    "    model.cuda()\n",
+    "\n",
+    "# compute one embedding per wav file\n",
+    "embeddings_dict = {}\n",
+    "len_meta_data = len(meta_data)\n",
+    "\n",
+    "for i in tqdm(range(len_meta_data)):\n",
+    "    _, wav_file, speaker_id = meta_data[i]\n",
+    "    wav_file_name = os.path.basename(wav_file)\n",
+    "    mel_spec = ap.melspectrogram(ap.load_wav(wav_file)).T\n",
+    "    mel_spec = torch.FloatTensor(mel_spec[None, :, :])\n",
+    "    if USE_CUDA:\n",
+    "        mel_spec = mel_spec.cuda()\n",
+    "    embedd = model.compute_embedding(mel_spec).cpu().detach().numpy().reshape(-1)\n",
+    "    embeddings_dict[wav_file_name] = [embedd, speaker_id]\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# create and export speakers.json\n",
+    "speaker_mapping = {sample: {'name': embeddings_dict[sample][1], 'embedding': embeddings_dict[sample][0].reshape(-1).tolist()} for sample in embeddings_dict.keys()}\n",
+    "save_speaker_mapping(MODEL_RUN_PATH, speaker_mapping)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# test the exported file's integrity by loading it back\n",
+    "speaker_mapping_load = load_speaker_mapping(MODEL_RUN_PATH)\n",
+    "assert speaker_mapping == speaker_mapping_load\n",
+    "print(\"The file speakers.json has been exported to\", MODEL_RUN_PATH, \"with\", len(embeddings_dict.keys()), \"speakers\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/notebooks/GE2E-Speaker_Encoder- ExtractSpeakerEmbeddings-by-sample.ipynb b/notebooks/GE2E-Speaker_Encoder- ExtractSpeakerEmbeddings-by-sample.ipynb
index 324de2d0..2fba4d49 100644
--- a/notebooks/GE2E-Speaker_Encoder- ExtractSpeakerEmbeddings-by-sample.ipynb
+++ b/notebooks/GE2E-Speaker_Encoder- ExtractSpeakerEmbeddings-by-sample.ipynb
@@ -63,9 +63,9 @@
     "CONFIG_PATH = MODEL_RUN_PATH + \"config.json\"\n",
     "\n",
     "\n",
-    "DATASETS_NAME = ['brspeech'] # list the datasets\n",
-    "DATASETS_PATH = ['../../../datasets/BRSpeech-2.0-beta8']\n",
-    "DATASETS_METAFILE = ['TTS_metadata_brspeech2+cv_all_valited_lines.csv']\n",
+    "DATASETS_NAME = ['vctk'] # list the datasets\n",
+    "DATASETS_PATH = ['../../../datasets/VCTK/']\n",
+    "DATASETS_METAFILE = ['']\n",
     "\n",
     "USE_CUDA = True"