diff --git a/notebooks/AngleProto-Speaker_Encoder- ExtractSpeakerEmbeddings-by-sample.ipynb b/notebooks/AngleProto-Speaker_Encoder- ExtractSpeakerEmbeddings-by-sample.ipynb
deleted file mode 100644
index 15206130..00000000
--- a/notebooks/AngleProto-Speaker_Encoder- ExtractSpeakerEmbeddings-by-sample.ipynb
+++ /dev/null
@@ -1,163 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "This is a notebook used to generate speaker embeddings with the AngleProto speaker encoder model for multi-speaker training.\n",
-    "\n",
-    "Before running this script please DON'T FORGET:\n",
-    "- to set the file paths.\n",
-    "- to download the related model files from TTS.\n",
-    "- to download or clone the related repos, linked below.\n",
-    "- to set up the repositories: ```python setup.py install```\n",
-    "- to check out the right commit version of TTS (given next to the model).\n",
-    "- to set the right paths in the cell below.\n",
-    "\n",
-    "Repository:\n",
-    "- TTS: https://github.com/mozilla/TTS"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "%load_ext autoreload\n",
-    "%autoreload 2\n",
-    "import os\n",
-    "import importlib\n",
-    "import random\n",
-    "import librosa\n",
-    "import torch\n",
-    "\n",
-    "import numpy as np\n",
-    "from tqdm import tqdm\n",
-    "from TTS.tts.utils.speakers import save_speaker_mapping, load_speaker_mapping\n",
-    "\n",
-    "# you may need to change this depending on your system\n",
-    "os.environ['CUDA_VISIBLE_DEVICES'] = '0'\n",
-    "\n",
-    "# import path valid for the TTS commit this notebook targets; it may differ in newer versions\n",
-    "from TTS.speaker_encoder.model import SpeakerEncoder\n",
-    "from TTS.utils.audio import AudioProcessor\n",
-    "from TTS.utils.io import load_config"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "You should also adjust all the path constants to point at the relevant locations on your local machine."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "MODEL_RUN_PATH = \"../../Mozilla-TTS/checkpoints/libritts_100+360-angleproto-June-06-2020_04+12PM-9c04d1f/\"\n",
-    "MODEL_PATH = MODEL_RUN_PATH + \"best_model.pth.tar\"\n",
-    "CONFIG_PATH = MODEL_RUN_PATH + \"config.json\"\n",
-    "\n",
-    "\n",
-    "DATASETS_NAME = ['vctk'] # list the datasets\n",
-    "DATASETS_PATH = ['../../../datasets/VCTK/']\n",
-    "DATASETS_METAFILE = ['']\n",
-    "\n",
-    "USE_CUDA = True"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Preprocess dataset\n",
-    "meta_data = []\n",
-    "for i in range(len(DATASETS_NAME)):\n",
-    "    preprocessor = importlib.import_module('TTS.tts.datasets.preprocess')\n",
-    "    preprocessor = getattr(preprocessor, DATASETS_NAME[i].lower())\n",
-    "    meta_data += preprocessor(DATASETS_PATH[i], DATASETS_METAFILE[i])\n",
-    "\n",
-    "meta_data = list(meta_data)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "c = load_config(CONFIG_PATH)\n",
-    "ap = AudioProcessor(**c['audio'])\n",
-    "\n",
-    "model = SpeakerEncoder(**c.model)\n",
-    "model.load_state_dict(torch.load(MODEL_PATH)['model'])\n",
-    "model.eval()\n",
-    "if USE_CUDA:\n",
-    "    model.cuda()\n",
-    "\n",
-    "embeddings_dict = {}\n",
-    "len_meta_data = len(meta_data)\n",
-    "\n",
-    "for i in tqdm(range(len_meta_data)):\n",
-    "    _, wav_file, speaker_id = meta_data[i]\n",
-    "    wav_file_name = os.path.basename(wav_file)\n",
-    "    mel_spec = ap.melspectrogram(ap.load_wav(wav_file)).T\n",
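-    "    # the encoder consumes batches, so add a leading batch dimension: (1, T, n_mels)\n",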
-    "    mel_spec = torch.FloatTensor(mel_spec[None, :, :])\n",
-    "    if USE_CUDA:\n",
-    "        mel_spec = mel_spec.cuda()\n",
-    "    embedd = model.compute_embedding(mel_spec).cpu().detach().numpy().reshape(-1)\n",
-    "    embeddings_dict[wav_file_name] = [embedd, speaker_id]\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# create and export speakers.json\n",
-    "speaker_mapping = {sample: {'name': embeddings_dict[sample][1], 'embedding': embeddings_dict[sample][0].reshape(-1).tolist()} for sample in embeddings_dict}\n",
-    "save_speaker_mapping(MODEL_RUN_PATH, speaker_mapping)\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# test load integrity\n",
-    "speaker_mapping_load = load_speaker_mapping(MODEL_RUN_PATH)\n",
-    "assert speaker_mapping == speaker_mapping_load\n",
-    "print(\"The file speakers.json has been exported to\", MODEL_RUN_PATH, \"with\", len(embeddings_dict), \"speakers\")"
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.7.6"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 4
-}
diff --git a/notebooks/GE2E-CorentinJ-ExtractSpeakerEmbeddings-by-sample.ipynb b/notebooks/GE2E-CorentinJ-ExtractSpeakerEmbeddings-by-sample.ipynb
deleted file mode 100644
index 1c4e8759..00000000
--- a/notebooks/GE2E-CorentinJ-ExtractSpeakerEmbeddings-by-sample.ipynb
+++ /dev/null
@@ -1,212 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "This is a notebook used to generate speaker embeddings with the CorentinJ GE2E model trained with Angular Prototypical loss for multi-speaker training.\n",
-    "\n",
-    "Before running this script please DON'T FORGET:\n",
-    "- to set the right paths in the cell below.\n",
-    "\n",
-    "Repositories:\n",
-    "- TTS: https://github.com/coqui/TTS\n",
-    "- CorentinJ GE2E: https://github.com/Edresson/GE2E-Speaker-Encoder"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "scrolled": true
-   },
-   "outputs": [],
-   "source": [
-    "import os\n",
-    "import importlib\n",
-    "import random\n",
-    "import librosa\n",
-    "import torch\n",
-    "\n",
-    "import numpy as np\n",
-    "from TTS.utils.io import load_config\n",
-    "from tqdm import tqdm\n",
-    "from TTS.tts.utils.speakers import save_speaker_mapping, load_speaker_mapping\n",
-    "\n",
-    "# you may need to change this depending on your system\n",
-    "os.environ['CUDA_VISIBLE_DEVICES'] = '0'"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Clone the encoder repository\n",
-    "!git clone https://github.com/CorentinJ/Real-Time-Voice-Cloning.git\n",
-    "os.chdir('Real-Time-Voice-Cloning/')"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Install the encoder requirements\n",
-    "# (version specifiers are quoted so the shell does not treat '>=' as a redirect)\n",
-    "!python -m pip install umap-learn visdom webrtcvad \"librosa>=0.5.1\" \"matplotlib>=2.0.2\" \"numpy>=1.14.0\" \"scipy>=1.0.0\" tqdm sounddevice Unidecode inflect multiprocess numba"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Download the encoder checkpoint\n",
-    "!wget https://github.com/Edresson/Real-Time-Voice-Cloning/releases/download/checkpoints/pretrained.zip\n",
-    "!unzip pretrained.zip"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from encoder import inference as encoder\n",
-    "from encoder.params_model import model_embedding_size as speaker_embedding_size\n",
-    "from pathlib import Path"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "print(\"Preparing the encoder...\")\n",
-    "encoder.load_model(Path('encoder/saved_models/pretrained.pt'))\n",
-    "print(\"Testing your configuration with small inputs.\")\n",
-    "# Forward an audio waveform of zeroes that lasts 1 second. Notice how we can get the encoder's\n",
-    "# sampling rate, which may differ from your audio's.\n",
-    "# If you're unfamiliar with digital audio, know that it is encoded as an array of floats\n",
-    "# (or sometimes integers, but mostly floats in this project) ranging from -1 to 1.\n",
-    "# The sampling rate is the number of values (samples) recorded per second; it is set to\n",
-    "# 16000 for the encoder. Creating an array of length `sampling_rate` will therefore always\n",
-    "# correspond to 1 second of audio.\n",
-    "print(\"\\tTesting the encoder...\")\n",
-    "\n",
-    "wav = np.zeros(encoder.sampling_rate)\n",
-    "embed = encoder.embed_utterance(wav)\n",
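-    "# embed is a single utterance-level vector of length speaker_embedding_size;\n",
-    "# as the comment below notes, the encoder already returns it L2-normalized\n",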
-    "print(embed.shape)\n",
-    "\n",
-    "# Embeddings are L2-normalized (this isn't important here, but if you want to make your own\n",
-    "# embeddings it will be).\n",
-    "#embed /= np.linalg.norm(embed) # for a random embedding\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "SAVE_PATH = '../'"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Set constants\n",
-    "DATASETS_NAME = ['vctk'] # list the datasets\n",
-    "DATASETS_PATH = ['../../../../../datasets/VCTK-Corpus-removed-silence/']\n",
-    "DATASETS_METAFILE = ['']\n",
-    "USE_CUDA = True"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Preprocess dataset\n",
-    "meta_data = []\n",
-    "for i in range(len(DATASETS_NAME)):\n",
-    "    preprocessor = importlib.import_module('TTS.tts.datasets.preprocess')\n",
-    "    preprocessor = getattr(preprocessor, DATASETS_NAME[i].lower())\n",
-    "    meta_data += preprocessor(DATASETS_PATH[i], DATASETS_METAFILE[i])\n",
-    "\n",
-    "meta_data = list(meta_data)\n",
-    "\n",
-    "embeddings_dict = {}\n",
-    "len_meta_data = len(meta_data)\n",
-    "for i in tqdm(range(len_meta_data)):\n",
-    "    _, wave_file_path, speaker_id = meta_data[i]\n",
-    "    wav_file_name = os.path.basename(wave_file_path)\n",
-    "    # Extract the embedding\n",
-    "    preprocessed_wav = encoder.preprocess_wav(wave_file_path)\n",
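-    "    # preprocess_wav loads the file, resamples it to the encoder's sampling rate and\n",
-    "    # trims silences (an assumption based on the Real-Time-Voice-Cloning implementation)\n",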
-    "    file_embedding = encoder.embed_utterance(preprocessed_wav)\n",
-    "    embeddings_dict[wav_file_name] = [file_embedding.reshape(-1).tolist(), speaker_id]\n",
-    "    del file_embedding"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# create and export speakers.json (the embeddings are already L2-normalized by the encoder)\n",
-    "speaker_mapping = {sample: {'name': embeddings_dict[sample][1], 'embedding': embeddings_dict[sample][0]} for sample in embeddings_dict}\n",
-    "save_speaker_mapping(SAVE_PATH, speaker_mapping)\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# test load integrity\n",
-    "speaker_mapping_load = load_speaker_mapping(SAVE_PATH)\n",
-    "assert speaker_mapping == speaker_mapping_load\n",
-    "print(\"The file speakers.json has been exported to\", SAVE_PATH, \"with\", len(embeddings_dict), \"samples\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.8.5"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 4
-}
diff --git a/notebooks/GE2E-Speaker_Encoder- ExtractSpeakerEmbeddings-by-sample.ipynb b/notebooks/GE2E-Speaker_Encoder- ExtractSpeakerEmbeddings-by-sample.ipynb
deleted file mode 100644
index 09add419..00000000
--- a/notebooks/GE2E-Speaker_Encoder- ExtractSpeakerEmbeddings-by-sample.ipynb
+++ /dev/null
@@ -1,163 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "This is a notebook used to generate speaker embeddings with the GE2E speaker encoder model for multi-speaker training.\n",
-    "\n",
-    "Before running this script please DON'T FORGET:\n",
-    "- to set the file paths.\n",
-    "- to download the related model files from TTS.\n",
-    "- to download or clone the related repos, linked below.\n",
-    "- to set up the repositories: ```python setup.py install```\n",
-    "- to check out the right commit version of TTS (given next to the model).\n",
-    "- to set the right paths in the cell below.\n",
-    "\n",
-    "Repository:\n",
-    "- TTS: https://github.com/coqui/TTS"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "%load_ext autoreload\n",
-    "%autoreload 2\n",
-    "import os\n",
-    "import importlib\n",
-    "import random\n",
-    "import librosa\n",
-    "import torch\n",
-    "\n",
-    "import numpy as np\n",
-    "from tqdm import tqdm\n",
-    "from TTS.tts.utils.speakers import save_speaker_mapping, load_speaker_mapping\n",
-    "\n",
-    "# you may need to change this depending on your system\n",
-    "os.environ['CUDA_VISIBLE_DEVICES'] = '0'\n",
-    "\n",
-    "# import path valid for the TTS commit this notebook targets; it may differ in newer versions\n",
-    "from TTS.speaker_encoder.model import SpeakerEncoder\n",
-    "from TTS.utils.audio import AudioProcessor\n",
-    "from TTS.utils.io import load_config"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "You should also adjust all the path constants to point at the relevant locations on your local machine."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "MODEL_RUN_PATH = \"../../Coqui-TTS/checkpoints/libritts_360-half-September-28-2019_10+46AM-8565c50-20200323T115637Z-001/\"\n",
-    "MODEL_PATH = MODEL_RUN_PATH + \"best_model.pth.tar\"\n",
-    "CONFIG_PATH = MODEL_RUN_PATH + \"config.json\"\n",
-    "\n",
-    "\n",
-    "DATASETS_NAME = ['vctk'] # list the datasets\n",
-    "DATASETS_PATH = ['../../../datasets/VCTK/']\n",
-    "DATASETS_METAFILE = ['']\n",
-    "\n",
-    "USE_CUDA = True"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Preprocess dataset\n",
-    "meta_data = []\n",
-    "for i in range(len(DATASETS_NAME)):\n",
-    "    preprocessor = importlib.import_module('TTS.tts.datasets.preprocess')\n",
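-    "    # each name in DATASETS_NAME must match a preprocessor function defined in that module\n",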
" preprocessor = getattr(preprocessor, DATASETS_NAME[i].lower())\n", - " meta_data += preprocessor(DATASETS_PATH[i],DATASETS_METAFILE[i])\n", - " \n", - "meta_data= list(meta_data)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "c = load_config(CONFIG_PATH)\n", - "ap = AudioProcessor(**c['audio'])\n", - "\n", - "model = SpeakerEncoder(**c.model)\n", - "model.load_state_dict(torch.load(MODEL_PATH)['model'])\n", - "model.eval()\n", - "if USE_CUDA:\n", - " model.cuda()\n", - "\n", - "embeddings_dict = {}\n", - "len_meta_data= len(meta_data)\n", - "\n", - "for i in tqdm(range(len_meta_data)):\n", - " _, wav_file, speaker_id = meta_data[i]\n", - " wav_file_name = os.path.basename(wav_file)\n", - " mel_spec = ap.melspectrogram(ap.load_wav(wav_file)).T\n", - " mel_spec = torch.FloatTensor(mel_spec[None, :, :])\n", - " if USE_CUDA:\n", - " mel_spec = mel_spec.cuda()\n", - " embedd = model.compute_embedding(mel_spec).cpu().detach().numpy().reshape(-1)\n", - " embeddings_dict[wav_file_name] = [embedd,speaker_id]\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# create and export speakers.json\n", - "speaker_mapping = {sample: {'name': embeddings_dict[sample][1], 'embedding':embeddings_dict[sample][0].reshape(-1).tolist()} for i, sample in enumerate(embeddings_dict.keys())}\n", - "save_speaker_mapping(MODEL_RUN_PATH, speaker_mapping)\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#test load integrity\n", - "speaker_mapping_load = load_speaker_mapping(MODEL_RUN_PATH)\n", - "assert speaker_mapping == speaker_mapping_load\n", - "print(\"The file speakers.json has been exported to \",MODEL_RUN_PATH, ' with ', len(embeddings_dict.keys()), ' speakers')" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.5" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -}