From ad570f3cab816bd5245a15788b890a72dd11b8a5 Mon Sep 17 00:00:00 2001
From: Edresson
Date: Thu, 30 Jul 2020 23:29:27 -0300
Subject: [PATCH] Add Jupyter notebook to extract speaker embeddings per sample using GE2E

---
 ...- ExtractSpeakerEmbeddings-by-sample.ipynb | 163 ++++++++++++++++++
 1 file changed, 163 insertions(+)
 create mode 100644 notebooks/GE2E-Speaker_Encoder- ExtractSpeakerEmbeddings-by-sample.ipynb

diff --git a/notebooks/GE2E-Speaker_Encoder- ExtractSpeakerEmbeddings-by-sample.ipynb b/notebooks/GE2E-Speaker_Encoder- ExtractSpeakerEmbeddings-by-sample.ipynb
new file mode 100644
index 00000000..612ec146
--- /dev/null
+++ b/notebooks/GE2E-Speaker_Encoder- ExtractSpeakerEmbeddings-by-sample.ipynb
@@ -0,0 +1,163 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "This is a notebook used to generate speaker embeddings with the GE2E speaker encoder model for multi-speaker training.\n",
+    "\n",
+    "Before running this notebook please DON'T FORGET:\n",
+    "- to set the file paths.\n",
+    "- to download the related model files from TTS.\n",
+    "- to download or clone the related repos, linked below.\n",
+    "- to set up the repositories: ```python setup.py install```\n",
+    "- to check out the right commit version of TTS (given next to the model).\n",
+    "- to set the right paths in the cell below.\n",
+    "\n",
+    "Repository:\n",
+    "- TTS: https://github.com/mozilla/TTS"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%load_ext autoreload\n",
+    "%autoreload 2\n",
+    "import os\n",
+    "import importlib\n",
+    "import random\n",
+    "import librosa\n",
+    "import torch\n",
+    "\n",
+    "import numpy as np\n",
+    "from tqdm import tqdm\n",
+    "from TTS.utils.speakers import save_speaker_mapping, load_speaker_mapping\n",
+    "\n",
+    "# you may need to change this depending on your system\n",
+    "os.environ['CUDA_VISIBLE_DEVICES'] = '0'\n",
+    "\n",
+    "\n",
+    "from TTS.speaker_encoder.model import SpeakerEncoder\n",
+    "from TTS.utils.audio import AudioProcessor\n",
+    "from TTS.utils.generic_utils import load_config"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "You should also adjust all the path constants below to point at the relevant locations on your machine."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "MODEL_RUN_PATH = \"../../Mozilla-TTS/checkpoints/libritts_360-half-September-28-2019_10+46AM-8565c50-20200323T115637Z-001/\"\n",
+    "MODEL_PATH = MODEL_RUN_PATH + \"best_model.pth.tar\"\n",
+    "CONFIG_PATH = MODEL_RUN_PATH + \"config.json\"\n",
+    "\n",
+    "\n",
+    "DATASETS_NAME = ['brspeech']  # dataset names; each must match a preprocessor in TTS.datasets.preprocess\n",
+    "DATASETS_PATH = ['../../../datasets/BRSpeech-2.0-beta8']\n",
+    "DATASETS_METAFILE = ['TTS_metadata_brspeech2+cv_all_valited_lines.csv']\n",
+    "\n",
+    "USE_CUDA = True"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# preprocess the datasets: collect (text, wav_file, speaker_id) tuples\n",
+    "meta_data = []\n",
+    "preprocess_module = importlib.import_module('TTS.datasets.preprocess')\n",
+    "for i in range(len(DATASETS_NAME)):\n",
+    "    preprocessor = getattr(preprocess_module, DATASETS_NAME[i].lower())\n",
+    "    meta_data += preprocessor(DATASETS_PATH[i], DATASETS_METAFILE[i])\n",
+    "\n",
+    "meta_data = list(meta_data)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "c = load_config(CONFIG_PATH)\n",
+    "ap = AudioProcessor(**c['audio'])\n",
+    "\n",
+    "model = SpeakerEncoder(**c.model)\n",
+    "model.load_state_dict(torch.load(MODEL_PATH)['model'])\n",
+    "model.eval()\n",
+    "if USE_CUDA:\n",
+    "    model.cuda()\n",
+    "\n",
+    "embeddings_dict = {}\n",
+    "len_meta_data = len(meta_data)\n",
+    "\n",
+    "for i in tqdm(range(len_meta_data)):\n",
+    "    _, wav_file, speaker_id = meta_data[i]\n",
+    "    wav_file_name = os.path.basename(wav_file)\n",
+    "    mel_spec = ap.melspectrogram(ap.load_wav(wav_file)).T\n",
+    "    mel_spec = torch.FloatTensor(mel_spec[None, :, :])\n",
+    "    if USE_CUDA:\n",
+    "        mel_spec = mel_spec.cuda()\n",
+    "    embedding = model.compute_embedding(mel_spec).cpu().detach().numpy().reshape(-1)\n",
+    "    embeddings_dict[wav_file_name] = [embedding, speaker_id]\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# create and export speakers.json\n",
+    "speaker_mapping = {sample: {'name': speaker_id, 'embedding': embedding.tolist()} for sample, (embedding, speaker_id) in embeddings_dict.items()}\n",
+    "save_speaker_mapping(MODEL_RUN_PATH, speaker_mapping)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# test load integrity\n",
+    "speaker_mapping_load = load_speaker_mapping(MODEL_RUN_PATH)\n",
+    "assert speaker_mapping == speaker_mapping_load\n",
+    "print(\"The file speakers.json has been exported to\", MODEL_RUN_PATH, \"with\", len(speaker_mapping), \"samples\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
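
For reference, a minimal sketch of how the exported speakers.json might be consumed downstream for multi-speaker training. It relies only on the `load_speaker_mapping` helper and the mapping layout used in the notebook above; the wav file key `sample_0001.wav` is hypothetical:

```python
import numpy as np
from TTS.utils.speakers import load_speaker_mapping

# folder that holds speakers.json (same constant as in the notebook)
MODEL_RUN_PATH = "../../Mozilla-TTS/checkpoints/libritts_360-half-September-28-2019_10+46AM-8565c50-20200323T115637Z-001/"

# reads speakers.json back from the run folder, as in the integrity check above
speaker_mapping = load_speaker_mapping(MODEL_RUN_PATH)

# entries are keyed by wav file name; each holds the speaker name and the
# GE2E embedding serialized as a plain list of floats
entry = speaker_mapping["sample_0001.wav"]  # hypothetical key
embedding = np.array(entry["embedding"], dtype=np.float32)
print(entry["name"], embedding.shape)
```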