mirror of https://github.com/coqui-ai/TTS.git
remove unused notebooks
This commit is contained in:
parent bc5307caa0
commit 5af505ff33
@@ -1,163 +0,0 @@
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "This is a notebook used to generate the speaker embeddings with the AngleProto speaker encoder model for multi-speaker training.\n",
    "\n",
    "Before running this script please DON'T FORGET:\n",
    "- to set the file paths.\n",
    "- to download the related model files from TTS.\n",
    "- to download or clone the related repositories, linked below.\n",
    "- to set up the repositories: ```python setup.py install```\n",
    "- to check out the right commit versions (given next to the model) of TTS.\n",
    "- to set the right paths in the cell below.\n",
    "\n",
    "Repository:\n",
    "- TTS: https://github.com/mozilla/TTS"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%load_ext autoreload\n",
    "%autoreload 2\n",
    "import os\n",
    "import importlib\n",
    "import random\n",
    "import librosa\n",
    "import torch\n",
    "\n",
    "import numpy as np\n",
    "from tqdm import tqdm\n",
    "\n",
    "# NOTE: the SpeakerEncoder import is needed by the model cell below;\n",
    "# the module path is assumed from the TTS version this notebook targets\n",
    "from TTS.speaker_encoder.model import SpeakerEncoder\n",
    "from TTS.tts.utils.speakers import save_speaker_mapping, load_speaker_mapping\n",
    "from TTS.utils.audio import AudioProcessor\n",
    "from TTS.utils.io import load_config\n",
    "\n",
    "# you may need to change this depending on your system\n",
    "os.environ['CUDA_VISIBLE_DEVICES'] = '0'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "You should also adjust all the path constants below to point at the relevant locations on your machine."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "MODEL_RUN_PATH = \"../../Mozilla-TTS/checkpoints/libritts_100+360-angleproto-June-06-2020_04+12PM-9c04d1f/\"\n",
    "MODEL_PATH = MODEL_RUN_PATH + \"best_model.pth.tar\"\n",
    "CONFIG_PATH = MODEL_RUN_PATH + \"config.json\"\n",
    "\n",
    "DATASETS_NAME = ['vctk']  # list the datasets\n",
    "DATASETS_PATH = ['../../../datasets/VCTK/']\n",
    "DATASETS_METAFILE = ['']\n",
    "\n",
    "USE_CUDA = True"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# preprocess the dataset\n",
    "meta_data = []\n",
    "for i in range(len(DATASETS_NAME)):\n",
    "    preprocessor = importlib.import_module('TTS.tts.datasets.preprocess')\n",
    "    preprocessor = getattr(preprocessor, DATASETS_NAME[i].lower())\n",
    "    meta_data += preprocessor(DATASETS_PATH[i], DATASETS_METAFILE[i])\n",
    "\n",
    "meta_data = list(meta_data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "c = load_config(CONFIG_PATH)\n",
    "ap = AudioProcessor(**c['audio'])\n",
    "\n",
    "model = SpeakerEncoder(**c.model)\n",
    "model.load_state_dict(torch.load(MODEL_PATH)['model'])\n",
    "model.eval()\n",
    "if USE_CUDA:\n",
    "    model.cuda()\n",
    "\n",
    "embeddings_dict = {}\n",
    "len_meta_data = len(meta_data)\n",
    "\n",
    "for i in tqdm(range(len_meta_data)):\n",
    "    _, wav_file, speaker_id = meta_data[i]\n",
    "    wav_file_name = os.path.basename(wav_file)\n",
    "    mel_spec = ap.melspectrogram(ap.load_wav(wav_file)).T\n",
    "    mel_spec = torch.FloatTensor(mel_spec[None, :, :])\n",
    "    if USE_CUDA:\n",
    "        mel_spec = mel_spec.cuda()\n",
    "    embedd = model.compute_embedding(mel_spec).cpu().detach().numpy().reshape(-1)\n",
    "    embeddings_dict[wav_file_name] = [embedd, speaker_id]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# create and export speakers.json\n",
    "speaker_mapping = {sample: {'name': embeddings_dict[sample][1], 'embedding': embeddings_dict[sample][0].reshape(-1).tolist()} for sample in embeddings_dict.keys()}\n",
    "save_speaker_mapping(MODEL_RUN_PATH, speaker_mapping)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# test load integrity\n",
    "speaker_mapping_load = load_speaker_mapping(MODEL_RUN_PATH)\n",
    "assert speaker_mapping == speaker_mapping_load\n",
    "print(\"The file speakers.json has been exported to \", MODEL_RUN_PATH, ' with ', len(embeddings_dict.keys()), ' samples')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
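For reference, each of these notebooks exports a `speakers.json` file mapping a wav file name to `{'name': speaker_id, 'embedding': [...]}`. Below is a minimal sketch of consuming that file outside TTS, using only the standard library and NumPy; the file name comes from the notebook's own print statement, and the averaging step is one common way to get a single vector per speaker, not something the notebook itself does:

```python
import json
import numpy as np

# Load the mapping written by save_speaker_mapping: a speakers.json file
# whose values look like {"name": <speaker_id>, "embedding": <list of floats>}.
with open("speakers.json", "r") as f:
    speaker_mapping = json.load(f)

# Group sample-level embeddings by speaker and average them, a common way
# to obtain one conditioning vector per speaker for multi-speaker training.
per_speaker = {}
for sample, entry in speaker_mapping.items():
    per_speaker.setdefault(entry["name"], []).append(np.array(entry["embedding"]))

speaker_embeddings = {name: np.mean(vecs, axis=0) for name, vecs in per_speaker.items()}
print(f"{len(speaker_mapping)} samples, {len(speaker_embeddings)} speakers")
```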
@@ -1,212 +0,0 @@
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "This is a notebook used to generate the speaker embeddings with the CorentinJ GE2E model trained with the Angular Prototypical loss for multi-speaker training.\n",
    "\n",
    "Before running this script please DON'T FORGET:\n",
    "- to set the right paths in the cell below.\n",
    "\n",
    "Repositories:\n",
    "- TTS: https://github.com/coqui-ai/TTS\n",
    "- CorentinJ GE2E: https://github.com/Edresson/GE2E-Speaker-Encoder"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "import os\n",
    "import importlib\n",
    "import random\n",
    "import librosa\n",
    "import torch\n",
    "\n",
    "import numpy as np\n",
    "from tqdm import tqdm\n",
    "from TTS.tts.utils.speakers import save_speaker_mapping, load_speaker_mapping\n",
    "from TTS.utils.io import load_config\n",
    "\n",
    "# you may need to change this depending on your system\n",
    "os.environ['CUDA_VISIBLE_DEVICES'] = '0'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# clone the encoder repository\n",
    "!git clone https://github.com/CorentinJ/Real-Time-Voice-Cloning.git\n",
    "os.chdir('Real-Time-Voice-Cloning/')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# install the encoder requirements\n",
    "# (version specifiers are quoted so the shell does not treat '>' as a redirect)\n",
    "!python -m pip install umap-learn visdom webrtcvad \"librosa>=0.5.1\" \"matplotlib>=2.0.2\" \"numpy>=1.14.0\" \"scipy>=1.0.0\" tqdm sounddevice Unidecode inflect multiprocess numba"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# download the encoder checkpoint\n",
    "!wget https://github.com/Edresson/Real-Time-Voice-Cloning/releases/download/checkpoints/pretrained.zip\n",
    "!unzip pretrained.zip"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pathlib import Path\n",
    "\n",
    "from encoder import inference as encoder\n",
    "from encoder.params_model import model_embedding_size as speaker_embedding_size"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(\"Preparing the encoder, the synthesizer and the vocoder...\")\n",
    "encoder.load_model(Path('encoder/saved_models/pretrained.pt'))\n",
    "print(\"Testing your configuration with small inputs.\")\n",
    "# Forward an audio waveform of zeroes that lasts 1 second. Notice how we can get the encoder's\n",
    "# sampling rate, which may differ.\n",
    "# If you're unfamiliar with digital audio, know that it is encoded as an array of floats\n",
    "# (or sometimes integers, but mostly floats in this project) ranging from -1 to 1.\n",
    "# The sampling rate is the number of values (samples) recorded per second; it is set to\n",
    "# 16000 for the encoder. Creating an array of length <sampling_rate> will always correspond\n",
    "# to an audio of 1 second.\n",
    "print(\"\\tTesting the encoder...\")\n",
    "\n",
    "wav = np.zeros(encoder.sampling_rate)\n",
    "embed = encoder.embed_utterance(wav)\n",
    "print(embed.shape)\n",
    "\n",
    "# Embeddings are L2-normalized (this isn't important here, but if you want to make your own\n",
    "# embeddings it will be).\n",
    "# embed /= np.linalg.norm(embed)  # for a random embedding"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "SAVE_PATH = '../'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# set constants\n",
    "DATASETS_NAME = ['vctk']  # list the datasets\n",
    "DATASETS_PATH = ['../../../../../datasets/VCTK-Corpus-removed-silence/']\n",
    "DATASETS_METAFILE = ['']\n",
    "USE_CUDA = True"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# preprocess the dataset\n",
    "meta_data = []\n",
    "for i in range(len(DATASETS_NAME)):\n",
    "    preprocessor = importlib.import_module('TTS.tts.datasets.preprocess')\n",
    "    preprocessor = getattr(preprocessor, DATASETS_NAME[i].lower())\n",
    "    meta_data += preprocessor(DATASETS_PATH[i], DATASETS_METAFILE[i])\n",
    "\n",
    "meta_data = list(meta_data)\n",
    "\n",
    "embeddings_dict = {}\n",
    "len_meta_data = len(meta_data)\n",
    "for i in tqdm(range(len_meta_data)):\n",
    "    _, wave_file_path, speaker_id = meta_data[i]\n",
    "    wav_file_name = os.path.basename(wave_file_path)\n",
    "    # extract the embedding\n",
    "    preprocessed_wav = encoder.preprocess_wav(wave_file_path)\n",
    "    file_embedding = encoder.embed_utterance(preprocessed_wav)\n",
    "    embeddings_dict[wav_file_name] = [file_embedding.reshape(-1).tolist(), speaker_id]\n",
    "    del file_embedding"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# create and export speakers.json\n",
    "# (embed_utterance already returns L2-normalized embeddings, so no extra normalization is applied)\n",
    "speaker_mapping = {sample: {'name': embeddings_dict[sample][1], 'embedding': embeddings_dict[sample][0]} for sample in embeddings_dict.keys()}\n",
    "save_speaker_mapping(SAVE_PATH, speaker_mapping)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# test load integrity\n",
    "speaker_mapping_load = load_speaker_mapping(SAVE_PATH)\n",
    "assert speaker_mapping == speaker_mapping_load\n",
    "print(\"The file speakers.json has been exported to \", SAVE_PATH, ' with ', len(embeddings_dict.keys()), ' samples')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
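The encoder test cell above notes that `embed_utterance` returns L2-normalized embeddings. A small self-contained sketch of what that property buys you, using a random 256-dimensional stand-in vector (256 matches `model_embedding_size` in CorentinJ's encoder, but here it is only an assumption):

```python
import numpy as np

def l2_normalize(vec: np.ndarray, eps: float = 1e-12) -> np.ndarray:
    """Scale a vector to unit L2 norm; eps guards against division by zero."""
    return vec / max(np.linalg.norm(vec), eps)

# Random stand-ins for utterance embeddings (256 dims assumed, see above).
embed = l2_normalize(np.random.randn(256).astype(np.float32))
other = l2_normalize(np.random.randn(256).astype(np.float32))

# After normalization the norm is 1, so cosine similarity between two
# embeddings reduces to a plain dot product.
assert np.isclose(np.linalg.norm(embed), 1.0, atol=1e-5)
cosine_similarity = float(np.dot(embed, other))
print(cosine_similarity)
```

Because every vector has unit norm, ranking by dot product is the same as ranking by cosine similarity, which is why embeddings can be compared without any extra scaling.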
@@ -1,163 +0,0 @@
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "This is a notebook used to generate the speaker embeddings with the GE2E speaker encoder model for multi-speaker training.\n",
    "\n",
    "Before running this script please DON'T FORGET:\n",
    "- to set the file paths.\n",
    "- to download the related model files from TTS.\n",
    "- to download or clone the related repositories, linked below.\n",
    "- to set up the repositories: ```python setup.py install```\n",
    "- to check out the right commit versions (given next to the model) of TTS.\n",
    "- to set the right paths in the cell below.\n",
    "\n",
    "Repository:\n",
    "- TTS: https://github.com/coqui-ai/TTS"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%load_ext autoreload\n",
    "%autoreload 2\n",
    "import os\n",
    "import importlib\n",
    "import random\n",
    "import librosa\n",
    "import torch\n",
    "\n",
    "import numpy as np\n",
    "from tqdm import tqdm\n",
    "\n",
    "# NOTE: the SpeakerEncoder import is needed by the model cell below;\n",
    "# the module path is assumed from the TTS version this notebook targets\n",
    "from TTS.speaker_encoder.model import SpeakerEncoder\n",
    "from TTS.tts.utils.speakers import save_speaker_mapping, load_speaker_mapping\n",
    "from TTS.utils.audio import AudioProcessor\n",
    "from TTS.utils.io import load_config\n",
    "\n",
    "# you may need to change this depending on your system\n",
    "os.environ['CUDA_VISIBLE_DEVICES'] = '0'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "You should also adjust all the path constants below to point at the relevant locations on your machine."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "MODEL_RUN_PATH = \"../../Coqui-TTS/checkpoints/libritts_360-half-September-28-2019_10+46AM-8565c50-20200323T115637Z-001/\"\n",
    "MODEL_PATH = MODEL_RUN_PATH + \"best_model.pth.tar\"\n",
    "CONFIG_PATH = MODEL_RUN_PATH + \"config.json\"\n",
    "\n",
    "DATASETS_NAME = ['vctk']  # list the datasets\n",
    "DATASETS_PATH = ['../../../datasets/VCTK/']\n",
    "DATASETS_METAFILE = ['']\n",
    "\n",
    "USE_CUDA = True"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# preprocess the dataset\n",
    "meta_data = []\n",
    "for i in range(len(DATASETS_NAME)):\n",
    "    preprocessor = importlib.import_module('TTS.tts.datasets.preprocess')\n",
    "    preprocessor = getattr(preprocessor, DATASETS_NAME[i].lower())\n",
    "    meta_data += preprocessor(DATASETS_PATH[i], DATASETS_METAFILE[i])\n",
    "\n",
    "meta_data = list(meta_data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "c = load_config(CONFIG_PATH)\n",
    "ap = AudioProcessor(**c['audio'])\n",
    "\n",
    "model = SpeakerEncoder(**c.model)\n",
    "model.load_state_dict(torch.load(MODEL_PATH)['model'])\n",
    "model.eval()\n",
    "if USE_CUDA:\n",
    "    model.cuda()\n",
    "\n",
    "embeddings_dict = {}\n",
    "len_meta_data = len(meta_data)\n",
    "\n",
    "for i in tqdm(range(len_meta_data)):\n",
    "    _, wav_file, speaker_id = meta_data[i]\n",
    "    wav_file_name = os.path.basename(wav_file)\n",
    "    mel_spec = ap.melspectrogram(ap.load_wav(wav_file)).T\n",
    "    mel_spec = torch.FloatTensor(mel_spec[None, :, :])\n",
    "    if USE_CUDA:\n",
    "        mel_spec = mel_spec.cuda()\n",
    "    embedd = model.compute_embedding(mel_spec).cpu().detach().numpy().reshape(-1)\n",
    "    embeddings_dict[wav_file_name] = [embedd, speaker_id]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# create and export speakers.json\n",
    "speaker_mapping = {sample: {'name': embeddings_dict[sample][1], 'embedding': embeddings_dict[sample][0].reshape(-1).tolist()} for sample in embeddings_dict.keys()}\n",
    "save_speaker_mapping(MODEL_RUN_PATH, speaker_mapping)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# test load integrity\n",
    "speaker_mapping_load = load_speaker_mapping(MODEL_RUN_PATH)\n",
    "assert speaker_mapping == speaker_mapping_load\n",
    "print(\"The file speakers.json has been exported to \", MODEL_RUN_PATH, ' with ', len(embeddings_dict.keys()), ' samples')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
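Both this notebook and the AngleProto one run the embedding loop with autograd enabled. A minimal refactoring sketch that wraps the same TTS calls from the cells above (`ap.load_wav`, `ap.melspectrogram`, `model.compute_embedding`) in a reusable function under `torch.no_grad()`; the function name and signature are illustrative, not part of TTS:

```python
import torch

def extract_embedding(model, ap, wav_file, use_cuda=True):
    """Compute one utterance embedding with the notebook's SpeakerEncoder
    `model` and AudioProcessor `ap` (both defined in the cells above)."""
    # mel-spectrogram shaped [1, T, num_mels], exactly as in the notebook's loop
    mel_spec = ap.melspectrogram(ap.load_wav(wav_file)).T
    mel_spec = torch.FloatTensor(mel_spec[None, :, :])
    if use_cuda:
        mel_spec = mel_spec.cuda()
    with torch.no_grad():  # inference only: skip building the autograd graph
        embed = model.compute_embedding(mel_spec)
    return embed.cpu().numpy().reshape(-1)
```

Used per file, it is a drop-in for the loop body: `embeddings_dict[wav_file_name] = [extract_embedding(model, ap, wav_file), speaker_id]`.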