{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "This is a noteboook used to generate the speaker embeddings with the CorentinJ GE2E model trained with Angular Prototypical loss for multi-speaker training.\n", "\n", "Before running this script please DON'T FORGET:\n", "- to set the right paths in the cell below.\n", "\n", "Repositories:\n", "- TTS: https://github.com/coqui/TTS\n", "- CorentinJ GE2E: https://github.com/Edresson/GE2E-Speaker-Encoder" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": true }, "outputs": [], "source": [ "import os\n", "import importlib\n", "import random\n", "import librosa\n", "import torch\n", "\n", "import numpy as np\n", "from TTS.utils.io import load_config\n", "from tqdm import tqdm\n", "from TTS.tts.utils.speakers import save_speaker_mapping, load_speaker_mapping\n", "\n", "# you may need to change this depending on your system\n", "os.environ['CUDA_VISIBLE_DEVICES']='0'" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Clone encoder \n", "!git clone https://github.com/CorentinJ/Real-Time-Voice-Cloning.git\n", "os.chdir('Real-Time-Voice-Cloning/')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#Install voxceleb_trainer Requeriments\n", "!python -m pip install umap-learn visdom webrtcvad librosa>=0.5.1 matplotlib>=2.0.2 numpy>=1.14.0 scipy>=1.0.0 tqdm sounddevice Unidecode inflect multiprocess numba" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#Download encoder Checkpoint\n", "!wget https://github.com/Edresson/Real-Time-Voice-Cloning/releases/download/checkpoints/pretrained.zip\n", "!unzip pretrained.zip" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from encoder import inference as encoder\n", "from encoder.params_model import model_embedding_size as speaker_embedding_size\n", "from pathlib import Path" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print(\"Preparing the encoder, the synthesizer and the vocoder...\")\n", "encoder.load_model(Path('encoder/saved_models/pretrained.pt'))\n", "print(\"Testing your configuration with small inputs.\")\n", "# Forward an audio waveform of zeroes that lasts 1 second. Notice how we can get the encoder's\n", "# sampling rate, which may differ.\n", "# If you're unfamiliar with digital audio, know that it is encoded as an array of floats \n", "# (or sometimes integers, but mostly floats in this projects) ranging from -1 to 1.\n", "# The sampling rate is the number of values (samples) recorded per second, it is set to\n", "# 16000 for the encoder. 
, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "SAVE_PATH = '../'" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Set constants\n", "DATASETS_NAME = ['vctk'] # list of dataset names\n", "DATASETS_PATH = ['../../../../../datasets/VCTK-Corpus-removed-silence/']\n", "DATASETS_METAFILE = ['']\n", "USE_CUDA = True" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Preprocess the datasets and extract one embedding per wav file\n", "meta_data = []\n", "for i in range(len(DATASETS_NAME)):\n", "    preprocessor = importlib.import_module('TTS.tts.datasets.preprocess')\n", "    preprocessor = getattr(preprocessor, DATASETS_NAME[i].lower())\n", "    meta_data += preprocessor(DATASETS_PATH[i], DATASETS_METAFILE[i])\n", "\n", "meta_data = list(meta_data)\n", "\n", "embeddings_dict = {}\n", "len_meta_data = len(meta_data)\n", "for i in tqdm(range(len_meta_data)):\n", "    _, wave_file_path, speaker_id = meta_data[i]\n", "    wav_file_name = os.path.basename(wave_file_path)\n", "    # Extract the embedding\n", "    preprocessed_wav = encoder.preprocess_wav(wave_file_path)\n", "    file_embedding = encoder.embed_utterance(preprocessed_wav)\n", "    embeddings_dict[wav_file_name] = [file_embedding.reshape(-1).tolist(), speaker_id]\n", "    del file_embedding" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Create and export speakers.json (the embeddings are already L2-normalized by the encoder)\n", "speaker_mapping = {sample: {'name': embeddings_dict[sample][1], 'embedding': embeddings_dict[sample][0]} for sample in embeddings_dict.keys()}\n", "save_speaker_mapping(SAVE_PATH, speaker_mapping)\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Test the integrity of the exported file\n", "speaker_mapping_load = load_speaker_mapping(SAVE_PATH)\n", "assert speaker_mapping == speaker_mapping_load\n", "print(\"The file speakers.json has been exported to\", SAVE_PATH, \"with\", len(embeddings_dict.keys()), \"samples\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.5" } }, "nbformat": 4, "nbformat_minor": 4 }