From 9d28f311689018e81998355f029f520890a19a6f Mon Sep 17 00:00:00 2001
From: Edresson
Date: Thu, 30 Jul 2020 23:24:37 -0300
Subject: [PATCH] add Colab Notebook for TTS Multi-Speaker with GST

---
 ...MultiSpeaker_jia_et_al_2018_With_GST.ipynb | 834 ++++++++++++++++++
 1 file changed, 834 insertions(+)
 create mode 100755 notebooks/Demo_Mozilla_TTS_MultiSpeaker_jia_et_al_2018_With_GST.ipynb

diff --git a/notebooks/Demo_Mozilla_TTS_MultiSpeaker_jia_et_al_2018_With_GST.ipynb b/notebooks/Demo_Mozilla_TTS_MultiSpeaker_jia_et_al_2018_With_GST.ipynb
new file mode 100755
index 00000000..e059461e
--- /dev/null
+++ b/notebooks/Demo_Mozilla_TTS_MultiSpeaker_jia_et_al_2018_With_GST.ipynb
@@ -0,0 +1,834 @@
+{
+ "nbformat": 4,
+ "nbformat_minor": 0,
+ "metadata": {
+  "colab": {
+   "name": "Demo-Mozilla-TTS-MultiSpeaker-jia-et-al-2018-With-GST.ipynb",
+   "provenance": [],
+   "collapsed_sections": [
+    "yZK6UdwSFnOO",
+    "ENA2OumIVeMA",
+    "dV6cXXlfi72r",
+    "vnV-FigfvsS2",
+    "g_G_HweN04W-",
+    "LEE6mQLh5Who"
+   ],
+   "toc_visible": true
+  },
+  "kernelspec": {
+   "name": "python3",
+   "display_name": "Python 3"
+  },
+  "accelerator": "GPU"
+ },
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "yZK6UdwSFnOO",
+    "colab_type": "text"
+   },
+   "source": [
+    "# **Download and install Mozilla TTS**"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "metadata": {
+    "id": "yvb0pX3WY6MN",
+    "colab_type": "code",
+    "colab": {}
+   },
+   "source": [
+    "import os\n",
+    "!git clone https://github.com/Edresson/TTS -b dev-gst-embeddings"
+   ],
+   "execution_count": null,
+   "outputs": []
+  },
+  {
+   "cell_type": "code",
+   "metadata": {
+    "id": "iB9nl2UEG3SY",
+    "colab_type": "code",
+    "colab": {}
+   },
+   "source": [
+    "!apt-get install espeak\n",
+    "os.chdir('TTS')\n",
+    "!pip install -r requirements.txt\n",
+    "!python setup.py develop\n",
+    "os.chdir('..')"
+   ],
+   "execution_count": null,
+   "outputs": []
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "w6Krn8k1inC_",
+    "colab_type": "text"
+   },
+   "source": [
+    "**Download Checkpoint**"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "metadata": {
+    "id": "PiYHf3lKhi9z",
+    "colab_type": "code",
+    "colab": {}
+   },
+   "source": [
+    "!wget -c -q --show-progress -O ./TTS-checkpoint.zip https://github.com/Edresson/TTS/releases/download/v1.0.0/Checkpoints-TTS-MultiSpeaker-Jia-et-al-2018-with-GST.zip\n",
+    "!unzip ./TTS-checkpoint.zip\n",
+    "\n",
+    "# Download the GST style example\n",
+    "!wget https://github.com/Edresson/TTS/releases/download/v1.0.0/gst-style-example.wav"
+   ],
+   "execution_count": null,
+   "outputs": []
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "MpYNgqrZcJKn",
+    "colab_type": "text"
+   },
+   "source": [
+    "**Utility Functions**"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "metadata": {
+    "id": "4KZA4b_CbMqx",
+    "colab_type": "code",
+    "colab": {}
+   },
+   "source": [
+    "%load_ext autoreload\n",
+    "%autoreload 2\n",
+    "import argparse\n",
+    "import json\n",
+    "# pylint: disable=redefined-outer-name, unused-argument\n",
+    "import os\n",
+    "import string\n",
+    "import time\n",
+    "import sys\n",
+    "import numpy as np\n",
+    "\n",
+    "TTS_PATH = \"../content/TTS\"\n",
+    "# add libraries into environment\n",
+    "sys.path.append(TTS_PATH) # set this if TTS is not installed globally\n",
+    "\n",
+    "import torch\n",
+    "\n",
+    "from TTS.tts.utils.generic_utils import setup_model\n",
+    "from TTS.tts.utils.synthesis import synthesis\n",
+    "from TTS.tts.utils.text.symbols import make_symbols, phonemes, symbols\n",
+    "from TTS.utils.audio import AudioProcessor\n",
+    "from TTS.utils.io import load_config\n",
+    "from TTS.vocoder.utils.generic_utils import setup_generator\n",
+    "\n",
+    "\n",
+    "def tts(model, vocoder_model, text, CONFIG, use_cuda, ap, use_gl, speaker_fileid, speaker_embedding=None, gst_style=None):\n",
+    "    t_1 = time.time()\n",
+    "    waveform, _, _, mel_postnet_spec, _, _ = synthesis(model, text, CONFIG, use_cuda, ap, speaker_fileid, gst_style, False, CONFIG.enable_eos_bos_chars, use_gl, speaker_embedding=speaker_embedding)\n",
+    "    if CONFIG.model == \"Tacotron\" and not use_gl:\n",
+    "        # Tacotron outputs a linear spectrogram; convert it to mel for the vocoder\n",
+    "        mel_postnet_spec = ap.out_linear_to_mel(mel_postnet_spec.T).T\n",
+    "    if not use_gl:\n",
+    "        # vocode the mel spectrogram with the neural vocoder\n",
+    "        waveform = vocoder_model.inference(torch.FloatTensor(mel_postnet_spec.T).unsqueeze(0))\n",
+    "    if use_cuda and not use_gl:\n",
+    "        waveform = waveform.cpu()\n",
+    "    if not use_gl:\n",
+    "        waveform = waveform.numpy()\n",
+    "    waveform = waveform.squeeze()\n",
+    "    rtf = (time.time() - t_1) / (len(waveform) / ap.sample_rate)\n",
+    "    tps = (time.time() - t_1) / len(waveform)\n",
+    "    print(\" > Run-time: {}\".format(time.time() - t_1))\n",
+    "    print(\" > Real-time factor: {}\".format(rtf))\n",
+    "    print(\" > Time per step: {}\".format(tps))\n",
+    "    return waveform\n",
+    "\n"
+   ],
+   "execution_count": null,
+   "outputs": []
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "ENA2OumIVeMA",
+    "colab_type": "text"
+   },
+   "source": [
+    "# **Variable definitions**\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "metadata": {
+    "id": "jPD0d_XpVXmY",
+    "colab_type": "code",
+    "colab": {}
+   },
+   "source": [
+    "TEXT = ''\n",
+    "OUT_PATH = 'tests-audios/'\n",
+    "# create output path\n",
+    "os.makedirs(OUT_PATH, exist_ok=True)\n",
+    "\n",
+    "SPEAKER_FILEID = None # if None, use the first embedding from speakers.json\n",
+    "\n",
+    "# model vars\n",
+    "MODEL_PATH = 'best_model.pth.tar'\n",
+    "CONFIG_PATH = 'config.json'\n",
+    "SPEAKER_JSON = 'speakers.json'\n",
+    "\n",
+    "# vocoder vars\n",
+    "VOCODER_PATH = ''\n",
+    "VOCODER_CONFIG_PATH = ''\n",
+    "\n",
+    "USE_CUDA = True"
+   ],
+   "execution_count": null,
+   "outputs": []
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "dV6cXXlfi72r",
+    "colab_type": "text"
+   },
+   "source": [
+    "# **Restore TTS Model**"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "metadata": {
+    "id": "x1WgLFauWUPe",
+    "colab_type": "code",
+    "colab": {}
+   },
+   "source": [
+    "# load the config\n",
+    "C = load_config(CONFIG_PATH)\n",
+    "C.forward_attn_mask = True\n",
+    "\n",
+    "# load the audio processor\n",
+    "ap = AudioProcessor(**C.audio)\n",
+    "\n",
+    "# if the vocabulary was passed, replace the default\n",
+    "if 'characters' in C.keys():\n",
+    "    symbols, phonemes = make_symbols(**C.characters)\n",
+    "\n",
+    "speaker_embedding = None\n",
+    "speaker_embedding_dim = None\n",
+    "num_speakers = 0\n",
+    "# load speakers\n",
+    "if SPEAKER_JSON != '':\n",
+    "    speaker_mapping = json.load(open(SPEAKER_JSON, 'r'))\n",
+    "    num_speakers = len(speaker_mapping)\n",
+    "    if C.use_external_speaker_embedding_file:\n",
+    "        if SPEAKER_FILEID is not None:\n",
+    "            speaker_embedding = speaker_mapping[SPEAKER_FILEID]['embedding']\n",
+    "        else: # if SPEAKER_FILEID is not specified, use the first sample in speakers.json\n",
+    "            chosen_speaker = list(speaker_mapping.keys())[0]\n",
+    "            print(\" Speaker: \", chosen_speaker.split('_')[0], 'was chosen automatically', \"(this speaker was seen in training)\")\n",
+    "            speaker_embedding = speaker_mapping[chosen_speaker]['embedding']\n",
+    "        speaker_embedding_dim = len(speaker_embedding)\n",
+    "\n",
+    "# load the model\n",
"num_chars = len(phonemes) if C.use_phonemes else len(symbols)\n", + "model = setup_model(num_chars, num_speakers, C, speaker_embedding_dim)\n", + "cp = torch.load(MODEL_PATH, map_location=torch.device('cpu'))\n", + "model.load_state_dict(cp['model'])\n", + "model.eval()\n", + "\n", + "if USE_CUDA:\n", + " model.cuda()\n", + "\n", + "model.decoder.set_r(cp['r'])\n", + "\n", + "# load vocoder model\n", + "if VOCODER_PATH!= \"\":\n", + " VC = load_config(VOCODER_CONFIG_PATH)\n", + " vocoder_model = setup_generator(VC)\n", + " vocoder_model.load_state_dict(torch.load(VOCODER_PATH, map_location=\"cpu\")[\"model\"])\n", + " vocoder_model.remove_weight_norm()\n", + " if USE_CUDA:\n", + " vocoder_model.cuda()\n", + " vocoder_model.eval()\n", + "else:\n", + " vocoder_model = None\n", + " VC = None\n", + "\n", + "# synthesize voice\n", + "use_griffin_lim = VOCODER_PATH== \"\"\n", + "\n", + "if not C.use_external_speaker_embedding_file:\n", + " if SPEAKER_FILEID.isdigit():\n", + " SPEAKER_FILEID = int(SPEAKER_FILEID)\n", + " else:\n", + " SPEAKER_FILEID = None\n", + "else:\n", + " SPEAKER_FILEID = None\n" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "tNvVEoE30qY6", + "colab_type": "text" + }, + "source": [ + "Synthesize sentence with Speaker\n", + "\n", + "> Stop running the cell to leave!\n", + "\n" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "2o8fXkVSyXOa", + "colab_type": "code", + "colab": {} + }, + "source": [ + "import IPython\n", + "from IPython.display import Audio\n", + "print(\"Synthesize sentence with Speaker: \",choise_speaker.split('_')[0], \"(this speaker seen in training)\")\n", + "gst_style = 'gst-style-example.wav'\n", + "while True:\n", + " TEXT = input(\"Enter sentence: \")\n", + " print(\" > Text: {}\".format(TEXT))\n", + " wav = tts(model, vocoder_model, TEXT, C, USE_CUDA, ap, use_griffin_lim, SPEAKER_FILEID, speaker_embedding=speaker_embedding, gst_style=gst_style)\n", + " IPython.display.display(Audio(wav, rate=ap.sample_rate))\n", + " # save the results\n", + " file_name = TEXT.replace(\" \", \"_\")\n", + " file_name = file_name.translate(\n", + " str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav'\n", + " out_path = os.path.join(OUT_PATH, file_name)\n", + " print(\" > Saving output to {}\".format(out_path))\n", + " ap.save_wav(wav, out_path)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "vnV-FigfvsS2", + "colab_type": "text" + }, + "source": [ + "# **Select Speaker**\n", + "\n" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "RuCGOnJ_fgDV", + "colab_type": "code", + "colab": {} + }, + "source": [ + "\n", + "# VCTK speakers not seen in training (new speakers)\n", + "VCTK_test_Speakers = [\"p225\", \"p234\", \"p238\", \"p245\", \"p248\", \"p261\", \"p294\", \"p302\", \"p326\", \"p335\", \"p347\"]\n", + "\n", + "# VCTK speakers seen in training\n", + "VCTK_train_Speakers = ['p244', 'p300', 'p303', 'p273', 'p292', 'p252', 'p254', 'p269', 'p345', 'p274', 'p363', 'p285', 'p351', 'p361', 'p295', 'p266', 'p307', 'p230', 'p339', 'p253', 'p310', 'p241', 'p256', 'p323', 'p237', 'p229', 'p298', 'p336', 'p276', 'p305', 'p255', 'p278', 'p299', 'p265', 'p267', 'p280', 'p260', 'p272', 'p262', 'p334', 'p283', 'p247', 'p246', 'p374', 'p297', 'p249', 'p250', 'p304', 'p240', 'p236', 'p312', 'p286', 'p263', 'p258', 'p313', 'p376', 'p279', 'p340', 'p362', 'p284', 'p231', 'p308', 'p277', 'p275', 'p333', 'p314', 'p330', 'p264', 
+    "VCTK_train_Speakers = ['p244', 'p300', 'p303', 'p273', 'p292', 'p252', 'p254', 'p269', 'p345', 'p274', 'p363', 'p285', 'p351', 'p361', 'p295', 'p266', 'p307', 'p230', 'p339', 'p253', 'p310', 'p241', 'p256', 'p323', 'p237', 'p229', 'p298', 'p336', 'p276', 'p305', 'p255', 'p278', 'p299', 'p265', 'p267', 'p280', 'p260', 'p272', 'p262', 'p334', 'p283', 'p247', 'p246', 'p374', 'p297', 'p249', 'p250', 'p304', 'p240', 'p236', 'p312', 'p286', 'p263', 'p258', 'p313', 'p376', 'p279', 'p340', 'p362', 'p284', 'p231', 'p308', 'p277', 'p275', 'p333', 'p314', 'p330', 'p264', 'p226', 'p288', 'p343', 'p239', 'p232', 'p268', 'p270', 'p329', 'p227', 'p271', 'p228', 'p311', 'p301', 'p293', 'p364', 'p251', 'p317', 'p360', 'p281', 'p243', 'p287', 'p233', 'p259', 'p316', 'p257', 'p282', 'p306', 'p341', 'p318']\n",
+    "\n",
+    "num_samples_speaker = 2  # In theory, the more embedding samples per speaker, the closer the synthesized voice will be to the real one.\n"
+   ],
+   "execution_count": null,
+   "outputs": []
+  },
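+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "avgEmbHelperMd",
+    "colab_type": "text"
+   },
+   "source": [
+    "The two examples below repeat the same embedding-averaging loop. As an optional convenience, the next cell is a small sketch (not part of the original notebook) that wraps that logic in a helper; `average_speaker_embedding` is a name introduced here for illustration, and it only assumes the `speakers.json` mapping already loaded above."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "metadata": {
+    "id": "avgEmbHelper",
+    "colab_type": "code",
+    "colab": {}
+   },
+   "source": [
+    "# Optional helper (illustrative sketch): average up to `num_samples` embeddings\n",
+    "# stored in `speaker_mapping` whose keys contain `speaker_id`.\n",
+    "def average_speaker_embedding(speaker_mapping, speaker_id, num_samples=2):\n",
+    "    embeddings = []\n",
+    "    for key in speaker_mapping.keys():\n",
+    "        if speaker_id in key and len(embeddings) < num_samples:\n",
+    "            embeddings.append(speaker_mapping[key]['embedding'])\n",
+    "    # element-wise mean over the collected embedding vectors\n",
+    "    return np.mean(np.array(embeddings), axis=0).tolist()\n",
+    "\n",
+    "# usage example:\n",
+    "# speaker_embedding = average_speaker_embedding(speaker_mapping, VCTK_train_Speakers[0], num_samples_speaker)"
+   ],
+   "execution_count": null,
+   "outputs": []
+  },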
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "hkvv7gRcx4WV",
+    "colab_type": "text"
+   },
+   "source": [
+    "## **Example: select a VCTK speaker seen in training**"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "metadata": {
+    "id": "BviNMI9UyCYz",
+    "colab_type": "code",
+    "colab": {}
+   },
+   "source": [
+    "# get embedding\n",
+    "Speaker_choice = VCTK_train_Speakers[0] # choose one of the training speakers\n",
+    "# load speakers\n",
+    "if SPEAKER_JSON != '':\n",
+    "    speaker_mapping = json.load(open(SPEAKER_JSON, 'r'))\n",
+    "    if C.use_external_speaker_embedding_file:\n",
+    "        speaker_embeddings = []\n",
+    "        for key in list(speaker_mapping.keys()):\n",
+    "            if Speaker_choice in key:\n",
+    "                if len(speaker_embeddings) < num_samples_speaker:\n",
+    "                    speaker_embeddings.append(speaker_mapping[key]['embedding'])\n",
+    "        # take the average of the speaker's embedding samples\n",
+    "        speaker_embedding = np.mean(np.array(speaker_embeddings), axis=0).tolist()"
+   ],
+   "execution_count": null,
+   "outputs": []
+  },
+  {
+   "cell_type": "code",
+   "metadata": {
+    "id": "5e5_XnLsx3jg",
+    "colab_type": "code",
+    "colab": {}
+   },
+   "source": [
+    "import IPython\n",
+    "from IPython.display import Audio\n",
+    "print(\"Synthesize sentence with Speaker: \", Speaker_choice.split('_')[0], \"(this speaker was seen in training)\")\n",
+    "gst_style = 'gst-style-example.wav'\n",
+    "while True:\n",
+    "    TEXT = input(\"Enter sentence: \")\n",
+    "    print(\" > Text: {}\".format(TEXT))\n",
+    "    wav = tts(model, vocoder_model, TEXT, C, USE_CUDA, ap, use_griffin_lim, SPEAKER_FILEID, speaker_embedding=speaker_embedding, gst_style=gst_style)\n",
+    "    IPython.display.display(Audio(wav, rate=ap.sample_rate))\n",
+    "    # save the results\n",
+    "    file_name = TEXT.replace(\" \", \"_\")\n",
+    "    file_name = file_name.translate(\n",
+    "        str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav'\n",
+    "    out_path = os.path.join(OUT_PATH, file_name)\n",
+    "    print(\" > Saving output to {}\".format(out_path))\n",
+    "    ap.save_wav(wav, out_path)"
+   ],
+   "execution_count": null,
+   "outputs": []
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "colab_type": "text",
+    "id": "QJ6VgT2a4vHW"
+   },
+   "source": [
+    "## **Example: select a VCTK speaker not seen in training (new speakers)**\n",
+    "\n",
+    "> Fitting new speakers :)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "metadata": {
+    "colab_type": "code",
+    "id": "SZS57ZK-4vHa",
+    "colab": {}
+   },
+   "source": [
+    "# get embedding\n",
+    "Speaker_choice = VCTK_test_Speakers[0] # choose one of the test (unseen) speakers\n",
+    "# load speakers\n",
+    "if SPEAKER_JSON != '':\n",
+    "    speaker_mapping = json.load(open(SPEAKER_JSON, 'r'))\n",
+    "    if C.use_external_speaker_embedding_file:\n",
+    "        speaker_embeddings = []\n",
+    "        for key in list(speaker_mapping.keys()):\n",
+    "            if Speaker_choice in key:\n",
+    "                if len(speaker_embeddings) < num_samples_speaker:\n",
+    "                    speaker_embeddings.append(speaker_mapping[key]['embedding'])\n",
+    "        # take the average of the speaker's embedding samples\n",
+    "        speaker_embedding = np.mean(np.array(speaker_embeddings), axis=0).tolist()"
+   ],
+   "execution_count": null,
+   "outputs": []
+  },
+  {
+   "cell_type": "code",
+   "metadata": {
+    "colab_type": "code",
+    "id": "bbs85vzz4vHo",
+    "colab": {}
+   },
+   "source": [
+    "import IPython\n",
+    "from IPython.display import Audio\n",
+    "print(\"Synthesize sentence with Speaker: \", Speaker_choice.split('_')[0], \"(this speaker was not seen in training; new speaker)\")\n",
+    "gst_style = 'gst-style-example.wav'\n",
+    "while True:\n",
+    "    TEXT = input(\"Enter sentence: \")\n",
+    "    print(\" > Text: {}\".format(TEXT))\n",
+    "    wav = tts(model, vocoder_model, TEXT, C, USE_CUDA, ap, use_griffin_lim, SPEAKER_FILEID, speaker_embedding=speaker_embedding, gst_style=gst_style)\n",
+    "    IPython.display.display(Audio(wav, rate=ap.sample_rate))\n",
+    "    # save the results\n",
+    "    file_name = TEXT.replace(\" \", \"_\")\n",
+    "    file_name = file_name.translate(\n",
+    "        str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav'\n",
+    "    out_path = os.path.join(OUT_PATH, file_name)\n",
+    "    print(\" > Saving output to {}\".format(out_path))\n",
+    "    ap.save_wav(wav, out_path)"
+   ],
+   "execution_count": null,
+   "outputs": []
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "g_G_HweN04W-",
+    "colab_type": "text"
+   },
+   "source": [
+    "# **Changing GST tokens manually (without a wav reference)**"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "jyFP5syW2bjt",
+    "colab_type": "text"
+   },
+   "source": [
+    "You can set the token weights manually to strengthen or weaken the effect of a given GST token. For example, if a token controls the length of the speaker's pauses, increasing its weight produces longer pauses and decreasing it produces shorter ones."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "metadata": {
+    "id": "SpwjDjCM2a3Y",
+    "colab_type": "code",
+    "colab": {}
+   },
+   "source": [
+    "# set the GST token weights manually (this model has 5 tokens)\n",
+    "gst_style = {\"0\": 0, \"1\": 0, \"3\": 0, \"4\": 0}"
+   ],
+   "execution_count": null,
+   "outputs": []
+  },
+  {
+   "cell_type": "code",
+   "metadata": {
+    "id": "qWChMbI_0z5X",
+    "colab_type": "code",
+    "colab": {}
+   },
+   "source": [
+    "import IPython\n",
+    "from IPython.display import Audio\n",
+    "print(\"Synthesize sentence with Speaker: \", Speaker_choice.split('_')[0], \"(this speaker was not seen in training; new speaker)\")\n",
+    "TEXT = input(\"Enter sentence: \")\n",
+    "print(\" > Text: {}\".format(TEXT))\n",
+    "wav = tts(model, vocoder_model, TEXT, C, USE_CUDA, ap, use_griffin_lim, SPEAKER_FILEID, speaker_embedding=speaker_embedding, gst_style=gst_style)\n",
+    "IPython.display.display(Audio(wav, rate=ap.sample_rate))\n",
+    "# save the results\n",
+    "file_name = TEXT.replace(\" \", \"_\")\n",
+    "file_name = file_name.translate(\n",
+    "    str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav'\n",
+    "out_path = os.path.join(OUT_PATH, file_name)\n",
+    "print(\" > Saving output to {}\".format(out_path))\n",
+    "ap.save_wav(wav, out_path)"
+   ],
+   "execution_count": null,
+   "outputs": []
+  },
+  {
+   "cell_type": "code",
+   "metadata": {
+    "id": "uFjUi9xQ3mG3",
+    "colab_type": "code",
+    "colab": {}
+   },
+   "source": [
+    "# increase the weight of token 0\n",
+    "gst_style = {\"0\": 0.9, \"1\": 0, \"3\": 0, \"4\": 0}\n",
+    "print(\"Synthesize sentence with Speaker: \", Speaker_choice.split('_')[0], \"(this speaker was not seen in training; new speaker)\")\n",
+    "TEXT = input(\"Enter sentence: \")\n",
+    "print(\" > Text: {}\".format(TEXT))\n",
+    "wav = tts(model, vocoder_model, TEXT, C, USE_CUDA, ap, use_griffin_lim, SPEAKER_FILEID, speaker_embedding=speaker_embedding, gst_style=gst_style)\n",
+    "IPython.display.display(Audio(wav, rate=ap.sample_rate))\n",
+    "# save the results\n",
+    "file_name = TEXT.replace(\" \", \"_\")\n",
+    "file_name = file_name.translate(\n",
+    "    str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav'\n",
+    "out_path = os.path.join(OUT_PATH, file_name)\n",
+    "print(\" > Saving output to {}\".format(out_path))\n",
+    "ap.save_wav(wav, out_path)"
+   ],
+   "execution_count": null,
+   "outputs": []
+  },
+  {
+   "cell_type": "code",
+   "metadata": {
+    "id": "Uw0d6gWg4L27",
+    "colab_type": "code",
+    "colab": {}
+   },
+   "source": [
+    "# decrease the weight of token 0\n",
+    "gst_style = {\"0\": -0.9, \"1\": 0, \"3\": 0, \"4\": 0}\n",
+    "print(\"Synthesize sentence with Speaker: \", Speaker_choice.split('_')[0], \"(this speaker was not seen in training; new speaker)\")\n",
+    "TEXT = input(\"Enter sentence: \")\n",
+    "print(\" > Text: {}\".format(TEXT))\n",
+    "wav = tts(model, vocoder_model, TEXT, C, USE_CUDA, ap, use_griffin_lim, SPEAKER_FILEID, speaker_embedding=speaker_embedding, gst_style=gst_style)\n",
+    "IPython.display.display(Audio(wav, rate=ap.sample_rate))\n",
+    "# save the results\n",
+    "file_name = TEXT.replace(\" \", \"_\")\n",
+    "file_name = file_name.translate(\n",
+    "    str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav'\n",
+    "out_path = os.path.join(OUT_PATH, file_name)\n",
+    "print(\" > Saving output to {}\".format(out_path))\n",
+    "ap.save_wav(wav, out_path)"
+   ],
+   "execution_count": null,
+   "outputs": []
+  },
+  {
+   "cell_type": "code",
+   "metadata": {
+    "id": "V9izw4-54-Tl",
+    "colab_type": "code",
+    "colab": {}
+   },
+   "source": [
+    "# increase the weight of token 1\n",
+    "gst_style = {\"0\": 0, \"1\": 0.9, \"3\": 0, \"4\": 0}\n",
+    "print(\"Synthesize sentence with Speaker: \", Speaker_choice.split('_')[0], \"(this speaker was not seen in training; new speaker)\")\n",
+    "TEXT = input(\"Enter sentence: \")\n",
+    "print(\" > Text: {}\".format(TEXT))\n",
+    "wav = tts(model, vocoder_model, TEXT, C, USE_CUDA, ap, use_griffin_lim, SPEAKER_FILEID, speaker_embedding=speaker_embedding, gst_style=gst_style)\n",
+    "IPython.display.display(Audio(wav, rate=ap.sample_rate))\n",
+    "# save the results\n",
+    "file_name = TEXT.replace(\" \", \"_\")\n",
+    "file_name = file_name.translate(\n",
+    "    str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav'\n",
+    "out_path = os.path.join(OUT_PATH, file_name)\n",
+    "print(\" > Saving output to {}\".format(out_path))\n",
+    "ap.save_wav(wav, out_path)"
+   ],
+   "execution_count": null,
+   "outputs": []
+  },
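+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "gstTokenSweepMd",
+    "colab_type": "text"
+   },
+   "source": [
+    "To hear what a single token does, the optional cell below (a sketch added here, not part of the original notebook) sweeps one token over a few weights and synthesizes the same sentence each time; `token_id` and `weights` are illustrative choices."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "metadata": {
+    "id": "gstTokenSweep",
+    "colab_type": "code",
+    "colab": {}
+   },
+   "source": [
+    "# Illustrative sketch: sweep the weight of a single GST token and compare the results.\n",
+    "token_id = \"0\"  # which token to vary (illustrative choice)\n",
+    "weights = [-0.9, 0.0, 0.9]  # weight values to try (illustrative choice)\n",
+    "TEXT = input(\"Enter sentence: \")\n",
+    "for w in weights:\n",
+    "    gst_style = {\"0\": 0, \"1\": 0, \"3\": 0, \"4\": 0}\n",
+    "    gst_style[token_id] = w\n",
+    "    print(\" > Token {} weight: {}\".format(token_id, w))\n",
+    "    wav = tts(model, vocoder_model, TEXT, C, USE_CUDA, ap, use_griffin_lim, SPEAKER_FILEID, speaker_embedding=speaker_embedding, gst_style=gst_style)\n",
+    "    IPython.display.display(Audio(wav, rate=ap.sample_rate))"
+   ],
+   "execution_count": null,
+   "outputs": []
+  },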
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "colab_type": "text",
+    "id": "LEE6mQLh5Who"
+   },
+   "source": [
+    "# **Example: synthesizing with your own voice :)**\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "La70gSB65nrs",
+    "colab_type": "text"
+   },
+   "source": [
+    "Download and load the GE2E Speaker Encoder."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "metadata": {
+    "id": "r0IEFZ0B5vQg",
+    "colab_type": "code",
+    "colab": {}
+   },
+   "source": [
+    "!wget -c -q --show-progress -O ./SpeakerEncoder-checkpoint.zip https://github.com/Edresson/TTS/releases/download/v1.0.0/GE2E-SpeakerEncoder-iter25k.zip\n",
+    "!unzip ./SpeakerEncoder-checkpoint.zip"
+   ],
+   "execution_count": null,
+   "outputs": []
+  },
+  {
+   "cell_type": "code",
+   "metadata": {
+    "id": "jEH8HCTh5mF6",
+    "colab_type": "code",
+    "colab": {}
+   },
+   "source": [
+    "SE_MODEL_RUN_PATH = \"GE2E-SpeakerEncoder/\"\n",
+    "SE_MODEL_PATH = os.path.join(SE_MODEL_RUN_PATH, \"best_model.pth.tar\")\n",
+    "SE_CONFIG_PATH = os.path.join(SE_MODEL_RUN_PATH, \"config.json\")\n",
+    "USE_CUDA = True"
+   ],
+   "execution_count": null,
+   "outputs": []
+  },
+  {
+   "cell_type": "code",
+   "metadata": {
+    "id": "tOwkfQqT6-Qo",
+    "colab_type": "code",
+    "colab": {}
+   },
+   "source": [
+    "from TTS.utils.audio import AudioProcessor\n",
+    "from TTS.speaker_encoder.model import SpeakerEncoder\n",
+    "se_config = load_config(SE_CONFIG_PATH)\n",
+    "se_ap = AudioProcessor(**se_config['audio'])\n",
+    "\n",
+    "se_model = SpeakerEncoder(**se_config.model)\n",
+    "se_model.load_state_dict(torch.load(SE_MODEL_PATH)['model'])\n",
+    "se_model.eval()\n",
+    "if USE_CUDA:\n",
+    "    se_model.cuda()"
+   ],
+   "execution_count": null,
+   "outputs": []
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "0TLlbUFG8O36",
+    "colab_type": "text"
+   },
+   "source": [
+    "Upload one or more wav audio files of your voice.\n",
+    "\n",
+    "> We recommend files longer than 3 seconds; the more audio you provide, the closer the result will be to your voice :)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "metadata": {
+    "id": "_FWwHPjJ8NXl",
+    "colab_type": "code",
+    "colab": {}
+   },
+   "source": [
+    "# select one or more wav files\n",
+    "from google.colab import files\n",
+    "file_list = files.upload()"
+   ],
+   "execution_count": null,
+   "outputs": []
+  },
+  {
+   "cell_type": "code",
+   "metadata": {
+    "id": "WWOf6sgbBbGY",
+    "colab_type": "code",
+    "colab": {}
+   },
+   "source": [
+    "# extract an embedding from each uploaded wav file\n",
+    "speaker_embeddings = []\n",
+    "for name in file_list.keys():\n",
+    "    if '.wav' in name:\n",
+    "        mel_spec = se_ap.melspectrogram(se_ap.load_wav(name, sr=se_ap.sample_rate)).T\n",
+    "        mel_spec = torch.FloatTensor(mel_spec[None, :, :])\n",
+    "        if USE_CUDA:\n",
+    "            mel_spec = mel_spec.cuda()\n",
+    "        embedd = se_model.compute_embedding(mel_spec).cpu().detach().numpy().reshape(-1)\n",
+    "        speaker_embeddings.append(embedd)\n",
+    "    else:\n",
+    "        print(\"You need to upload .wav files; other formats are not supported!\")\n",
+    "\n",
+    "# take the average of the speaker's embedding samples\n",
+    "speaker_embedding = np.mean(np.array(speaker_embeddings), axis=0).tolist()"
+   ],
+   "execution_count": null,
+   "outputs": []
+  },
+  {
+   "cell_type": "code",
+   "metadata": {
+    "id": "AQ7eP31d9yzq",
+    "colab_type": "code",
+    "colab": {}
+   },
+   "source": [
+    "import IPython\n",
+    "from IPython.display import Audio\n",
+    "print(\"Synthesize sentence with New Speaker using files: \", file_list.keys(), \"(this speaker was not seen in training; new speaker)\")\n",
+    "# gst_style = {\"0\": 0, \"1\": 0.0, \"3\": 0, \"4\": 0}  # alternatively, set the token weights manually\n",
+    "gst_style = 'gst-style-example.wav'\n",
+    "TEXT = input(\"Enter sentence: \")\n",
+    "print(\" > Text: {}\".format(TEXT))\n",
+    "wav = tts(model, vocoder_model, TEXT, C, USE_CUDA, ap, use_griffin_lim, SPEAKER_FILEID, speaker_embedding=speaker_embedding, gst_style=gst_style)\n",
+    "IPython.display.display(Audio(wav, rate=ap.sample_rate))\n",
+    "# save the results\n",
+    "file_name = TEXT.replace(\" \", \"_\")\n",
+    "file_name = file_name.translate(\n",
+    "    str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav'\n",
+    "out_path = os.path.join(OUT_PATH, file_name)\n",
+    "print(\" > Saving output to {}\".format(out_path))\n",
+    "ap.save_wav(wav, out_path)"
+   ],
+   "execution_count": null,
+   "outputs": []
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "11i10yE1-LMJ",
+    "colab_type": "text"
+   },
+   "source": [
+    "Upload your own GST reference wav file."
+   ]
+  },
"code", + "colab": {} + }, + "source": [ + "# select one wav file for GST reference\n", + "from google.colab import files\n", + "file_list = files.upload()\n" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "colab_type": "code", + "id": "xmItcGac5WiG", + "colab": {} + }, + "source": [ + "print(\"Synthesize sentence with New Speaker using files: \",file_list.keys(), \"(this speaker not seen in training (new speaker))\")\n", + "gst_style = list(file_list.keys())[0]\n", + "TEXT = input(\"Enter sentence: \")\n", + "print(\" > Text: {}\".format(TEXT))\n", + "wav = tts(model, vocoder_model, TEXT, C, USE_CUDA, ap, use_griffin_lim, SPEAKER_FILEID, speaker_embedding=speaker_embedding, gst_style=gst_style)\n", + "IPython.display.display(Audio(wav, rate=ap.sample_rate))\n", + "# save the results\n", + "file_name = TEXT.replace(\" \", \"_\")\n", + "file_name = file_name.translate(\n", + " str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav'\n", + "out_path = os.path.join(OUT_PATH, file_name)\n", + "print(\" > Saving output to {}\".format(out_path))\n", + "ap.save_wav(wav, out_path)" + ], + "execution_count": null, + "outputs": [] + } + ] +} \ No newline at end of file