From d7fae3f5157c71016b08816dc7e9ca33f4d9ac20 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Tue, 18 May 2021 15:07:25 +0200 Subject: [PATCH] remove all espeak and phonemizer deps --- .compute | 1 - Makefile | 1 - TTS/server/README.md | 39 - TTS/tts/utils/text/__init__.py | 48 +- hubconf.py | 2 +- ..._and_MultiBand_MelGAN_TFLite_Example.ipynb | 387 -------- ...oqui_TTS_MultiSpeaker_jia_et_al_2018.ipynb | 650 -------------- ...MultiSpeaker_jia_et_al_2018_With_GST.ipynb | 847 ------------------ requirements.txt | 1 - 9 files changed, 3 insertions(+), 1973 deletions(-) delete mode 100644 notebooks/DDC_TTS_and_MultiBand_MelGAN_TFLite_Example.ipynb delete mode 100644 notebooks/Demo_Coqui_TTS_MultiSpeaker_jia_et_al_2018.ipynb delete mode 100644 notebooks/Demo_Coqui_TTS_MultiSpeaker_jia_et_al_2018_With_GST.ipynb diff --git a/.compute b/.compute index cda787d2..9786a689 100644 --- a/.compute +++ b/.compute @@ -1,7 +1,6 @@ #!/bin/bash yes | apt-get install sox yes | apt-get install ffmpeg -yes | apt-get install espeak yes | apt-get install tmux yes | apt-get install zsh sh -c "$(curl -fsSL https://raw.githubusercontent.com/robbyrussell/oh-my-zsh/master/tools/install.sh)" diff --git a/Makefile b/Makefile index 2210a682..4dc2d588 100644 --- a/Makefile +++ b/Makefile @@ -7,7 +7,6 @@ help: target_dirs := tests TTS notebooks system-deps: ## install linux system deps - sudo apt-get install -y espeak-ng sudo apt-get install -y libsndfile1-dev dev-deps: ## install development deps diff --git a/TTS/server/README.md b/TTS/server/README.md index 51cedc05..89ee21eb 100644 --- a/TTS/server/README.md +++ b/TTS/server/README.md @@ -22,42 +22,3 @@ Run the server with the official models on a GPU. Run the server with a custom models. ```python TTS/server/server.py --tts_checkpoint /path/to/tts/model.pth.tar --tts_config /path/to/tts/config.json --vocoder_checkpoint /path/to/vocoder/model.pth.tar --vocoder_config /path/to/vocoder/config.json``` - - - - - - diff --git a/TTS/tts/utils/text/__init__.py b/TTS/tts/utils/text/__init__.py index 6c193ff5..2b73d4e4 100644 --- a/TTS/tts/utils/text/__init__.py +++ b/TTS/tts/utils/text/__init__.py @@ -2,9 +2,7 @@ import re -import phonemizer from packaging import version -from phonemizer.phonemize import phonemize from TTS.tts.utils.text.chinese_mandarin.phonemizer import chinese_text_to_phonemes from TTS.tts.utils.text import cleaners @@ -28,9 +26,7 @@ PHONEME_PUNCTUATION_PATTERN = r"[" + _punctuations.replace(" ", "") + "]+" def text2phone(text, language): - """Convert graphemes to phonemes. For most of the languages, it calls - the phonemizer python library that calls espeak/espeak-ng. For chinese - mandarin, it calls pypinyin + custom function for phonemizing + """Convert graphemes to phonemes. Parameters: text (str): text to phonemize language (str): language of the text @@ -43,47 +39,7 @@ def text2phone(text, language): if language == "zh-CN": ph = chinese_text_to_phonemes(text) return ph - - seperator = phonemizer.separator.Separator(" |", "", "|") - # try: - punctuations = re.findall(PHONEME_PUNCTUATION_PATTERN, text) - if version.parse(phonemizer.__version__) < version.parse("2.1"): - ph = phonemize(text, separator=seperator, strip=False, njobs=1, backend="espeak", language=language) - ph = ph[:-1].strip() # skip the last empty character - # phonemizer does not tackle punctuations. Here we do. - # Replace \n with matching punctuations. - if punctuations: - # if text ends with a punctuation. 
- if text[-1] == punctuations[-1]: - for punct in punctuations[:-1]: - ph = ph.replace("| |\n", "|" + punct + "| |", 1) - ph = ph + punctuations[-1] - else: - for punct in punctuations: - ph = ph.replace("| |\n", "|" + punct + "| |", 1) - elif version.parse(phonemizer.__version__) >= version.parse("2.1"): - ph = phonemize( - text, - separator=seperator, - strip=False, - njobs=1, - backend="espeak", - language=language, - preserve_punctuation=True, - language_switch="remove-flags", - ) - # this is a simple fix for phonemizer. - # https://github.com/bootphon/phonemizer/issues/32 - if punctuations: - for punctuation in punctuations: - ph = ph.replace(f"| |{punctuation} ", f"|{punctuation}| |").replace( - f"| |{punctuation}", f"|{punctuation}| |" - ) - ph = ph[:-3] - else: - raise RuntimeError(" [!] Use 'phonemizer' version 2.1 or older.") - - return ph + raise ValueError(f" [!] Language {language} is not supported for phonemization.") def intersperse(sequence, token): diff --git a/hubconf.py b/hubconf.py index 152374c8..bcbd6fce 100644 --- a/hubconf.py +++ b/hubconf.py @@ -1,6 +1,6 @@ dependencies = [ 'torch', 'gdown', 'pysbd', 'phonemizer', 'unidecode', 'pypinyin' -] # apt install espeak-ng +] import torch from TTS.utils.manage import ModelManager diff --git a/notebooks/DDC_TTS_and_MultiBand_MelGAN_TFLite_Example.ipynb b/notebooks/DDC_TTS_and_MultiBand_MelGAN_TFLite_Example.ipynb deleted file mode 100644 index c39cc53e..00000000 --- a/notebooks/DDC_TTS_and_MultiBand_MelGAN_TFLite_Example.ipynb +++ /dev/null @@ -1,387 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "6LWsNd3_M3MP" - }, - "source": [ - "# Mozilla TTS on CPU Real-Time Speech Synthesis with TFLite" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "FAqrSIWgLyP0" - }, - "source": [ - "**These models are converted from released [PyTorch models](https://colab.research.google.com/drive/1u_16ZzHjKYFn1HNVuA4Qf_i2MMFB9olY?usp=sharing) using our TF utilities provided in Mozilla TTS.**\n", - "\n", - "#### **Notebook Details**\n", - "These TFLite models support TF 2.3rc0 and for different versions you might need to regenerate them. \n", - "\n", - "TFLite optimizations degrades the TTS model performance and we do not apply\n", - "any optimization for the vocoder model due to the same reason. 
If you like to\n", - "keep the quality, consider to regenerate TFLite model accordingly.\n", - "\n", - "Models optimized with TFLite can be slow on a regular CPU since it is optimized\n", - "specifically for lower-end systems.\n", - "\n", - "---\n", - "\n", - "\n", - "\n", - "#### **Model Details** \n", - "We use Tacotron2 and MultiBand-Melgan models and LJSpeech dataset.\n", - "\n", - "Tacotron2 is trained using [Double Decoder Consistency](https://erogol.com/solving-attention-problems-of-tts-models-with-double-decoder-consistency/) (DDC) only for 130K steps (3 days) with a single GPU.\n", - "\n", - "MultiBand-Melgan is trained 1.45M steps with real spectrograms.\n", - "\n", - "Note that both model performances can be improved with more training.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "Ku-dA4DKoeXk" - }, - "source": [ - "### Download TF Models and configs" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 162 - }, - "colab_type": "code", - "id": "jGIgnWhGsxU1", - "outputId": "57af701e-77ec-400d-fee5-64aa7603d357" - }, - "outputs": [], - "source": [ - "!gdown --id 17PYXCmTe0el_SLTwznrt3vOArNGMGo5v -O tts_model.tflite\n", - "!gdown --id 18CQ6G6tBEOfvCHlPqP8EBI4xWbrr9dBc -O config.json" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 235 - }, - "colab_type": "code", - "id": "4dnpE0-kvTsu", - "outputId": "6aab0622-9add-4ee4-b9f8-177d6ddc0e86" - }, - "outputs": [], - "source": [ - "!gdown --id 1aXveT-NjOM1mUr6tM4JfWjshq67GvVIO -O vocoder_model.tflite\n", - "!gdown --id 1Rd0R_nRCrbjEdpOwq6XwZAktvugiBvmu -O config_vocoder.json\n", - "!gdown --id 11oY3Tv0kQtxK_JPgxrfesa99maVXHNxU -O scale_stats.npy" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "_ZuDrj_ioqHE" - }, - "source": [ - "### Setup Libraries" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 964 - }, - "colab_type": "code", - "id": "X2axt5BYq7gv", - "outputId": "aa53986f-f218-4d17-8667-0d74bb90c927" - }, - "outputs": [], - "source": [ - "# need it for char to phoneme conversion\n", - "! sudo apt-get install espeak" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 144 - }, - "colab_type": "code", - "id": "ZduAf-qYYEIT", - "outputId": "c1fcac0d-b8f8-442c-d598-4f549c42b698" - }, - "outputs": [], - "source": [ - "!git clone https://github.com/mozilla/TTS" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 1000 - }, - "colab_type": "code", - "id": "ofPCvPyjZEcT", - "outputId": "f3d3ea73-eae5-473c-db19-276bd0e721cc" - }, - "outputs": [], - "source": [ - "%cd TTS\n", - "!git checkout c7296b3\n", - "!pip install -r requirements.txt\n", - "!python setup.py install\n", - "!pip install tensorflow==2.3.0rc0\n", - "%cd .." 
- ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "Zlgi8fPdpRF0" - }, - "source": [ - "### Define TTS function" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "f-Yc42nQZG5A" - }, - "outputs": [], - "source": [ - "def run_vocoder(mel_spec):\n", - " vocoder_inputs = mel_spec[None, :, :]\n", - " # get input and output details\n", - " input_details = vocoder_model.get_input_details()\n", - " # reshape input tensor for the new input shape\n", - " vocoder_model.resize_tensor_input(input_details[0]['index'], vocoder_inputs.shape)\n", - " vocoder_model.allocate_tensors()\n", - " detail = input_details[0]\n", - " vocoder_model.set_tensor(detail['index'], vocoder_inputs)\n", - " # run the model\n", - " vocoder_model.invoke()\n", - " # collect outputs\n", - " output_details = vocoder_model.get_output_details()\n", - " waveform = vocoder_model.get_tensor(output_details[0]['index'])\n", - " return waveform \n", - "\n", - "\n", - "def tts(model, text, CONFIG, p):\n", - " t_1 = time.time()\n", - " waveform, alignment, mel_spec, mel_postnet_spec, stop_tokens, inputs = synthesis(model, text, CONFIG, use_cuda, ap, speaker_id, style_wav=None,\n", - " truncated=False, enable_eos_bos_chars=CONFIG.enable_eos_bos_chars,\n", - " backend='tflite')\n", - " waveform = run_vocoder(mel_postnet_spec.T)\n", - " waveform = waveform[0, 0]\n", - " rtf = (time.time() - t_1) / (len(waveform) / ap.sample_rate)\n", - " tps = (time.time() - t_1) / len(waveform)\n", - " print(waveform.shape)\n", - " print(\" > Run-time: {}\".format(time.time() - t_1))\n", - " print(\" > Real-time factor: {}\".format(rtf))\n", - " print(\" > Time per step: {}\".format(tps))\n", - " IPython.display.display(IPython.display.Audio(waveform, rate=CONFIG.audio['sample_rate'])) \n", - " return alignment, mel_postnet_spec, stop_tokens, waveform" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "ZksegYQepkFg" - }, - "source": [ - "### Load TF Models" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "oVa0kOamprgj" - }, - "outputs": [], - "source": [ - "import os\n", - "import torch\n", - "import time\n", - "import IPython\n", - "\n", - "from TTS.tf.utils.tflite import load_tflite_model\n", - "from TTS.tf.utils.io import load_checkpoint\n", - "from TTS.utils.io import load_config\n", - "from TTS.utils.text.symbols import symbols, phonemes\n", - "from TTS.utils.audio import AudioProcessor\n", - "from TTS.tts.utils.synthesis import synthesis" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "EY-sHVO8IFSH" - }, - "outputs": [], - "source": [ - "# runtime settings\n", - "use_cuda = False" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "_1aIUp2FpxOQ" - }, - "outputs": [], - "source": [ - "# model paths\n", - "TTS_MODEL = \"tts_model.tflite\"\n", - "TTS_CONFIG = \"config.json\"\n", - "VOCODER_MODEL = \"vocoder_model.tflite\"\n", - "VOCODER_CONFIG = \"config_vocoder.json\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "CpgmdBVQplbv" - }, - "outputs": [], - "source": [ - "# load configs\n", - "TTS_CONFIG = load_config(TTS_CONFIG)\n", - "VOCODER_CONFIG = load_config(VOCODER_CONFIG)" - ] - }, - { - 
"cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 471 - }, - "colab_type": "code", - "id": "zmrQxiozIUVE", - "outputId": "ca7e9016-4c28-4cef-efe7-0613d399aa4c" - }, - "outputs": [], - "source": [ - "# load the audio processor\n", - "ap = AudioProcessor(**TTS_CONFIG.audio) " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "8fLoI4ipqMeS" - }, - "outputs": [], - "source": [ - "# LOAD TTS MODEL\n", - "# multi speaker \n", - "speaker_id = None\n", - "speakers = []\n", - "\n", - "# load the models\n", - "model = load_tflite_model(TTS_MODEL)\n", - "vocoder_model = load_tflite_model(VOCODER_MODEL)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "Ws_YkPKsLgo-" - }, - "source": [ - "## Run Inference" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 134 - }, - "colab_type": "code", - "id": "FuWxZ9Ey5Puj", - "outputId": "d1888ebd-3208-42a4-aaf9-78d0e3ec987d" - }, - "outputs": [], - "source": [ - "sentence = \"Bill got in the habit of asking himself “Is that thought true?” and if he wasn’t absolutely certain it was, he just let it go.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, TTS_CONFIG, ap)" - ] - } - ], - "metadata": { - "colab": { - "collapsed_sections": [], - "name": "DDC-TTS_and_MultiBand-MelGAN_TFLite_Example.ipynb", - "provenance": [], - "toc_visible": true - }, - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.5" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/notebooks/Demo_Coqui_TTS_MultiSpeaker_jia_et_al_2018.ipynb b/notebooks/Demo_Coqui_TTS_MultiSpeaker_jia_et_al_2018.ipynb deleted file mode 100644 index 82efdc2a..00000000 --- a/notebooks/Demo_Coqui_TTS_MultiSpeaker_jia_et_al_2018.ipynb +++ /dev/null @@ -1,650 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "yZK6UdwSFnOO" - }, - "source": [ - "# **Download and install Coqui TTS**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "yvb0pX3WY6MN" - }, - "outputs": [], - "source": [ - "import os \n", - "!git clone https://github.com/Edresson/TTS -b dev-gst-embeddings" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "iB9nl2UEG3SY" - }, - "outputs": [], - "source": [ - "!apt-get install espeak\n", - "os.chdir('TTS')\n", - "!pip install -r requirements.txt\n", - "!python setup.py develop\n", - "os.chdir('..')" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "w6Krn8k1inC_" - }, - "source": [ - "\n", - "\n", - "**Download Checkpoint**\n", - "\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "PiYHf3lKhi9z" - }, - "outputs": [], - "source": [ - "!wget -c -q --show-progress -O ./TTS-checkpoint.zip https://github.com/Edresson/TTS/releases/download/v1.0.0/Checkpoints-TTS-MultiSpeaker-Jia-et-al-2018.zip\n", - 
"!unzip ./TTS-checkpoint.zip\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "MpYNgqrZcJKn" - }, - "source": [ - "**Utils Functions**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "4KZA4b_CbMqx" - }, - "outputs": [], - "source": [ - "%load_ext autoreload\n", - "%autoreload 2\n", - "import argparse\n", - "import json\n", - "# pylint: disable=redefined-outer-name, unused-argument\n", - "import os\n", - "import string\n", - "import time\n", - "import sys\n", - "import numpy as np\n", - "\n", - "TTS_PATH = \"../content/TTS\"\n", - "# add libraries into environment\n", - "sys.path.append(TTS_PATH) # set this if TTS is not installed globally\n", - "\n", - "import torch\n", - "\n", - "from TTS.tts.utils.generic_utils import setup_model\n", - "from TTS.tts.utils.synthesis import synthesis\n", - "from TTS.tts.utils.text.symbols import make_symbols, phonemes, symbols\n", - "from TTS.utils.audio import AudioProcessor\n", - "from TTS.utils.io import load_config\n", - "from TTS.vocoder.utils.generic_utils import setup_generator\n", - "\n", - "\n", - "def tts(model, vocoder_model, text, CONFIG, use_cuda, ap, use_gl, speaker_fileid, speaker_embedding=None):\n", - " t_1 = time.time()\n", - " waveform, _, _, mel_postnet_spec, _, _ = synthesis(model, text, CONFIG, use_cuda, ap, speaker_fileid, None, False, CONFIG.enable_eos_bos_chars, use_gl, speaker_embedding=speaker_embedding)\n", - " if CONFIG.model == \"Tacotron\" and not use_gl:\n", - " mel_postnet_spec = ap.out_linear_to_mel(mel_postnet_spec.T).T\n", - " if not use_gl:\n", - " waveform = vocoder_model.inference(torch.FloatTensor(mel_postnet_spec.T).unsqueeze(0))\n", - " if use_cuda and not use_gl:\n", - " waveform = waveform.cpu()\n", - " if not use_gl:\n", - " waveform = waveform.numpy()\n", - " waveform = waveform.squeeze()\n", - " rtf = (time.time() - t_1) / (len(waveform) / ap.sample_rate)\n", - " tps = (time.time() - t_1) / len(waveform)\n", - " print(\" > Run-time: {}\".format(time.time() - t_1))\n", - " print(\" > Real-time factor: {}\".format(rtf))\n", - " print(\" > Time per step: {}\".format(tps))\n", - " return waveform\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "ENA2OumIVeMA" - }, - "source": [ - "# **Vars definitions**\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "jPD0d_XpVXmY" - }, - "outputs": [], - "source": [ - "TEXT = ''\n", - "OUT_PATH = 'tests-audios/'\n", - "# create output path\n", - "os.makedirs(OUT_PATH, exist_ok=True)\n", - "\n", - "SPEAKER_FILEID = None # if None use the first embedding from speakers.json\n", - "\n", - "# model vars \n", - "MODEL_PATH = 'best_model.pth.tar'\n", - "CONFIG_PATH = 'config.json'\n", - "SPEAKER_JSON = 'speakers.json'\n", - "\n", - "# vocoder vars\n", - "VOCODER_PATH = ''\n", - "VOCODER_CONFIG_PATH = ''\n", - "\n", - "USE_CUDA = True" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "dV6cXXlfi72r" - }, - "source": [ - "# **Restore TTS Model**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "x1WgLFauWUPe" - }, - "outputs": [], - "source": [ - "# load the config\n", - "C = load_config(CONFIG_PATH)\n", - "C.forward_attn_mask = True\n", - "\n", - "# load the audio processor\n", - "ap = AudioProcessor(**C.audio)\n", - "\n", - "# 
if the vocabulary was passed, replace the default\n", - "if 'characters' in C.keys():\n", - " symbols, phonemes = make_symbols(**C.characters)\n", - "\n", - "speaker_embedding = None\n", - "speaker_embedding_dim = None\n", - "num_speakers = 0\n", - "# load speakers\n", - "if SPEAKER_JSON != '':\n", - " speaker_mapping = json.load(open(SPEAKER_JSON, 'r'))\n", - " num_speakers = len(speaker_mapping)\n", - " if C.use_external_speaker_embedding_file:\n", - " if SPEAKER_FILEID is not None:\n", - " speaker_embedding = speaker_mapping[SPEAKER_FILEID]['embedding']\n", - " else: # if speaker_fileid is not specificated use the first sample in speakers.json\n", - " choise_speaker = list(speaker_mapping.keys())[0]\n", - " print(\" Speaker: \",choise_speaker.split('_')[0],'was chosen automatically', \"(this speaker seen in training)\")\n", - " speaker_embedding = speaker_mapping[choise_speaker]['embedding']\n", - " speaker_embedding_dim = len(speaker_embedding)\n", - "\n", - "# load the model\n", - "num_chars = len(phonemes) if C.use_phonemes else len(symbols)\n", - "model = setup_model(num_chars, num_speakers, C, speaker_embedding_dim)\n", - "cp = torch.load(MODEL_PATH, map_location=torch.device('cpu'))\n", - "model.load_state_dict(cp['model'])\n", - "model.eval()\n", - "\n", - "if USE_CUDA:\n", - " model.cuda()\n", - "\n", - "model.decoder.set_r(cp['r'])\n", - "\n", - "# load vocoder model\n", - "if VOCODER_PATH!= \"\":\n", - " VC = load_config(VOCODER_CONFIG_PATH)\n", - " vocoder_model = setup_generator(VC)\n", - " vocoder_model.load_state_dict(torch.load(VOCODER_PATH, map_location=\"cpu\")[\"model\"])\n", - " vocoder_model.remove_weight_norm()\n", - " if USE_CUDA:\n", - " vocoder_model.cuda()\n", - " vocoder_model.eval()\n", - "else:\n", - " vocoder_model = None\n", - " VC = None\n", - "\n", - "# synthesize voice\n", - "use_griffin_lim = VOCODER_PATH== \"\"\n", - "\n", - "if not C.use_external_speaker_embedding_file:\n", - " if SPEAKER_FILEID.isdigit():\n", - " SPEAKER_FILEID = int(SPEAKER_FILEID)\n", - " else:\n", - " SPEAKER_FILEID = None\n", - "else:\n", - " SPEAKER_FILEID = None\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "tNvVEoE30qY6" - }, - "source": [ - "Synthesize sentence with Speaker\n", - "\n", - "> Stop running the cell to leave!\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "2o8fXkVSyXOa" - }, - "outputs": [], - "source": [ - "import IPython\n", - "from IPython.display import Audio\n", - "print(\"Synthesize sentence with Speaker: \",choise_speaker.split('_')[0], \"(this speaker seen in training)\")\n", - "while True:\n", - " TEXT = input(\"Enter sentence: \")\n", - " print(\" > Text: {}\".format(TEXT))\n", - " wav = tts(model, vocoder_model, TEXT, C, USE_CUDA, ap, use_griffin_lim, SPEAKER_FILEID, speaker_embedding=speaker_embedding)\n", - " IPython.display.display(Audio(wav, rate=ap.sample_rate))\n", - " # save the results\n", - " file_name = TEXT.replace(\" \", \"_\")\n", - " file_name = file_name.translate(\n", - " str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav'\n", - " out_path = os.path.join(OUT_PATH, file_name)\n", - " print(\" > Saving output to {}\".format(out_path))\n", - " ap.save_wav(wav, out_path)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "vnV-FigfvsS2" - }, - "source": [ - "# **Select Speaker**\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - 
"metadata": { - "colab": {}, - "colab_type": "code", - "id": "RuCGOnJ_fgDV" - }, - "outputs": [], - "source": [ - "\n", - "# VCTK speakers not seen in training (new speakers)\n", - "VCTK_test_Speakers = [\"p225\", \"p234\", \"p238\", \"p245\", \"p248\", \"p261\", \"p294\", \"p302\", \"p326\", \"p335\", \"p347\"]\n", - "\n", - "# VCTK speakers seen in training\n", - "VCTK_train_Speakers = ['p244', 'p300', 'p303', 'p273', 'p292', 'p252', 'p254', 'p269', 'p345', 'p274', 'p363', 'p285', 'p351', 'p361', 'p295', 'p266', 'p307', 'p230', 'p339', 'p253', 'p310', 'p241', 'p256', 'p323', 'p237', 'p229', 'p298', 'p336', 'p276', 'p305', 'p255', 'p278', 'p299', 'p265', 'p267', 'p280', 'p260', 'p272', 'p262', 'p334', 'p283', 'p247', 'p246', 'p374', 'p297', 'p249', 'p250', 'p304', 'p240', 'p236', 'p312', 'p286', 'p263', 'p258', 'p313', 'p376', 'p279', 'p340', 'p362', 'p284', 'p231', 'p308', 'p277', 'p275', 'p333', 'p314', 'p330', 'p264', 'p226', 'p288', 'p343', 'p239', 'p232', 'p268', 'p270', 'p329', 'p227', 'p271', 'p228', 'p311', 'p301', 'p293', 'p364', 'p251', 'p317', 'p360', 'p281', 'p243', 'p287', 'p233', 'p259', 'p316', 'p257', 'p282', 'p306', 'p341', 'p318']\n", - "\n", - "\n", - "num_samples_speaker = 2 # In theory the more samples of the speaker the more similar to the real voice it will be!\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "hkvv7gRcx4WV" - }, - "source": [ - "## **Example select a VCTK seen speaker in training**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "BviNMI9UyCYz" - }, - "outputs": [], - "source": [ - "# get embedding\n", - "Speaker_choise = VCTK_train_Speakers[0] # choise one of training speakers\n", - "# load speakers\n", - "if SPEAKER_JSON != '':\n", - " speaker_mapping = json.load(open(SPEAKER_JSON, 'r'))\n", - " if C.use_external_speaker_embedding_file:\n", - " speaker_embeddings = []\n", - " for key in list(speaker_mapping.keys()):\n", - " if Speaker_choise in key:\n", - " if len(speaker_embeddings) < num_samples_speaker:\n", - " speaker_embeddings.append(speaker_mapping[key]['embedding'])\n", - " # takes the average of the embedings samples of the announcers\n", - " speaker_embedding = np.mean(np.array(speaker_embeddings), axis=0).tolist()\n", - " " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "5e5_XnLsx3jg" - }, - "outputs": [], - "source": [ - "import IPython\n", - "from IPython.display import Audio\n", - "print(\"Synthesize sentence with Speaker: \",Speaker_choise.split('_')[0], \"(this speaker seen in training)\")\n", - "while True:\n", - " TEXT = input(\"Enter sentence: \")\n", - " print(\" > Text: {}\".format(TEXT))\n", - " wav = tts(model, vocoder_model, TEXT, C, USE_CUDA, ap, use_griffin_lim, SPEAKER_FILEID, speaker_embedding=speaker_embedding)\n", - " IPython.display.display(Audio(wav, rate=ap.sample_rate))\n", - " # save the results\n", - " file_name = TEXT.replace(\" \", \"_\")\n", - " file_name = file_name.translate(\n", - " str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav'\n", - " out_path = os.path.join(OUT_PATH, file_name)\n", - " print(\" > Saving output to {}\".format(out_path))\n", - " ap.save_wav(wav, out_path)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "QJ6VgT2a4vHW" - }, - "source": [ - "## **Example select a VCTK not seen speaker in training (new Speakers)**\n", - "\n", - "\n", - 
"> Fitting new Speakers :)\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "SZS57ZK-4vHa" - }, - "outputs": [], - "source": [ - "# get embedding\n", - "Speaker_choise = VCTK_test_Speakers[0] # choise one of training speakers\n", - "# load speakers\n", - "if SPEAKER_JSON != '':\n", - " speaker_mapping = json.load(open(SPEAKER_JSON, 'r'))\n", - " if C.use_external_speaker_embedding_file:\n", - " speaker_embeddings = []\n", - " for key in list(speaker_mapping.keys()):\n", - " if Speaker_choise in key:\n", - " if len(speaker_embeddings) < num_samples_speaker:\n", - " speaker_embeddings.append(speaker_mapping[key]['embedding'])\n", - " # takes the average of the embedings samples of the announcers\n", - " speaker_embedding = np.mean(np.array(speaker_embeddings), axis=0).tolist()\n", - " " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "bbs85vzz4vHo" - }, - "outputs": [], - "source": [ - "import IPython\n", - "from IPython.display import Audio\n", - "print(\"Synthesize sentence with Speaker: \",Speaker_choise.split('_')[0], \"(this speaker not seen in training (new speaker))\")\n", - "while True:\n", - " TEXT = input(\"Enter sentence: \")\n", - " print(\" > Text: {}\".format(TEXT))\n", - " wav = tts(model, vocoder_model, TEXT, C, USE_CUDA, ap, use_griffin_lim, SPEAKER_FILEID, speaker_embedding=speaker_embedding)\n", - " IPython.display.display(Audio(wav, rate=ap.sample_rate))\n", - " # save the results\n", - " file_name = TEXT.replace(\" \", \"_\")\n", - " file_name = file_name.translate(\n", - " str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav'\n", - " out_path = os.path.join(OUT_PATH, file_name)\n", - " print(\" > Saving output to {}\".format(out_path))\n", - " ap.save_wav(wav, out_path)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "LEE6mQLh5Who" - }, - "source": [ - "# **Example Synthesizing with your own voice :)**\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "La70gSB65nrs" - }, - "source": [ - " Download and load GE2E Speaker Encoder " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "r0IEFZ0B5vQg" - }, - "outputs": [], - "source": [ - "!wget -c -q --show-progress -O ./SpeakerEncoder-checkpoint.zip https://github.com/Edresson/TTS/releases/download/v1.0.0/GE2E-SpeakerEncoder-iter25k.zip\n", - "!unzip ./SpeakerEncoder-checkpoint.zip" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "jEH8HCTh5mF6" - }, - "outputs": [], - "source": [ - "SE_MODEL_RUN_PATH = \"GE2E-SpeakerEncoder/\"\n", - "SE_MODEL_PATH = os.path.join(SE_MODEL_RUN_PATH, \"best_model.pth.tar\")\n", - "SE_CONFIG_PATH =os.path.join(SE_MODEL_RUN_PATH, \"config.json\")\n", - "USE_CUDA = True" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "tOwkfQqT6-Qo" - }, - "outputs": [], - "source": [ - "from TTS.utils.audio import AudioProcessor\n", - "from TTS.speaker_encoder.model import SpeakerEncoder\n", - "se_config = load_config(SE_CONFIG_PATH)\n", - "se_ap = AudioProcessor(**se_config['audio'])\n", - "\n", - "se_model = SpeakerEncoder(**se_config.model)\n", - "se_model.load_state_dict(torch.load(SE_MODEL_PATH)['model'])\n", - 
"se_model.eval()\n", - "if USE_CUDA:\n", - " se_model.cuda()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "0TLlbUFG8O36" - }, - "source": [ - "Upload a wav audio file in your voice.\n", - "\n", - "\n", - "> We recommend files longer than 3 seconds, the bigger the file the closer to your voice :)\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "_FWwHPjJ8NXl" - }, - "outputs": [], - "source": [ - "from google.colab import files\n", - "file_list = files.upload()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "WWOf6sgbBbGY" - }, - "outputs": [], - "source": [ - "# extract embedding from wav files\n", - "speaker_embeddings = []\n", - "for name in file_list.keys():\n", - " if '.wav' in name:\n", - " mel_spec = se_ap.melspectrogram(se_ap.load_wav(name, sr=se_ap.sample_rate)).T\n", - " mel_spec = torch.FloatTensor(mel_spec[None, :, :])\n", - " if USE_CUDA:\n", - " mel_spec = mel_spec.cuda()\n", - " embedd = se_model.compute_embedding(mel_spec).cpu().detach().numpy().reshape(-1)\n", - " speaker_embeddings.append(embedd)\n", - " else:\n", - " print(\" You need upload Wav files, others files is not supported !!\")\n", - "\n", - "# takes the average of the embedings samples of the announcers\n", - "speaker_embedding = np.mean(np.array(speaker_embeddings), axis=0).tolist()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "xmItcGac5WiG" - }, - "outputs": [], - "source": [ - "import IPython\n", - "from IPython.display import Audio\n", - "print(\"Synthesize sentence with New Speaker using files: \",file_list.keys(), \"(this speaker not seen in training (new speaker))\")\n", - "while True:\n", - " TEXT = input(\"Enter sentence: \")\n", - " print(\" > Text: {}\".format(TEXT))\n", - " wav = tts(model, vocoder_model, TEXT, C, USE_CUDA, ap, use_griffin_lim, SPEAKER_FILEID, speaker_embedding=speaker_embedding)\n", - " IPython.display.display(Audio(wav, rate=ap.sample_rate))\n", - " # save the results\n", - " file_name = TEXT.replace(\" \", \"_\")\n", - " file_name = file_name.translate(\n", - " str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav'\n", - " out_path = os.path.join(OUT_PATH, file_name)\n", - " print(\" > Saving output to {}\".format(out_path))\n", - " ap.save_wav(wav, out_path)" - ] - } - ], - "metadata": { - "accelerator": "GPU", - "colab": { - "collapsed_sections": [ - "vnV-FigfvsS2", - "hkvv7gRcx4WV", - "QJ6VgT2a4vHW" - ], - "name": "Demo-Mozilla-TTS-MultiSpeaker-jia-et-al-2018.ipynb", - "provenance": [] - }, - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.5" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/notebooks/Demo_Coqui_TTS_MultiSpeaker_jia_et_al_2018_With_GST.ipynb b/notebooks/Demo_Coqui_TTS_MultiSpeaker_jia_et_al_2018_With_GST.ipynb deleted file mode 100644 index f65d09a6..00000000 --- a/notebooks/Demo_Coqui_TTS_MultiSpeaker_jia_et_al_2018_With_GST.ipynb +++ /dev/null @@ -1,847 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": 
"yZK6UdwSFnOO" - }, - "source": [ - "# **Download and install Coqui TTS**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "yvb0pX3WY6MN" - }, - "outputs": [], - "source": [ - "import os \n", - "!git clone https://github.com/Edresson/TTS -b dev-gst-embeddings" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "iB9nl2UEG3SY" - }, - "outputs": [], - "source": [ - "!apt-get install espeak\n", - "os.chdir('TTS')\n", - "!pip install -r requirements.txt\n", - "!python setup.py develop\n", - "os.chdir('..')" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "w6Krn8k1inC_" - }, - "source": [ - "\n", - "\n", - "**Download Checkpoint**\n", - "\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "PiYHf3lKhi9z" - }, - "outputs": [], - "source": [ - "!wget -c -q --show-progress -O ./TTS-checkpoint.zip https://github.com/Edresson/TTS/releases/download/v1.0.0/Checkpoints-TTS-MultiSpeaker-Jia-et-al-2018-with-GST.zip\n", - "!unzip ./TTS-checkpoint.zip\n", - "\n", - "# Download gst style example\n", - "!wget https://github.com/Edresson/TTS/releases/download/v1.0.0/gst-style-example.wav" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "MpYNgqrZcJKn" - }, - "source": [ - "**Utils Functions**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "4KZA4b_CbMqx" - }, - "outputs": [], - "source": [ - "%load_ext autoreload\n", - "%autoreload 2\n", - "import argparse\n", - "import json\n", - "# pylint: disable=redefined-outer-name, unused-argument\n", - "import os\n", - "import string\n", - "import time\n", - "import sys\n", - "import numpy as np\n", - "\n", - "TTS_PATH = \"../content/TTS\"\n", - "# add libraries into environment\n", - "sys.path.append(TTS_PATH) # set this if TTS is not installed globally\n", - "\n", - "import torch\n", - "\n", - "from TTS.tts.utils.generic_utils import setup_model\n", - "from TTS.tts.utils.synthesis import synthesis\n", - "from TTS.tts.utils.text.symbols import make_symbols, phonemes, symbols\n", - "from TTS.utils.audio import AudioProcessor\n", - "from TTS.utils.io import load_config\n", - "from TTS.vocoder.utils.generic_utils import setup_generator\n", - "\n", - "\n", - "def tts(model, vocoder_model, text, CONFIG, use_cuda, ap, use_gl, speaker_fileid, speaker_embedding=None, gst_style=None):\n", - " t_1 = time.time()\n", - " waveform, _, _, mel_postnet_spec, _, _ = synthesis(model, text, CONFIG, use_cuda, ap, speaker_fileid, gst_style, False, CONFIG.enable_eos_bos_chars, use_gl, speaker_embedding=speaker_embedding)\n", - " if CONFIG.model == \"Tacotron\" and not use_gl:\n", - " mel_postnet_spec = ap.out_linear_to_mel(mel_postnet_spec.T).T\n", - " if not use_gl:\n", - " waveform = vocoder_model.inference(torch.FloatTensor(mel_postnet_spec.T).unsqueeze(0))\n", - " if use_cuda and not use_gl:\n", - " waveform = waveform.cpu()\n", - " if not use_gl:\n", - " waveform = waveform.numpy()\n", - " waveform = waveform.squeeze()\n", - " rtf = (time.time() - t_1) / (len(waveform) / ap.sample_rate)\n", - " tps = (time.time() - t_1) / len(waveform)\n", - " print(\" > Run-time: {}\".format(time.time() - t_1))\n", - " print(\" > Real-time factor: {}\".format(rtf))\n", - " print(\" > Time per step: 
{}\".format(tps))\n", - " return waveform\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "ENA2OumIVeMA" - }, - "source": [ - "# **Vars definitions**\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "jPD0d_XpVXmY" - }, - "outputs": [], - "source": [ - "TEXT = ''\n", - "OUT_PATH = 'tests-audios/'\n", - "# create output path\n", - "os.makedirs(OUT_PATH, exist_ok=True)\n", - "\n", - "SPEAKER_FILEID = None # if None use the first embedding from speakers.json\n", - "\n", - "# model vars \n", - "MODEL_PATH = 'best_model.pth.tar'\n", - "CONFIG_PATH = 'config.json'\n", - "SPEAKER_JSON = 'speakers.json'\n", - "\n", - "# vocoder vars\n", - "VOCODER_PATH = ''\n", - "VOCODER_CONFIG_PATH = ''\n", - "\n", - "USE_CUDA = True" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "dV6cXXlfi72r" - }, - "source": [ - "# **Restore TTS Model**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "x1WgLFauWUPe" - }, - "outputs": [], - "source": [ - "# load the config\n", - "C = load_config(CONFIG_PATH)\n", - "C.forward_attn_mask = True\n", - "\n", - "# load the audio processor\n", - "ap = AudioProcessor(**C.audio)\n", - "\n", - "# if the vocabulary was passed, replace the default\n", - "if 'characters' in C.keys():\n", - " symbols, phonemes = make_symbols(**C.characters)\n", - "\n", - "speaker_embedding = None\n", - "speaker_embedding_dim = None\n", - "num_speakers = 0\n", - "# load speakers\n", - "if SPEAKER_JSON != '':\n", - " speaker_mapping = json.load(open(SPEAKER_JSON, 'r'))\n", - " num_speakers = len(speaker_mapping)\n", - " if C.use_external_speaker_embedding_file:\n", - " if SPEAKER_FILEID is not None:\n", - " speaker_embedding = speaker_mapping[SPEAKER_FILEID]['embedding']\n", - " else: # if speaker_fileid is not specificated use the first sample in speakers.json\n", - " choise_speaker = list(speaker_mapping.keys())[0]\n", - " print(\" Speaker: \",choise_speaker.split('_')[0],'was chosen automatically', \"(this speaker seen in training)\")\n", - " speaker_embedding = speaker_mapping[choise_speaker]['embedding']\n", - " speaker_embedding_dim = len(speaker_embedding)\n", - "\n", - "# load the model\n", - "num_chars = len(phonemes) if C.use_phonemes else len(symbols)\n", - "model = setup_model(num_chars, num_speakers, C, speaker_embedding_dim)\n", - "cp = torch.load(MODEL_PATH, map_location=torch.device('cpu'))\n", - "model.load_state_dict(cp['model'])\n", - "model.eval()\n", - "\n", - "if USE_CUDA:\n", - " model.cuda()\n", - "\n", - "model.decoder.set_r(cp['r'])\n", - "\n", - "# load vocoder model\n", - "if VOCODER_PATH!= \"\":\n", - " VC = load_config(VOCODER_CONFIG_PATH)\n", - " vocoder_model = setup_generator(VC)\n", - " vocoder_model.load_state_dict(torch.load(VOCODER_PATH, map_location=\"cpu\")[\"model\"])\n", - " vocoder_model.remove_weight_norm()\n", - " if USE_CUDA:\n", - " vocoder_model.cuda()\n", - " vocoder_model.eval()\n", - "else:\n", - " vocoder_model = None\n", - " VC = None\n", - "\n", - "# synthesize voice\n", - "use_griffin_lim = VOCODER_PATH== \"\"\n", - "\n", - "if not C.use_external_speaker_embedding_file:\n", - " if SPEAKER_FILEID.isdigit():\n", - " SPEAKER_FILEID = int(SPEAKER_FILEID)\n", - " else:\n", - " SPEAKER_FILEID = None\n", - "else:\n", - " SPEAKER_FILEID = None\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": 
"text", - "id": "tNvVEoE30qY6" - }, - "source": [ - "Synthesize sentence with Speaker\n", - "\n", - "> Stop running the cell to leave!\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "2o8fXkVSyXOa" - }, - "outputs": [], - "source": [ - "import IPython\n", - "from IPython.display import Audio\n", - "print(\"Synthesize sentence with Speaker: \",choise_speaker.split('_')[0], \"(this speaker seen in training)\")\n", - "gst_style = 'gst-style-example.wav'\n", - "while True:\n", - " TEXT = input(\"Enter sentence: \")\n", - " print(\" > Text: {}\".format(TEXT))\n", - " wav = tts(model, vocoder_model, TEXT, C, USE_CUDA, ap, use_griffin_lim, SPEAKER_FILEID, speaker_embedding=speaker_embedding, gst_style=gst_style)\n", - " IPython.display.display(Audio(wav, rate=ap.sample_rate))\n", - " # save the results\n", - " file_name = TEXT.replace(\" \", \"_\")\n", - " file_name = file_name.translate(\n", - " str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav'\n", - " out_path = os.path.join(OUT_PATH, file_name)\n", - " print(\" > Saving output to {}\".format(out_path))\n", - " ap.save_wav(wav, out_path)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "vnV-FigfvsS2" - }, - "source": [ - "# **Select Speaker**\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "RuCGOnJ_fgDV" - }, - "outputs": [], - "source": [ - "\n", - "# VCTK speakers not seen in training (new speakers)\n", - "VCTK_test_Speakers = [\"p225\", \"p234\", \"p238\", \"p245\", \"p248\", \"p261\", \"p294\", \"p302\", \"p326\", \"p335\", \"p347\"]\n", - "\n", - "# VCTK speakers seen in training\n", - "VCTK_train_Speakers = ['p244', 'p300', 'p303', 'p273', 'p292', 'p252', 'p254', 'p269', 'p345', 'p274', 'p363', 'p285', 'p351', 'p361', 'p295', 'p266', 'p307', 'p230', 'p339', 'p253', 'p310', 'p241', 'p256', 'p323', 'p237', 'p229', 'p298', 'p336', 'p276', 'p305', 'p255', 'p278', 'p299', 'p265', 'p267', 'p280', 'p260', 'p272', 'p262', 'p334', 'p283', 'p247', 'p246', 'p374', 'p297', 'p249', 'p250', 'p304', 'p240', 'p236', 'p312', 'p286', 'p263', 'p258', 'p313', 'p376', 'p279', 'p340', 'p362', 'p284', 'p231', 'p308', 'p277', 'p275', 'p333', 'p314', 'p330', 'p264', 'p226', 'p288', 'p343', 'p239', 'p232', 'p268', 'p270', 'p329', 'p227', 'p271', 'p228', 'p311', 'p301', 'p293', 'p364', 'p251', 'p317', 'p360', 'p281', 'p243', 'p287', 'p233', 'p259', 'p316', 'p257', 'p282', 'p306', 'p341', 'p318']\n", - "\n", - "\n", - "num_samples_speaker = 2 # In theory the more samples of the speaker the more similar to the real voice it will be!\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "hkvv7gRcx4WV" - }, - "source": [ - "## **Example select a VCTK seen speaker in training**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "BviNMI9UyCYz" - }, - "outputs": [], - "source": [ - "# get embedding\n", - "Speaker_choise = VCTK_train_Speakers[0] # choise one of training speakers\n", - "# load speakers\n", - "if SPEAKER_JSON != '':\n", - " speaker_mapping = json.load(open(SPEAKER_JSON, 'r'))\n", - " if C.use_external_speaker_embedding_file:\n", - " speaker_embeddings = []\n", - " for key in list(speaker_mapping.keys()):\n", - " if Speaker_choise in key:\n", - " if len(speaker_embeddings) < num_samples_speaker:\n", - " 
speaker_embeddings.append(speaker_mapping[key]['embedding'])\n", - " # takes the average of the embedings samples of the announcers\n", - " speaker_embedding = np.mean(np.array(speaker_embeddings), axis=0).tolist()\n", - " " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "5e5_XnLsx3jg" - }, - "outputs": [], - "source": [ - "import IPython\n", - "from IPython.display import Audio\n", - "print(\"Synthesize sentence with Speaker: \",Speaker_choise.split('_')[0], \"(this speaker seen in training)\")\n", - "gst_style = 'gst-style-example.wav'\n", - "while True:\n", - " TEXT = input(\"Enter sentence: \")\n", - " print(\" > Text: {}\".format(TEXT))\n", - " wav = tts(model, vocoder_model, TEXT, C, USE_CUDA, ap, use_griffin_lim, SPEAKER_FILEID, speaker_embedding=speaker_embedding, gst_style=gst_style)\n", - " IPython.display.display(Audio(wav, rate=ap.sample_rate))\n", - " # save the results\n", - " file_name = TEXT.replace(\" \", \"_\")\n", - " file_name = file_name.translate(\n", - " str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav'\n", - " out_path = os.path.join(OUT_PATH, file_name)\n", - " print(\" > Saving output to {}\".format(out_path))\n", - " ap.save_wav(wav, out_path)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "QJ6VgT2a4vHW" - }, - "source": [ - "## **Example select a VCTK not seen speaker in training (new Speakers)**\n", - "\n", - "\n", - "> Fitting new Speakers :)\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "SZS57ZK-4vHa" - }, - "outputs": [], - "source": [ - "# get embedding\n", - "Speaker_choise = VCTK_test_Speakers[0] # choise one of training speakers\n", - "# load speakers\n", - "if SPEAKER_JSON != '':\n", - " speaker_mapping = json.load(open(SPEAKER_JSON, 'r'))\n", - " if C.use_external_speaker_embedding_file:\n", - " speaker_embeddings = []\n", - " for key in list(speaker_mapping.keys()):\n", - " if Speaker_choise in key:\n", - " if len(speaker_embeddings) < num_samples_speaker:\n", - " speaker_embeddings.append(speaker_mapping[key]['embedding'])\n", - " # takes the average of the embedings samples of the announcers\n", - " speaker_embedding = np.mean(np.array(speaker_embeddings), axis=0).tolist()\n", - " " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "bbs85vzz4vHo" - }, - "outputs": [], - "source": [ - "import IPython\n", - "from IPython.display import Audio\n", - "print(\"Synthesize sentence with Speaker: \",Speaker_choise.split('_')[0], \"(this speaker not seen in training (new speaker))\")\n", - "gst_style = 'gst-style-example.wav'\n", - "while True:\n", - " TEXT = input(\"Enter sentence: \")\n", - " print(\" > Text: {}\".format(TEXT))\n", - " wav = tts(model, vocoder_model, TEXT, C, USE_CUDA, ap, use_griffin_lim, SPEAKER_FILEID, speaker_embedding=speaker_embedding, gst_style=gst_style)\n", - " IPython.display.display(Audio(wav, rate=ap.sample_rate))\n", - " # save the results\n", - " file_name = TEXT.replace(\" \", \"_\")\n", - " file_name = file_name.translate(\n", - " str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav'\n", - " out_path = os.path.join(OUT_PATH, file_name)\n", - " print(\" > Saving output to {}\".format(out_path))\n", - " ap.save_wav(wav, out_path)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - 
"id": "g_G_HweN04W-" - }, - "source": [ - "# **Changing GST tokens manually (without wav reference)**" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "jyFP5syW2bjt" - }, - "source": [ - "You can define tokens manually, this way you can increase/decrease the function of a given GST token. For example a token is responsible for the length of the speaker's pauses, if you increase the value of that token you will have longer pauses and if you decrease it you will have shorter pauses." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "SpwjDjCM2a3Y" - }, - "outputs": [], - "source": [ - "# set gst tokens, in this model we have 5 tokens\n", - "gst_style = {\"0\": 0, \"1\": 0, \"3\": 0, \"4\": 0}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "qWChMbI_0z5X" - }, - "outputs": [], - "source": [ - "import IPython\n", - "from IPython.display import Audio\n", - "print(\"Synthesize sentence with Speaker: \",Speaker_choise.split('_')[0], \"(this speaker not seen in training (new speaker))\")\n", - "TEXT = input(\"Enter sentence: \")\n", - "print(\" > Text: {}\".format(TEXT))\n", - "wav = tts(model, vocoder_model, TEXT, C, USE_CUDA, ap, use_griffin_lim, SPEAKER_FILEID, speaker_embedding=speaker_embedding, gst_style=gst_style)\n", - "IPython.display.display(Audio(wav, rate=ap.sample_rate))\n", - "# save the results\n", - "file_name = TEXT.replace(\" \", \"_\")\n", - "file_name = file_name.translate(\n", - " str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav'\n", - "out_path = os.path.join(OUT_PATH, file_name)\n", - "print(\" > Saving output to {}\".format(out_path))\n", - "ap.save_wav(wav, out_path)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "uFjUi9xQ3mG3" - }, - "outputs": [], - "source": [ - "gst_style = {\"0\": 0.9, \"1\": 0, \"3\": 0, \"4\": 0}\n", - "print(\"Synthesize sentence with Speaker: \",Speaker_choise.split('_')[0], \"(this speaker not seen in training (new speaker))\")\n", - "TEXT = input(\"Enter sentence: \")\n", - "print(\" > Text: {}\".format(TEXT))\n", - "wav = tts(model, vocoder_model, TEXT, C, USE_CUDA, ap, use_griffin_lim, SPEAKER_FILEID, speaker_embedding=speaker_embedding, gst_style=gst_style)\n", - "IPython.display.display(Audio(wav, rate=ap.sample_rate))\n", - "# save the results\n", - "file_name = TEXT.replace(\" \", \"_\")\n", - "file_name = file_name.translate(\n", - " str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav'\n", - "out_path = os.path.join(OUT_PATH, file_name)\n", - "print(\" > Saving output to {}\".format(out_path))\n", - "ap.save_wav(wav, out_path)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "Uw0d6gWg4L27" - }, - "outputs": [], - "source": [ - "gst_style = {\"0\": -0.9, \"1\": 0, \"3\": 0, \"4\": 0}\n", - "print(\"Synthesize sentence with Speaker: \",Speaker_choise.split('_')[0], \"(this speaker not seen in training (new speaker))\")\n", - "TEXT = input(\"Enter sentence: \")\n", - "print(\" > Text: {}\".format(TEXT))\n", - "wav = tts(model, vocoder_model, TEXT, C, USE_CUDA, ap, use_griffin_lim, SPEAKER_FILEID, speaker_embedding=speaker_embedding, gst_style=gst_style)\n", - "IPython.display.display(Audio(wav, rate=ap.sample_rate))\n", - "# save the results\n", - "file_name = 
TEXT.replace(\" \", \"_\")\n", - "file_name = file_name.translate(\n", - " str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav'\n", - "out_path = os.path.join(OUT_PATH, file_name)\n", - "print(\" > Saving output to {}\".format(out_path))\n", - "ap.save_wav(wav, out_path)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "V9izw4-54-Tl" - }, - "outputs": [], - "source": [ - "gst_style = {\"0\": 0, \"1\": 0.9, \"3\": 0, \"4\": 0}\n", - "print(\"Synthesize sentence with Speaker: \",Speaker_choise.split('_')[0], \"(this speaker not seen in training (new speaker))\")\n", - "TEXT = input(\"Enter sentence: \")\n", - "print(\" > Text: {}\".format(TEXT))\n", - "wav = tts(model, vocoder_model, TEXT, C, USE_CUDA, ap, use_griffin_lim, SPEAKER_FILEID, speaker_embedding=speaker_embedding, gst_style=gst_style)\n", - "IPython.display.display(Audio(wav, rate=ap.sample_rate))\n", - "# save the results\n", - "file_name = TEXT.replace(\" \", \"_\")\n", - "file_name = file_name.translate(\n", - " str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav'\n", - "out_path = os.path.join(OUT_PATH, file_name)\n", - "print(\" > Saving output to {}\".format(out_path))\n", - "ap.save_wav(wav, out_path)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "LEE6mQLh5Who" - }, - "source": [ - "# **Example Synthesizing with your own voice :)**\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "La70gSB65nrs" - }, - "source": [ - " Download and load GE2E Speaker Encoder " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "r0IEFZ0B5vQg" - }, - "outputs": [], - "source": [ - "!wget -c -q --show-progress -O ./SpeakerEncoder-checkpoint.zip https://github.com/Edresson/TTS/releases/download/v1.0.0/GE2E-SpeakerEncoder-iter25k.zip\n", - "!unzip ./SpeakerEncoder-checkpoint.zip" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "jEH8HCTh5mF6" - }, - "outputs": [], - "source": [ - "SE_MODEL_RUN_PATH = \"GE2E-SpeakerEncoder/\"\n", - "SE_MODEL_PATH = os.path.join(SE_MODEL_RUN_PATH, \"best_model.pth.tar\")\n", - "SE_CONFIG_PATH =os.path.join(SE_MODEL_RUN_PATH, \"config.json\")\n", - "USE_CUDA = True" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "tOwkfQqT6-Qo" - }, - "outputs": [], - "source": [ - "from TTS.utils.audio import AudioProcessor\n", - "from TTS.speaker_encoder.model import SpeakerEncoder\n", - "se_config = load_config(SE_CONFIG_PATH)\n", - "se_ap = AudioProcessor(**se_config['audio'])\n", - "\n", - "se_model = SpeakerEncoder(**se_config.model)\n", - "se_model.load_state_dict(torch.load(SE_MODEL_PATH)['model'])\n", - "se_model.eval()\n", - "if USE_CUDA:\n", - " se_model.cuda()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "0TLlbUFG8O36" - }, - "source": [ - "Upload one or more wav audio files in your voice.\n", - "\n", - "\n", - "> We recommend files longer than 3 seconds, the bigger the file the closer to your voice :)\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "_FWwHPjJ8NXl" - }, - "outputs": [], - "source": [ - "# select one or more wav files\n", - "from google.colab import 
files\n", - "file_list = files.upload()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "WWOf6sgbBbGY" - }, - "outputs": [], - "source": [ - "# extract embedding from wav files\n", - "speaker_embeddings = []\n", - "for name in file_list.keys():\n", - " if '.wav' in name:\n", - " mel_spec = se_ap.melspectrogram(se_ap.load_wav(name, sr=se_ap.sample_rate)).T\n", - " mel_spec = torch.FloatTensor(mel_spec[None, :, :])\n", - " if USE_CUDA:\n", - " mel_spec = mel_spec.cuda()\n", - " embedd = se_model.compute_embedding(mel_spec).cpu().detach().numpy().reshape(-1)\n", - " speaker_embeddings.append(embedd)\n", - " else:\n", - " print(\"You need upload Wav files, others files is not supported !!\")\n", - "\n", - "# takes the average of the embedings samples of the announcers\n", - "speaker_embedding = np.mean(np.array(speaker_embeddings), axis=0).tolist()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "AQ7eP31d9yzq" - }, - "outputs": [], - "source": [ - "import IPython\n", - "from IPython.display import Audio\n", - "print(\"Synthesize sentence with New Speaker using files: \",file_list.keys(), \"(this speaker not seen in training (new speaker))\")\n", - "gst_style = {\"0\": 0, \"1\": 0.0, \"3\": 0, \"4\": 0}\n", - "gst_style = 'gst-style-example.wav'\n", - "TEXT = input(\"Enter sentence: \")\n", - "print(\" > Text: {}\".format(TEXT))\n", - "wav = tts(model, vocoder_model, TEXT, C, USE_CUDA, ap, use_griffin_lim, SPEAKER_FILEID, speaker_embedding=speaker_embedding, gst_style=gst_style)\n", - "IPython.display.display(Audio(wav, rate=ap.sample_rate))\n", - "# save the results\n", - "file_name = TEXT.replace(\" \", \"_\")\n", - "file_name = file_name.translate(\n", - " str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav'\n", - "out_path = os.path.join(OUT_PATH, file_name)\n", - "print(\" > Saving output to {}\".format(out_path))\n", - "ap.save_wav(wav, out_path)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "11i10yE1-LMJ" - }, - "source": [ - "Uploading your own GST reference wav file" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "eKohSQG1-KkT" - }, - "outputs": [], - "source": [ - "# select one wav file for GST reference\n", - "from google.colab import files\n", - "file_list = files.upload()\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "xmItcGac5WiG" - }, - "outputs": [], - "source": [ - "print(\"Synthesize sentence with New Speaker using files: \",file_list.keys(), \"(this speaker not seen in training (new speaker))\")\n", - "gst_style = list(file_list.keys())[0]\n", - "TEXT = input(\"Enter sentence: \")\n", - "print(\" > Text: {}\".format(TEXT))\n", - "wav = tts(model, vocoder_model, TEXT, C, USE_CUDA, ap, use_griffin_lim, SPEAKER_FILEID, speaker_embedding=speaker_embedding, gst_style=gst_style)\n", - "IPython.display.display(Audio(wav, rate=ap.sample_rate))\n", - "# save the results\n", - "file_name = TEXT.replace(\" \", \"_\")\n", - "file_name = file_name.translate(\n", - " str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav'\n", - "out_path = os.path.join(OUT_PATH, file_name)\n", - "print(\" > Saving output to {}\".format(out_path))\n", - "ap.save_wav(wav, out_path)" - ] - } - ], - "metadata": { - 
"accelerator": "GPU", - "colab": { - "collapsed_sections": [ - "yZK6UdwSFnOO", - "ENA2OumIVeMA", - "dV6cXXlfi72r", - "vnV-FigfvsS2", - "g_G_HweN04W-", - "LEE6mQLh5Who" - ], - "name": "Demo-Mozilla-TTS-MultiSpeaker-jia-et-al-2018-With-GST.ipynb", - "provenance": [], - "toc_visible": true - }, - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.5" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/requirements.txt b/requirements.txt index fafd5112..c6ce7672 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,7 +7,6 @@ librosa==0.8.0 matplotlib numpy==1.18.5 pandas -phonemizer>=2.2.0 pypinyin pysbd pyyaml