diff --git a/notebooks/Benchmark-PWGAN.ipynb b/notebooks/Benchmark-PWGAN.ipynb deleted file mode 100644 index 082ffa60..00000000 --- a/notebooks/Benchmark-PWGAN.ipynb +++ /dev/null @@ -1,585 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This is to test TTS models with benchmark sentences for speech synthesis.\n", - "\n", - "Before running this script, please DON'T FORGET: \n", - "- to set file paths.\n", - "- to download related model files from TTS and PWGAN.\n", - "- to download or clone the related repos, linked below.\n", - "- to set up the repositories: ```python setup.py install```\n", - "- to check out the right commit versions (given next to the model) of TTS and PWGAN.\n", - "- to set the right paths in the cell below.\n", - "\n", - "Repositories:\n", - "- TTS: https://github.com/mozilla/TTS\n", - "- PWGAN: https://github.com/erogol/ParallelWaveGAN" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "%load_ext autoreload\n", - "%autoreload 2\n", - "import os\n", - "import sys\n", - "import io\n", - "import torch \n", - "import time\n", - "import json\n", - "import yaml\n", - "import numpy as np\n", - "from collections import OrderedDict\n", - "import matplotlib.pyplot as plt\n", - "plt.rcParams[\"figure.figsize\"] = (16,5)\n", - "\n", - "import librosa\n", - "import librosa.display\n", - "\n", - "from TTS.models.tacotron import Tacotron \n", - "from TTS.layers import *\n", - "from TTS.utils.data import *\n", - "from TTS.utils.audio import AudioProcessor\n", - "from TTS.utils.generic_utils import load_config, setup_model\n", - "from TTS.utils.text import text_to_sequence\n", - "from TTS.utils.synthesis import synthesis\n", - "from TTS.utils.visual import visualize\n", - "\n", - "import IPython\n", - "from IPython.display import Audio\n", - "\n", - "# you may need to change this depending on your system\n", - "os.environ['CUDA_VISIBLE_DEVICES']='1'\n", - "%matplotlib inline" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def tts(model, text, CONFIG, use_cuda, ap, use_gl, figures=True):\n", - " t_1 = time.time()\n", - " waveform, alignment, mel_spec, mel_postnet_spec, stop_tokens = synthesis(model, text, CONFIG, use_cuda, ap, speaker_id, False, CONFIG.enable_eos_bos_chars)\n", - " if CONFIG.model == \"Tacotron\" and not use_gl:\n", - " # correct the normalization differences between TTS and the vocoder.\n", - " mel_postnet_spec = ap.out_linear_to_mel(mel_postnet_spec.T).T\n", - " mel_postnet_spec = ap._denormalize(mel_postnet_spec)\n", - "# mel_postnet_spec = np.pad(mel_postnet_spec, pad_width=((2, 2), (0, 0)))\n", - " print(mel_postnet_spec.shape)\n", - " print(\"max- \", mel_postnet_spec.max(), \" -- min- \", mel_postnet_spec.min())\n", - " if not use_gl:\n", - " waveform = vocoder_model.inference(torch.FloatTensor(ap_vocoder._normalize(mel_postnet_spec).T).unsqueeze(0), hop_size=ap_vocoder.hop_length)\n", - "# waveform = waveform / abs(waveform).max() * 0.9\n", - " if use_cuda:\n", - " waveform = waveform.cpu()\n", - " waveform = waveform.numpy()\n", - " rtf = (time.time() - t_1) / (len(waveform) / ap.sample_rate)\n", - " print(waveform.shape)\n", - " print(\" > Run-time: {}\".format(time.time() - t_1))\n", - " print(\" > Real-time factor: {}\".format(rtf))\n", - " if figures: \n", - " visualize(alignment, mel_postnet_spec, stop_tokens, text, ap.hop_length, CONFIG, 
ap._denormalize(mel_spec)) \n", - " IPython.display.display(Audio(waveform, rate=CONFIG.audio['sample_rate'], normalize=False)) \n", - " os.makedirs(OUT_FOLDER, exist_ok=True)\n", - " file_name = text.replace(\" \", \"_\").replace(\".\",\"\") + \".wav\"\n", - " out_path = os.path.join(OUT_FOLDER, file_name)\n", - " ap.save_wav(waveform, out_path)\n", - " return alignment, mel_postnet_spec, stop_tokens, waveform" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Set constants\n", - "ROOT_PATH = '/home/erogol/Models/LJSpeech/ljspeech-bn-December-23-2019_08+34AM-ffea133/'\n", - "MODEL_PATH = ROOT_PATH + '/checkpoint_670000.pth.tar'\n", - "CONFIG_PATH = ROOT_PATH + '/config.json'\n", - "OUT_FOLDER = '/home/erogol/Dropbox/AudioSamples/benchmark_samples/'\n", - "CONFIG = load_config(CONFIG_PATH)\n", - "VOCODER_MODEL_PATH = \"/home/erogol/Models/LJSpeech/pwgan-ljspeech/checkpoint-400000steps.pkl\"\n", - "VOCODER_CONFIG_PATH = \"/home/erogol/Models/LJSpeech/pwgan-ljspeech/config.yml\"\n", - "\n", - "# load PWGAN config\n", - "with open(VOCODER_CONFIG_PATH) as f:\n", - " VOCODER_CONFIG = yaml.load(f, Loader=yaml.Loader)\n", - " \n", - "# Run FLAGs\n", - "use_cuda = False\n", - "# Set some config fields manually for testing\n", - "CONFIG.windowing = True\n", - "CONFIG.use_forward_attn = True \n", - "# Set the vocoder\n", - "use_gl = False # use GL if True\n", - "batched_wavernn = True # use batched wavernn inference if True (unused with PWGAN)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# LOAD TTS MODEL\n", - "from TTS.utils.text.symbols import make_symbols, symbols, phonemes\n", - "\n", - "# multi speaker \n", - "if CONFIG.use_speaker_embedding:\n", - " speakers = json.load(open(f\"{ROOT_PATH}/speakers.json\", 'r'))\n", - " speakers_idx_to_id = {v: k for k, v in speakers.items()}\n", - "else:\n", - " speakers = []\n", - " speaker_id = None\n", - "\n", - "# if the vocabulary was passed, replace the default\n", - "if 'characters' in CONFIG.keys():\n", - " symbols, phonemes = make_symbols(**CONFIG.characters)\n", - "\n", - "# load the model\n", - "num_chars = len(phonemes) if CONFIG.use_phonemes else len(symbols)\n", - "model = setup_model(num_chars, len(speakers), CONFIG)\n", - "\n", - "# load the audio processor\n", - "ap = AudioProcessor(**CONFIG.audio) \n", - "\n", - "\n", - "# load model state\n", - "cp = torch.load(MODEL_PATH, map_location=torch.device('cpu'))\n", - "\n", - "# load the model\n", - "model.load_state_dict(cp['model'])\n", - "if use_cuda:\n", - " model.cuda()\n", - "model.eval()\n", - "print(cp['step'])\n", - "print(cp['r'])\n", - "\n", - "# set model stepsize\n", - "if 'r' in cp:\n", - " model.decoder.set_r(cp['r'])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# LOAD PWGAN VOCODER\n", - "if use_gl == False:\n", - " from parallel_wavegan.models import ParallelWaveGANGenerator\n", - " from parallel_wavegan.utils.audio import AudioProcessor as AudioProcessorVocoder\n", - " \n", - " vocoder_model = ParallelWaveGANGenerator(**VOCODER_CONFIG[\"generator_params\"])\n", - " vocoder_model.load_state_dict(torch.load(VOCODER_MODEL_PATH, map_location=\"cpu\")[\"model\"][\"generator\"])\n", - " vocoder_model.remove_weight_norm()\n", - " ap_vocoder = AudioProcessorVocoder(**VOCODER_CONFIG['audio']) \n", - " if use_cuda:\n", - " vocoder_model.cuda()\n", - " vocoder_model.eval();" - ] - }, - { - "cell_type": 
"markdown", - "metadata": {}, - "source": [ - "### Comparision with https://mycroft.ai/blog/available-voices/" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "model.eval()\n", - "model.decoder.max_decoder_steps = 2000\n", - "model.decoder.prenet.eval()\n", - "speaker_id = None\n", - "sentence = '''A breeding jennet, lusty, young, and proud,'''\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sentence = \"Bill got in the habit of asking himself “Is that thought true?” and if he wasn’t absolutely certain it was, he just let it go.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### https://espnet.github.io/icassp2020-tts/" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sentence = \"The Commission also recommends\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sentence = \"As a result of these studies, the planning document submitted by the Secretary of the Treasury to the Bureau of the Budget on August thirty-one.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sentence = \"The FBI now transmits information on all defectors, a category which would, of course, have included Oswald.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sentence = \"they seem unduly restrictive in continuing to require some manifestation of animus against a Government official.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sentence = \"and each agency given clear understanding of the assistance which the Secret Service expects.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Other examples" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sentence = \"Be a voice, not an echo.\" # 'echo' is not in training set. \n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sentence = \"The human voice is the most perfect instrument of all.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sentence = \"I'm sorry Dave. 
I'm afraid I can't do that.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sentence = \"This cake is great. It's so delicious and moist.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Comparison with https://keithito.github.io/audio-samples/" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sentence = \"Generative adversarial network or variational auto-encoder.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sentence = \"Scientists at the CERN laboratory say they have discovered a new particle.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sentence = \"Here’s a way to measure the acute emotional intelligence that has never gone out of style.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sentence = \"President Trump met with other leaders at the Group of 20 conference.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sentence = \"The buses aren't the problem, they actually provide a solution.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Comparison with https://google.github.io/tacotron/publications/tacotron/index.html" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sentence = \"Generative adversarial network or variational auto-encoder.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sentence = \"Basilar membrane and otolaryngology are not auto-correlations.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sentence = \" He has read the whole thing.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sentence = \"He reads books.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - 
"sentence = \"Thisss isrealy awhsome.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sentence = \"This is your internet browser, Firefox.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sentence = \"This is your internet browser Firefox.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sentence = \"The quick brown fox jumps over the lazy dog.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sentence = \"Does the quick brown fox jump over the lazy dog?\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sentence = \"Eren, how are you?\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Hard Sentences" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sentence = \"Encouraged, he started with a minute a day.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sentence = \"His meditation consisted of “body scanning” which involved focusing his mind and energy on each section of the body from head to toe .\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sentence = \"Recent research at Harvard has shown meditating for as little as 8 weeks can actually increase the grey matter in the parts of the brain responsible for emotional regulation and learning . 
\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sentence = \"If he decided to watch TV he really watched it.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sentence = \"Often we try to bring about change through sheer effort and we put all of our energy into a new initiative .\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# for twb dataset\n", - "sentence = \"In our preparation for Easter, God in his providence offers us each year the season of Lent as a sacramental sign of our conversion.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.4" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/notebooks/Benchmark.ipynb b/notebooks/Benchmark.ipynb deleted file mode 100644 index 7d3a45cf..00000000 --- a/notebooks/Benchmark.ipynb +++ /dev/null @@ -1,546 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This is to test TTS models with benchmark sentences for speech synthesis.\n", - "\n", - "Before running this script please DON'T FORGET: \n", - "- to set file paths.\n", - "- to download related model files from TTS and WaveRNN.\n", - "- to checkout right commit versions (given next to the model) of TTS and WaveRNN.\n", - "- to set the right paths in the cell below.\n", - "\n", - "Repositories:\n", - "- TTS: https://github.com/mozilla/TTS\n", - "- WaveRNN: https://github.com/erogol/WaveRNN" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "TTS_PATH = \"/home/erogol/projects/\"\n", - "WAVERNN_PATH =\"/home/erogol/projects/\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "%load_ext autoreload\n", - "%autoreload 2\n", - "import os\n", - "import sys\n", - "import io\n", - "import torch \n", - "import time\n", - "import json\n", - "import numpy as np\n", - "from collections import OrderedDict\n", - "from matplotlib import pylab as plt\n", - "\n", - "%pylab inline\n", - "rcParams[\"figure.figsize\"] = (16,5)\n", - "\n", - "# add libraries into environment\n", - "sys.path.append(TTS_PATH) # set this if TTS is not installed globally\n", - "sys.path.append(WAVERNN_PATH) # set this if TTS is not installed globally\n", - "\n", - "import librosa\n", - "import librosa.display\n", - "\n", - "from TTS.models.tacotron import Tacotron \n", - "from TTS.layers import *\n", - "from TTS.utils.data import *\n", - "from TTS.utils.audio import AudioProcessor\n", - "from TTS.utils.generic_utils import load_config, setup_model\n", - "from 
TTS.utils.text import text_to_sequence\n", - "from TTS.utils.synthesis import synthesis\n", - "from TTS.utils.visual import visualize\n", - "from TTS.utils.text.symbols import make_symbols, symbols, phonemes\n", - "\n", - "import IPython\n", - "from IPython.display import Audio\n", - "\n", - "import os\n", - "os.environ['CUDA_VISIBLE_DEVICES']='1'" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def tts(model, text, CONFIG, use_cuda, ap, use_gl, figures=True):\n", - " t_1 = time.time()\n", - " waveform, alignment, mel_spec, mel_postnet_spec, stop_tokens = synthesis(model, text, CONFIG, use_cuda, ap, speaker_id, style_wav=None, \n", - " truncated=False, enable_eos_bos_chars=CONFIG.enable_eos_bos_chars,\n", - " use_griffin_lim=use_gl)\n", - " if CONFIG.model == \"Tacotron\" and not use_gl:\n", - " # correct the normalization differences between TTS and the Vocoder.\n", - " mel_postnet_spec = ap.out_linear_to_mel(mel_postnet_spec.T).T\n", - " if not use_gl:\n", - " mel_postnet_spec = ap._denormalize(mel_postnet_spec)\n", - " mel_postnet_spec = ap_vocoder._normalize(mel_postnet_spec)\n", - " waveform = wavernn.generate(torch.FloatTensor(mel_postnet_spec.T).unsqueeze(0).cuda(), batched=batched_wavernn, target=8000, overlap=400)\n", - "\n", - " print(\" > Run-time: {}\".format(time.time() - t_1))\n", - " if figures: \n", - " visualize(alignment, mel_postnet_spec, stop_tokens, text, ap.hop_length, CONFIG, mel_spec) \n", - " IPython.display.display(Audio(waveform, rate=CONFIG.audio['sample_rate'])) \n", - " os.makedirs(OUT_FOLDER, exist_ok=True)\n", - " file_name = text.replace(\" \", \"_\").replace(\".\",\"\") + \".wav\"\n", - " out_path = os.path.join(OUT_FOLDER, file_name)\n", - " ap.save_wav(waveform, out_path)\n", - " return alignment, mel_postnet_spec, stop_tokens, waveform" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Set constants\n", - "ROOT_PATH = '/home/erogol/Models/LJSpeech/ljspeech-bn-December-23-2019_08+34AM-ffea133/'\n", - "MODEL_PATH = ROOT_PATH + '/best_model.pth.tar'\n", - "CONFIG_PATH = ROOT_PATH + '/config.json'\n", - "OUT_FOLDER = '/home/erogol/Dropbox/AudioSamples/benchmark_samples/'\n", - "CONFIG = load_config(CONFIG_PATH)\n", - "VOCODER_MODEL_PATH = \"/media/erogol/data_ssd/Models/wavernn/ljspeech/mold_ljspeech_best_model/checkpoint_433000.pth.tar\"\n", - "VOCODER_CONFIG_PATH = \"/media/erogol/data_ssd/Models/wavernn/ljspeech/mold_ljspeech_best_model/config.json\"\n", - "VOCODER_CONFIG = load_config(VOCODER_CONFIG_PATH)\n", - "use_cuda = True\n", - "\n", - "# Set some config fields manually for testing\n", - "# CONFIG.windowing = False\n", - "# CONFIG.prenet_dropout = False\n", - "# CONFIG.separate_stopnet = True\n", - "CONFIG.use_forward_attn = True\n", - "# CONFIG.forward_attn_mask = True\n", - "# CONFIG.stopnet = True\n", - "\n", - "# Set the vocoder\n", - "use_gl = True # use GL if True\n", - "batched_wavernn = True # use batched wavernn inference if True" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# LOAD TTS MODEL\n", - "# multi speaker \n", - "if CONFIG.use_speaker_embedding:\n", - " speakers = json.load(open(f\"{ROOT_PATH}/speakers.json\", 'r'))\n", - " speakers_idx_to_id = {v: k for k, v in speakers.items()}\n", - "else:\n", - " speakers = []\n", - " speaker_id = None\n", - "\n", - "# if the vocabulary was passed, replace the default\n", - "if 'characters' in 
CONFIG.keys():\n", - " symbols, phonemes = make_symbols(**CONFIG.characters)\n", - "\n", - "# load the model\n", - "num_chars = len(phonemes) if CONFIG.use_phonemes else len(symbols)\n", - "model = setup_model(num_chars, len(speakers), CONFIG)\n", - "\n", - "# load the audio processor\n", - "ap = AudioProcessor(**CONFIG.audio) \n", - "\n", - "\n", - "# load model state\n", - "if use_cuda:\n", - " cp = torch.load(MODEL_PATH)\n", - "else:\n", - " cp = torch.load(MODEL_PATH, map_location=lambda storage, loc: storage)\n", - "\n", - "# load the model\n", - "model.load_state_dict(cp['model'])\n", - "if use_cuda:\n", - " model.cuda()\n", - "model.eval()\n", - "print(cp['step'])\n", - "print(cp['r'])\n", - "\n", - "# set model stepsize\n", - "if 'r' in cp:\n", - " model.decoder.set_r(cp['r'])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# LOAD WAVERNN - Make sure you downloaded the model and installed the module\n", - "if use_gl == False:\n", - " from WaveRNN.models.wavernn import Model\n", - " from WaveRNN.utils.audio import AudioProcessor as AudioProcessorVocoder\n", - " bits = 10\n", - " ap_vocoder = AudioProcessorVocoder(**VOCODER_CONFIG.audio) \n", - " wavernn = Model(\n", - " rnn_dims=512,\n", - " fc_dims=512,\n", - " mode=VOCODER_CONFIG.mode,\n", - " mulaw=VOCODER_CONFIG.mulaw,\n", - " pad=VOCODER_CONFIG.pad,\n", - " upsample_factors=VOCODER_CONFIG.upsample_factors,\n", - " feat_dims=VOCODER_CONFIG.audio[\"num_mels\"],\n", - " compute_dims=128,\n", - " res_out_dims=128,\n", - " res_blocks=10,\n", - " hop_length=ap_vocoder.hop_length,\n", - " sample_rate=ap_vocoder.sample_rate,\n", - " use_upsample_net = True,\n", - " use_aux_net = True\n", - " ).cuda()\n", - "\n", - " check = torch.load(VOCODER_MODEL_PATH)\n", - " wavernn.load_state_dict(check['model'], strict=False)\n", - " if use_cuda:\n", - " wavernn.cuda()\n", - " wavernn.eval();\n", - " print(check['step'])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Comparison with https://mycroft.ai/blog/available-voices/" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "model.eval()\n", - "model.decoder.max_decoder_steps = 2000\n", - "speaker_id = None\n", - "sentence = \"Bill got in the habit of asking himself “Is that thought true?” and if he wasn’t absolutely certain it was, he just let it go.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "model.eval()\n", - "model.decoder.max_decoder_steps = 2000\n", - "sentence = \"Bill got in the habit of asking himself “Is that thought true?” and if he wasn’t absolutely certain it was, he just let it go.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sentence = \"Be a voice, not an echo.\" # 'echo' is not in training set. 
\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sentence = \"The human voice is the most perfect instrument of all.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sentence = \"I'm sorry Dave. I'm afraid I can't do that.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sentence = \"This cake is great. It's so delicious and moist.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Comparison with https://keithito.github.io/audio-samples/" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sentence = \"Generative adversarial network or variational auto-encoder.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sentence = \"Scientists at the CERN laboratory say they have discovered a new particle.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sentence = \"Here’s a way to measure the acute emotional intelligence that has never gone out of style.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sentence = \"President Trump met with other leaders at the Group of 20 conference.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sentence = \"The buses aren't the problem, they actually provide a solution.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Comparison with https://google.github.io/tacotron/publications/tacotron/index.html" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sentence = \"Generative adversarial network or variational auto-encoder.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sentence = \"Basilar membrane and otolaryngology are not auto-correlations.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - 
"outputs": [], - "source": [ - "sentence = \" He has read the whole thing.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sentence = \"He reads books.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sentence = \"Thisss isrealy awhsome.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sentence = \"This is your internet browser, Firefox.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sentence = \"This is your internet browser Firefox.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sentence = \"The quick brown fox jumps over the lazy dog.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sentence = \"Does the quick brown fox jump over the lazy dog?\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sentence = \"Eren, how are you?\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Hard Sentences" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sentence = \"Encouraged, he started with a minute a day.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sentence = \"His meditation consisted of “body scanning” which involved focusing his mind and energy on each section of the body from head to toe .\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sentence = \"Recent research at Harvard has shown meditating for as little as 8 weeks can actually increase the grey matter in the parts of the brain responsible for emotional regulation and learning . 
\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sentence = \"If he decided to watch TV he really watched it.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sentence = \"Often we try to bring about change through sheer effort and we put all of our energy into a new initiative .\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# for twb dataset\n", - "sentence = \"In our preparation for Easter, God in his providence offers us each year the season of Lent as a sacramental sign of our conversion.\"\n", - "align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.4" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/speaker_encoder/tests.py b/speaker_encoder/tests.py index 220ba360..039833fc 100644 --- a/speaker_encoder/tests.py +++ b/speaker_encoder/tests.py @@ -4,7 +4,7 @@ import torch as T from TTS.speaker_encoder.model import SpeakerEncoder from TTS.speaker_encoder.loss import GE2ELoss -from TTS.utils.generic_utils import load_config +from TTS.utils.io import load_config file_path = os.path.dirname(os.path.realpath(__file__)) + "/../tests/" diff --git a/tf/notebooks/Benchmark-TTS_tf.ipynb b/tf/notebooks/Benchmark-TTS_tf.ipynb index 0464209d..4a21ae17 100644 --- a/tf/notebooks/Benchmark-TTS_tf.ipynb +++ b/tf/notebooks/Benchmark-TTS_tf.ipynb @@ -124,10 +124,10 @@ "OUT_FOLDER = '/home/erogol/Dropbox/AudioSamples/benchmark_samples/'\n", "CONFIG = load_config(CONFIG_PATH)\n", "# Run FLAGs\n", - "use_cuda = True\n", + "use_cuda = True # use the available GPU (only for torch)\n", "# Set the vocoder\n", - "use_gl = True # use GL if True\n", - "BACKEND = 'tf'" + "use_gl = True # use GL if True\n", + "BACKEND = 'tf' # set the backend for inference " ] }, {