coqui-tts/notebooks/Benchmark-PWGAN.ipynb

586 lines
19 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"This is to test TTS models with benchmark sentences for speech synthesis.\n",
"\n",
"Before running this script please DON'T FORGET: \n",
"- to set file paths.\n",
"- to download related model files from TTS and PWGAN.\n",
"- download or clone related repos, linked below.\n",
"- setup the repositories. ```python setup.py install```\n",
"- to checkout right commit versions (given next to the model) of TTS and PWGAN.\n",
"- to set the right paths in the cell below.\n",
"\n",
"Repositories:\n",
"- TTS: https://github.com/mozilla/TTS\n",
"- PWGAN: https://github.com/erogol/ParallelWaveGAN"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"%load_ext autoreload\n",
"%autoreload 2\n",
"import os\n",
"import sys\n",
"import io\n",
"import torch \n",
"import time\n",
"import json\n",
"import yaml\n",
"import numpy as np\n",
"from collections import OrderedDict\n",
"import matplotlib.pyplot as plt\n",
"plt.rcParams[\"figure.figsize\"] = (16,5)\n",
"\n",
"import librosa\n",
"import librosa.display\n",
"\n",
"from TTS.models.tacotron import Tacotron \n",
"from TTS.layers import *\n",
"from TTS.utils.data import *\n",
"from TTS.utils.audio import AudioProcessor\n",
"from TTS.utils.generic_utils import load_config, setup_model\n",
"from TTS.utils.text import text_to_sequence\n",
"from TTS.utils.synthesis import synthesis\n",
"from TTS.utils.visual import visualize\n",
"\n",
"import IPython\n",
"from IPython.display import Audio\n",
"\n",
"import os\n",
"\n",
"# you may need to change this depending on your system\n",
"os.environ['CUDA_VISIBLE_DEVICES']='1'\n",
"%matplotlib inline"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def tts(model, text, CONFIG, use_cuda, ap, use_gl, figures=True):\n",
" t_1 = time.time()\n",
" waveform, alignment, mel_spec, mel_postnet_spec, stop_tokens = synthesis(model, text, CONFIG, use_cuda, ap, speaker_id, False, CONFIG.enable_eos_bos_chars)\n",
" if CONFIG.model == \"Tacotron\" and not use_gl:\n",
" # coorect the normalization differences b/w TTS and the Vocoder.\n",
" mel_postnet_spec = ap.out_linear_to_mel(mel_postnet_spec.T).T\n",
" mel_postnet_spec = ap._denormalize(mel_postnet_spec)\n",
"# mel_postnet_spec = np.pad(mel_postnet_spec, pad_width=((2, 2), (0, 0)))\n",
" print(mel_postnet_spec.shape)\n",
" print(\"max- \", mel_postnet_spec.max(), \" -- min- \", mel_postnet_spec.min())\n",
" if not use_gl:\n",
" waveform = vocoder_model.inference(torch.FloatTensor(ap_vocoder._normalize(mel_postnet_spec).T).unsqueeze(0), hop_size=ap_vocoder.hop_length)\n",
"# waveform = waveform / abs(waveform).max() * 0.9\n",
" if use_cuda:\n",
" waveform = waveform.cpu()\n",
" waveform = waveform.numpy()\n",
" rtf = (time.time() - t_1) / (len(waveform) / ap.sample_rate)\n",
" print(waveform.shape)\n",
" print(\" > Run-time: {}\".format(time.time() - t_1))\n",
" print(\" > Real-time factor: {}\".format(rtf))\n",
" if figures: \n",
" visualize(alignment, mel_postnet_spec, stop_tokens, text, ap.hop_length, CONFIG, ap._denormalize(mel_spec)) \n",
" IPython.display.display(Audio(waveform, rate=CONFIG.audio['sample_rate'], normalize=False)) \n",
" os.makedirs(OUT_FOLDER, exist_ok=True)\n",
" file_name = text.replace(\" \", \"_\").replace(\".\",\"\") + \".wav\"\n",
" out_path = os.path.join(OUT_FOLDER, file_name)\n",
" ap.save_wav(waveform, out_path)\n",
" return alignment, mel_postnet_spec, stop_tokens, waveform"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Set constants\n",
"ROOT_PATH = '/home/erogol/Models/LJSpeech/ljspeech-bn-December-23-2019_08+34AM-ffea133/'\n",
"MODEL_PATH = ROOT_PATH + '/checkpoint_670000.pth.tar'\n",
"CONFIG_PATH = ROOT_PATH + '/config.json'\n",
"OUT_FOLDER = '/home/erogol/Dropbox/AudioSamples/benchmark_samples/'\n",
"CONFIG = load_config(CONFIG_PATH)\n",
"VOCODER_MODEL_PATH = \"/home/erogol/Models/LJSpeech/pwgan-ljspeech/checkpoint-400000steps.pkl\"\n",
"VOCODER_CONFIG_PATH = \"/home/erogol/Models/LJSpeech/pwgan-ljspeech/config.yml\"\n",
"\n",
"# load PWGAN config\n",
"with open(VOCODER_CONFIG_PATH) as f:\n",
" VOCODER_CONFIG = yaml.load(f, Loader=yaml.Loader)\n",
" \n",
"# Run FLAGs\n",
"use_cuda = False\n",
"# Set some config fields manually for testing\n",
"CONFIG.windowing = True\n",
"CONFIG.use_forward_attn = True \n",
"# Set the vocoder\n",
"use_gl = False # use GL if True\n",
"batched_wavernn = True # use batched wavernn inference if True"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# LOAD TTS MODEL\n",
"from TTS.utils.text.symbols import make_symbols, symbols, phonemes\n",
"\n",
"# multi speaker \n",
"if CONFIG.use_speaker_embedding:\n",
" speakers = json.load(open(f\"{ROOT_PATH}/speakers.json\", 'r'))\n",
" speakers_idx_to_id = {v: k for k, v in speakers.items()}\n",
"else:\n",
" speakers = []\n",
" speaker_id = None\n",
"\n",
"# if the vocabulary was passed, replace the default\n",
"if 'characters' in CONFIG.keys():\n",
" symbols, phonemes = make_symbols(**CONFIG.characters)\n",
"\n",
"# load the model\n",
"num_chars = len(phonemes) if CONFIG.use_phonemes else len(symbols)\n",
"model = setup_model(num_chars, len(speakers), CONFIG)\n",
"\n",
"# load the audio processor\n",
"ap = AudioProcessor(**CONFIG.audio) \n",
"\n",
"\n",
"# load model state\n",
"cp = torch.load(MODEL_PATH, map_location=torch.device('cpu'))\n",
"\n",
"# load the model\n",
"model.load_state_dict(cp['model'])\n",
"if use_cuda:\n",
" model.cuda()\n",
"model.eval()\n",
"print(cp['step'])\n",
"print(cp['r'])\n",
"\n",
"# set model stepsize\n",
"if 'r' in cp:\n",
" model.decoder.set_r(cp['r'])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# LOAD WAVERNN\n",
"if use_gl == False:\n",
" from parallel_wavegan.models import ParallelWaveGANGenerator\n",
" from parallel_wavegan.utils.audio import AudioProcessor as AudioProcessorVocoder\n",
" \n",
" vocoder_model = ParallelWaveGANGenerator(**VOCODER_CONFIG[\"generator_params\"])\n",
" vocoder_model.load_state_dict(torch.load(VOCODER_MODEL_PATH, map_location=\"cpu\")[\"model\"][\"generator\"])\n",
" vocoder_model.remove_weight_norm()\n",
" ap_vocoder = AudioProcessorVocoder(**VOCODER_CONFIG['audio']) \n",
" if use_cuda:\n",
" vocoder_model.cuda()\n",
" vocoder_model.eval();"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Comparision with https://mycroft.ai/blog/available-voices/"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"model.eval()\n",
"model.decoder.max_decoder_steps = 2000\n",
"model.decoder.prenet.eval()\n",
"speaker_id = None\n",
"sentence = '''A breeding jennet, lusty, young, and proud,'''\n",
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"sentence = \"Bill got in the habit of asking himself “Is that thought true?” and if he wasnt absolutely certain it was, he just let it go.\"\n",
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### https://espnet.github.io/icassp2020-tts/"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"sentence = \"The Commission also recommends\"\n",
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"sentence = \"As a result of these studies, the planning document submitted by the Secretary of the Treasury to the Bureau of the Budget on August thirty-one.\"\n",
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"sentence = \"The FBI now transmits information on all defectors, a category which would, of course, have included Oswald.\"\n",
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"sentence = \"they seem unduly restrictive in continuing to require some manifestation of animus against a Government official.\"\n",
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"sentence = \"and each agency given clear understanding of the assistance which the Secret Service expects.\"\n",
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Other examples"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"sentence = \"Be a voice, not an echo.\" # 'echo' is not in training set. \n",
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"sentence = \"The human voice is the most perfect instrument of all.\"\n",
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"sentence = \"I'm sorry Dave. I'm afraid I can't do that.\"\n",
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"sentence = \"This cake is great. It's so delicious and moist.\"\n",
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Comparison with https://keithito.github.io/audio-samples/"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"sentence = \"Generative adversarial network or variational auto-encoder.\"\n",
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"sentence = \"Scientists at the CERN laboratory say they have discovered a new particle.\"\n",
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"sentence = \"Heres a way to measure the acute emotional intelligence that has never gone out of style.\"\n",
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"sentence = \"President Trump met with other leaders at the Group of 20 conference.\"\n",
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"sentence = \"The buses aren't the problem, they actually provide a solution.\"\n",
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Comparison with https://google.github.io/tacotron/publications/tacotron/index.html"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"sentence = \"Generative adversarial network or variational auto-encoder.\"\n",
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"sentence = \"Basilar membrane and otolaryngology are not auto-correlations.\"\n",
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"sentence = \" He has read the whole thing.\"\n",
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"sentence = \"He reads books.\"\n",
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"sentence = \"Thisss isrealy awhsome.\"\n",
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"sentence = \"This is your internet browser, Firefox.\"\n",
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"sentence = \"This is your internet browser Firefox.\"\n",
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"sentence = \"The quick brown fox jumps over the lazy dog.\"\n",
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"sentence = \"Does the quick brown fox jump over the lazy dog?\"\n",
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"sentence = \"Eren, how are you?\"\n",
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Hard Sentences"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"sentence = \"Encouraged, he started with a minute a day.\"\n",
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"sentence = \"His meditation consisted of “body scanning” which involved focusing his mind and energy on each section of the body from head to toe .\"\n",
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"sentence = \"Recent research at Harvard has shown meditating for as little as 8 weeks can actually increase the grey matter in the parts of the brain responsible for emotional regulation and learning . \"\n",
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"sentence = \"If he decided to watch TV he really watched it.\"\n",
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"sentence = \"Often we try to bring about change through sheer effort and we put all of our energy into a new initiative .\"\n",
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# for twb dataset\n",
"sentence = \"In our preparation for Easter, God in his providence offers us each year the season of Lent as a sacramental sign of our conversion.\"\n",
"align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.4"
}
},
"nbformat": 4,
"nbformat_minor": 4
}