coqui-tts/notebooks/TacotronPlayGround.ipynb

185 lines
5.0 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%load_ext autoreload\n",
"%autoreload 2\n",
"import os\n",
"import sys\n",
"import io\n",
"import torch \n",
"import time\n",
"import numpy as np\n",
"from collections import OrderedDict\n",
"\n",
"%pylab inline\n",
"rcParams[\"figure.figsize\"] = (16,5)\n",
"sys.path.append('/home/erogol/projects/')\n",
"\n",
"import librosa\n",
"import librosa.display\n",
"\n",
"from torchviz import make_dot, make_dot_from_trace\n",
"\n",
"from TTS.models.tacotron import Tacotron \n",
"from TTS.layers import *\n",
"from TTS.utils.data import *\n",
"from TTS.utils.audio import AudioProcessor\n",
"from TTS.utils.generic_utils import load_config\n",
"from TTS.utils.text import text_to_sequence\n",
"\n",
"import IPython\n",
"from IPython.display import Audio\n",
"from utils import *"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def tts(model, text, CONFIG, use_cuda, ap, figures=True):\n",
" t_1 = time.time()\n",
" waveform, alignment, spectrogram = create_speech(model, text, CONFIG, use_cuda, ap) \n",
" print(\" > Run-time: {}\".format(time.time() - t_1))\n",
" if figures: \n",
" visualize(alignment, spectrogram, CONFIG) \n",
" IPython.display.display(Audio(waveform, rate=CONFIG.sample_rate)) \n",
" return alignment, spectrogram"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Set constants\n",
"ROOT_PATH = '/data/shared/erogol_models/March-28-2018_06:24PM/'\n",
"MODEL_PATH = ROOT_PATH + '/best_model.pth.tar'\n",
"CONFIG_PATH = ROOT_PATH + '/config.json'\n",
"OUT_FOLDER = ROOT_PATH + '/test/'\n",
"CONFIG = load_config(CONFIG_PATH)\n",
"use_cuda = False"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# load the model\n",
"model = Tacotron(CONFIG.embedding_size, CONFIG.num_mels, CONFIG.num_freq, CONFIG.r)\n",
"\n",
"# load the audio processor\n",
"ap = AudioProcessor(CONFIG.sample_rate, CONFIG.num_mels, CONFIG.min_level_db,\n",
" CONFIG.frame_shift_ms, CONFIG.frame_length_ms, CONFIG.preemphasis,\n",
" CONFIG.ref_level_db, CONFIG.num_freq, CONFIG.power, griffin_lim_iters=80) \n",
"\n",
"\n",
"# load model state\n",
"if use_cuda:\n",
" cp = torch.load(MODEL_PATH)\n",
"else:\n",
" cp = torch.load(MODEL_PATH, map_location=lambda storage, loc: storage)\n",
"\n",
"# # small trick to remove DataParallel wrapper\n",
"new_state_dict = OrderedDict()\n",
"for k, v in cp['model'].items():\n",
" name = k[7:] # remove `module.`\n",
" new_state_dict[name] = v\n",
"cp['model'] = new_state_dict\n",
"\n",
"# load the model\n",
"model.load_state_dict(cp['model'])\n",
"if use_cuda:\n",
" model.cuda()\n",
"model.eval()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### EXAMPLES FROM TRAINING SET"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import pandas as pd\n",
"df = pd.read_csv('/data/shared/KeithIto/LJSpeech-1.0/metadata.csv', delimiter='|')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"sentence = df.iloc[120, 1].lower().replace(',','')\n",
"print(sentence)\n",
"align = tts(model, sentence, CONFIG, use_cuda, ap)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### NEW EXAMPLES"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"sentence = \"Will Donald Trump Jr. offer the countrys business leaders a peek into a new U.S.-India relationship in trade? Defense? Terrorism?\"\n",
"model.decoder.max_decoder_steps = 300\n",
"alignment = tts(model, sentence, CONFIG, use_cuda, ap)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}