fix invalid json (#1599)

This commit is contained in:
Ryan Le-Nguyen 2022-05-31 18:20:10 +10:00 committed by GitHub
parent a790df4e94
commit b6bd74a9a9
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
1 changed files with 187 additions and 190 deletions

View File

@ -1,191 +1,188 @@
{ {
"cells": [ "cells": [{
{ "cell_type": "markdown",
"cell_type": "markdown", "metadata": {
"metadata": { "Collapsed": "false"
"Collapsed": "false" },
}, "source": [
"source": [ "This notebook is to test attention performance of a TTS model on a list of sentences taken from DeepVoice paper.\n",
"This notebook is to test attention performance of a TTS model on a list of sentences taken from DeepVoice paper.\n", "### Features of this notebook\n",
"### Features of this notebook\n", "- You can see visually how your model performs on each sentence and try to dicern common problems.\n",
"- You can see visually how your model performs on each sentence and try to dicern common problems.\n", "- At the end, final attention score would be printed showing the ultimate performace of your model. You can use this value to perform model selection.\n",
"- At the end, final attention score would be printed showing the ultimate performace of your model. You can use this value to perform model selection.\n", "- You can change the list of sentences byt providing a different sentence file."
"- You can change the list of sentences byt providing a different sentence file." ]
] },
}, {
{ "cell_type": "code",
"cell_type": "code", "execution_count": null,
"execution_count": null, "metadata": {
"metadata": { "Collapsed": "false",
"Collapsed": "false", "scrolled": true
"scrolled": true },
}, "outputs": [],
"outputs": [], "source": [
"source": [ "%load_ext autoreload\n",
"%load_ext autoreload\n", "%autoreload 2\n",
"%autoreload 2\n", "import os, sys\n",
"import os, sys\n", "import torch \n",
"import torch \n", "import time\n",
"import time\n", "import numpy as np\n",
"import numpy as np\n", "from matplotlib import pylab as plt\n",
"from matplotlib import pylab as plt\n", "\n",
"\n", "%pylab inline\n",
"%pylab inline\n", "plt.rcParams[\"figure.figsize\"] = (16,5)\n",
"plt.rcParams[\"figure.figsize\"] = (16,5)\n", "\n",
"\n", "import librosa\n",
"import librosa\n", "import librosa.display\n",
"import librosa.display\n", "\n",
"\n", "from TTS.tts.layers import *\n",
"from TTS.tts.layers import *\n", "from TTS.utils.audio import AudioProcessor\n",
"from TTS.utils.audio import AudioProcessor "from TTS.tts.utils.generic_utils import setup_model\n",
"from TTS.tts.utils.io import load_config\n",
\n", "from TTS.tts.utils.text import text_to_sequence\n",
"from TTS.tts.utils.generic_utils import setup_model\n", "from TTS.tts.utils.synthesis import synthesis\n",
"from TTS.tts.utils.io import load_config\n", "from TTS.tts.utils.visual import plot_alignment\n",
"from TTS.tts.utils.text import text_to_sequence\n", "from TTS.tts.utils.measures import alignment_diagonal_score\n",
"from TTS.tts.utils.synthesis import synthesis\n", "\n",
"from TTS.tts.utils.visual import plot_alignment\n", "import IPython\n",
"from TTS.tts.utils.measures import alignment_diagonal_score\n", "from IPython.display import Audio\n",
"\n", "\n",
"import IPython\n", "os.environ['CUDA_VISIBLE_DEVICES']='1'\n",
"from IPython.display import Audio\n", "\n",
"\n", "def tts(model, text, CONFIG, use_cuda, ap):\n",
"os.environ['CUDA_VISIBLE_DEVICES']='1'\n", " t_1 = time.time()\n",
"\n", " # run the model\n",
"def tts(model, text, CONFIG, use_cuda, ap):\n", " waveform, alignment, mel_spec, mel_postnet_spec, stop_tokens, inputs = synthesis(model, text, CONFIG, use_cuda, ap, speaker_id, None, False, CONFIG.enable_eos_bos_chars, True)\n",
" t_1 = time.time()\n", " if CONFIG.model == \"Tacotron\" and not use_gl:\n",
" # run the model\n", " mel_postnet_spec = ap.out_linear_to_mel(mel_postnet_spec.T).T\n",
" waveform, alignment, mel_spec, mel_postnet_spec, stop_tokens, inputs = synthesis(model, text, CONFIG, use_cuda, ap, speaker_id, None, False, CONFIG.enable_eos_bos_chars, True)\n", " # plotting\n",
" if CONFIG.model == \"Tacotron\" and not use_gl:\n", " attn_score = alignment_diagonal_score(torch.FloatTensor(alignment).unsqueeze(0))\n",
" mel_postnet_spec = ap.out_linear_to_mel(mel_postnet_spec.T).T\n", " print(f\" > {text}\")\n",
" # plotting\n", " IPython.display.display(IPython.display.Audio(waveform, rate=ap.sample_rate))\n",
" attn_score = alignment_diagonal_score(torch.FloatTensor(alignment).unsqueeze(0))\n", " fig = plot_alignment(alignment, fig_size=(8, 5))\n",
" print(f\" > {text}\")\n", " IPython.display.display(fig)\n",
" IPython.display.display(IPython.display.Audio(waveform, rate=ap.sample_rate))\n", " #saving results\n",
" fig = plot_alignment(alignment, fig_size=(8, 5))\n", " os.makedirs(OUT_FOLDER, exist_ok=True)\n",
" IPython.display.display(fig)\n", " file_name = text[:200].replace(\" \", \"_\").replace(\".\",\"\") + \".wav\"\n",
" #saving results\n", " out_path = os.path.join(OUT_FOLDER, file_name)\n",
" os.makedirs(OUT_FOLDER, exist_ok=True)\n", " ap.save_wav(waveform, out_path)\n",
" file_name = text[:200].replace(\" \", \"_\").replace(\".\",\"\") + \".wav\"\n", " return attn_score\n",
" out_path = os.path.join(OUT_FOLDER, file_name)\n", "\n",
" ap.save_wav(waveform, out_path)\n", "# Set constants\n",
" return attn_score\n", "ROOT_PATH = '/home/erogol/Models/LJSpeech/ljspeech-May-20-2020_12+29PM-1835628/'\n",
"\n", "MODEL_PATH = ROOT_PATH + '/best_model.pth'\n",
"# Set constants\n", "CONFIG_PATH = ROOT_PATH + '/config.json'\n",
"ROOT_PATH = '/home/erogol/Models/LJSpeech/ljspeech-May-20-2020_12+29PM-1835628/'\n", "OUT_FOLDER = './hard_sentences/'\n",
"MODEL_PATH = ROOT_PATH + '/best_model.pth'\n", "CONFIG = load_config(CONFIG_PATH)\n",
"CONFIG_PATH = ROOT_PATH + '/config.json'\n", "SENTENCES_PATH = 'sentences.txt'\n",
"OUT_FOLDER = './hard_sentences/'\n", "use_cuda = True\n",
"CONFIG = load_config(CONFIG_PATH)\n", "\n",
"SENTENCES_PATH = 'sentences.txt'\n", "# Set some config fields manually for testing\n",
"use_cuda = True\n", "# CONFIG.windowing = False\n",
"\n", "# CONFIG.prenet_dropout = False\n",
"# Set some config fields manually for testing\n", "# CONFIG.separate_stopnet = True\n",
"# CONFIG.windowing = False\n", "CONFIG.use_forward_attn = False\n",
"# CONFIG.prenet_dropout = False\n", "# CONFIG.forward_attn_mask = True\n",
"# CONFIG.separate_stopnet = True\n", "# CONFIG.stopnet = True"
"CONFIG.use_forward_attn = False\n", ]
"# CONFIG.forward_attn_mask = True\n", },
"# CONFIG.stopnet = True" {
] "cell_type": "code",
}, "execution_count": null,
{ "metadata": {
"cell_type": "code", "Collapsed": "false"
"execution_count": null, },
"metadata": { "outputs": [],
"Collapsed": "false" "source": [
}, "# LOAD TTS MODEL\n",
"outputs": [], "from TTS.tts.utils.text.symbols import make_symbols, symbols, phonemes\n",
"source": [ "\n",
"# LOAD TTS MODEL\n", "# multi speaker \n",
"from TTS.tts.utils.text.symbols import make_symbols, symbols, phonemes\n", "if CONFIG.use_speaker_embedding:\n",
"\n", " speakers = json.load(open(f\"{ROOT_PATH}/speakers.json\", 'r'))\n",
"# multi speaker \n", " speakers_idx_to_id = {v: k for k, v in speakers.items()}\n",
"if CONFIG.use_speaker_embedding:\n", "else:\n",
" speakers = json.load(open(f\"{ROOT_PATH}/speakers.json\", 'r'))\n", " speakers = []\n",
" speakers_idx_to_id = {v: k for k, v in speakers.items()}\n", " speaker_id = None\n",
"else:\n", "\n",
" speakers = []\n", "# if the vocabulary was passed, replace the default\n",
" speaker_id = None\n", "if 'characters' in CONFIG.keys():\n",
"\n", " symbols, phonemes = make_symbols(**CONFIG.characters)\n",
"# if the vocabulary was passed, replace the default\n", "\n",
"if 'characters' in CONFIG.keys():\n", "# load the model\n",
" symbols, phonemes = make_symbols(**CONFIG.characters)\n", "num_chars = len(phonemes) if CONFIG.use_phonemes else len(symbols)\n",
"\n", "model = setup_model(num_chars, len(speakers), CONFIG)\n",
"# load the model\n", "\n",
"num_chars = len(phonemes) if CONFIG.use_phonemes else len(symbols)\n", "# load the audio processor\n",
"model = setup_model(num_chars, len(speakers), CONFIG)\n", "ap = AudioProcessor(**CONFIG.audio) \n",
"\n", "\n",
"# load the audio processor\n", "\n",
"ap = AudioProcessor(**CONFIG.audio) \n", "# load model state\n",
"\n", "if use_cuda:\n",
"\n", " cp = torch.load(MODEL_PATH)\n",
"# load model state\n", "else:\n",
"if use_cuda:\n", " cp = torch.load(MODEL_PATH, map_location=lambda storage, loc: storage)\n",
" cp = torch.load(MODEL_PATH)\n", "\n",
"else:\n", "# load the model\n",
" cp = torch.load(MODEL_PATH, map_location=lambda storage, loc: storage)\n", "model.load_state_dict(cp['model'])\n",
"\n", "if use_cuda:\n",
"# load the model\n", " model.cuda()\n",
"model.load_state_dict(cp['model'])\n", "model.eval()\n",
"if use_cuda:\n", "print(cp['step'])\n",
" model.cuda()\n", "print(cp['r'])\n",
"model.eval()\n", "\n",
"print(cp['step'])\n", "# set model stepsize\n",
"print(cp['r'])\n", "if 'r' in cp:\n",
"\n", " model.decoder.set_r(cp['r'])"
"# set model stepsize\n", ]
"if 'r' in cp:\n", },
" model.decoder.set_r(cp['r'])" {
] "cell_type": "code",
}, "execution_count": null,
{ "metadata": {
"cell_type": "code", "Collapsed": "false"
"execution_count": null, },
"metadata": { "outputs": [],
"Collapsed": "false" "source": [
}, "model.decoder.max_decoder_steps=3000\n",
"outputs": [], "attn_scores = []\n",
"source": [ "with open(SENTENCES_PATH, 'r') as f:\n",
"model.decoder.max_decoder_steps=3000\n", " for text in f:\n",
"attn_scores = []\n", " attn_score = tts(model, text, CONFIG, use_cuda, ap)\n",
"with open(SENTENCES_PATH, 'r') as f:\n", " attn_scores.append(attn_score)"
" for text in f:\n", ]
" attn_score = tts(model, text, CONFIG, use_cuda, ap)\n", },
" attn_scores.append(attn_score)" {
] "cell_type": "code",
}, "execution_count": null,
{ "metadata": {
"cell_type": "code", "Collapsed": "false"
"execution_count": null, },
"metadata": { "outputs": [],
"Collapsed": "false" "source": [
}, "np.mean(attn_scores)"
"outputs": [], ]
"source": [ }
"np.mean(attn_scores)" ],
] "metadata": {
} "kernelspec": {
], "display_name": "Python 3",
"metadata": { "language": "python",
"kernelspec": { "name": "python3"
"display_name": "Python 3", },
"language": "python", "language_info": {
"name": "python3" "codemirror_mode": {
}, "name": "ipython",
"language_info": { "version": 3
"codemirror_mode": { },
"name": "ipython", "file_extension": ".py",
"version": 3 "mimetype": "text/x-python",
}, "name": "python",
"file_extension": ".py", "nbconvert_exporter": "python",
"mimetype": "text/x-python", "pygments_lexer": "ipython3",
"name": "python", "version": "3.8.5"
"nbconvert_exporter": "python", }
"pygments_lexer": "ipython3", },
"version": "3.8.5" "nbformat": 4,
} "nbformat_minor": 4
}, }
"nbformat": 4,
"nbformat_minor": 4
}