coqui-tts/notebooks/TestAttention.ipynb


This notebook tests the attention performance of a TTS model on a list of sentences taken from the DeepVoice paper.

Features of this notebook

  • You can visually inspect how your model performs on each sentence and try to discern common problems.
  • At the end, the mean attention score is printed, summarizing the overall performance of your model. You can use this value for model selection.
  • You can change the list of sentences by providing a different sentence file (see the example cell below).
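The sentence file is read line by line, so the expected format is simply one sentence per line. Below is a minimal sketch of creating such a file; the file name `my_sentences.txt` and the sentences themselves are placeholders, not the DeepVoice list.

In [ ]:
# Example of the expected sentence-file format: plain text, one sentence per line.
# 'my_sentences.txt' is a placeholder name; point SENTENCES_PATH below at your own file.
example_sentences = [
    "The quick brown fox jumps over the lazy dog.",
    "A quick movement of the enemy will jeopardize six gunboats.",
]
with open('my_sentences.txt', 'w') as f:
    f.write("\n".join(example_sentences) + "\n")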
In [ ]:
%load_ext autoreload
%autoreload 2
import os, sys
import json  # needed below to load speakers.json for multi-speaker models
import torch 
import time
import numpy as np
from matplotlib import pylab as plt

%pylab inline
plt.rcParams["figure.figsize"] = (16,5)

import librosa
import librosa.display

from TTS.tts.layers import *
from TTS.utils.audio import AudioProcessor
from TTS.tts.utils.generic_utils import setup_model
from TTS.tts.utils.io import load_config
from TTS.tts.utils.text import text_to_sequence
from TTS.tts.utils.synthesis import synthesis
from TTS.tts.utils.visual import plot_alignment
from TTS.tts.utils.measures import alignment_diagonal_score

import IPython
from IPython.display import Audio

os.environ['CUDA_VISIBLE_DEVICES']='1'

def tts(model, text, CONFIG, use_cuda, ap):
    t_1 = time.time()
    # run the model (speaker_id and use_gl are globals defined in the cells below)
    waveform, alignment, mel_spec, mel_postnet_spec, stop_tokens, inputs = synthesis(model, text, CONFIG, use_cuda, ap, speaker_id, None, False, CONFIG.enable_eos_bos_chars, True)
    # Tacotron (v1) predicts linear spectrograms; convert to mel unless Griffin-Lim output is used directly
    if CONFIG.model == "Tacotron" and not use_gl:
        mel_postnet_spec = ap.out_linear_to_mel(mel_postnet_spec.T).T
    # plotting
    attn_score = alignment_diagonal_score(torch.FloatTensor(alignment).unsqueeze(0))
    print(f" > {text}")
    IPython.display.display(IPython.display.Audio(waveform, rate=ap.sample_rate))
    fig = plot_alignment(alignment, fig_size=(8, 5))
    IPython.display.display(fig)
    # save the synthesized waveform to OUT_FOLDER
    os.makedirs(OUT_FOLDER, exist_ok=True)
    file_name = text[:200].replace(" ", "_").replace(".","") + ".wav"
    out_path = os.path.join(OUT_FOLDER, file_name)
    ap.save_wav(waveform, out_path)
    return attn_score

# Set constants
ROOT_PATH = '/home/erogol/Models/LJSpeech/ljspeech-May-20-2020_12+29PM-1835628/'
MODEL_PATH = ROOT_PATH + '/best_model.pth'
CONFIG_PATH = ROOT_PATH + '/config.json'
OUT_FOLDER = './hard_sentences/'
CONFIG = load_config(CONFIG_PATH)
SENTENCES_PATH = 'sentences.txt'
use_cuda = True
use_gl = True  # checked in tts() above; Griffin-Lim output is used here since no separate vocoder is loaded

# Set some config fields manually for testing
# CONFIG.windowing = False
# CONFIG.prenet_dropout = False
# CONFIG.separate_stopnet = True
CONFIG.use_forward_attn = False
# CONFIG.forward_attn_mask = True
# CONFIG.stopnet = True
In [ ]:
# LOAD TTS MODEL
from TTS.tts.utils.text.symbols import make_symbols, symbols, phonemes

# multi-speaker
if CONFIG.use_speaker_embedding:
    speakers = json.load(open(f"{ROOT_PATH}/speakers.json", 'r'))
    speakers_idx_to_id = {v: k for k, v in speakers.items()}
    speaker_id = 0  # pick the speaker index to synthesize with; adjust as needed
else:
    speakers = []
    speaker_id = None

# if the vocabulary was passed, replace the default
if 'characters' in CONFIG.keys():
    symbols, phonemes = make_symbols(**CONFIG.characters)

# load the model
num_chars = len(phonemes) if CONFIG.use_phonemes else len(symbols)
model = setup_model(num_chars, len(speakers), CONFIG)

# load the audio processor
ap = AudioProcessor(**CONFIG.audio)         


# load model state
if use_cuda:
    cp = torch.load(MODEL_PATH)
else:
    cp = torch.load(MODEL_PATH, map_location=lambda storage, loc: storage)

# load the model
model.load_state_dict(cp['model'])
if use_cuda:
    model.cuda()
model.eval()
print(cp['step'])

# set the model's reduction factor (r) if it is stored in the checkpoint
if 'r' in cp:
    print(cp['r'])
    model.decoder.set_r(cp['r'])
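Before running the whole list, it can help to sanity-check the loaded model on a single sentence; the sentence below is just a placeholder.

In [ ]:
# Optional sanity check: synthesize one sentence and print its attention score.
sanity_score = tts(model, "The quick brown fox jumps over the lazy dog.", CONFIG, use_cuda, ap)
print(sanity_score)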
In [ ]:
model.decoder.max_decoder_steps = 3000
attn_scores = []
with open(SENTENCES_PATH, 'r') as f:
    for text in f:
        text = text.strip()
        if not text:
            continue  # skip empty lines
        attn_score = tts(model, text, CONFIG, use_cuda, ap)
        attn_scores.append(attn_score)
In [ ]:
np.mean(attn_scores)
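To see which sentences the model struggles with most, you can rank the per-sentence scores; this is a small convenience snippet added here, assuming the same sentence file as above.

In [ ]:
# Rank sentences by attention score, lowest (worst) first.
with open(SENTENCES_PATH, 'r') as f:
    sentences = [line.strip() for line in f if line.strip()]
for score, sentence in sorted(zip(attn_scores, sentences)):
    print(f"{score:.3f}  {sentence}")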