This notebook tests the attention performance of a TTS model on a list of sentences taken from the DeepVoice paper.
Features of this notebook
- You can see visually how your model performs on each sentence and try to discern common problems.
- At the end, the final attention score is printed, showing the overall performance of your model. You can use this value for model selection (a sketch of the underlying score follows this list).
- You can change the list of sentences by providing a different sentence file.
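The attention score reported per sentence is a diagonality measure over the decoder's alignment matrix. Below is a minimal sketch of the idea, assuming an alignment of shape [decoder_steps, encoder_steps] whose rows sum to 1; the exact implementation lives in TTS.tts.utils.measures.alignment_diagonal_score and may differ in detail.
In [ ]:
import torch

def diagonal_score_sketch(alignment):
    # alignment: [decoder_steps, encoder_steps] attention weights.
    # A sharp, monotonic alignment concentrates each row's mass on a
    # single encoder step, so the mean of the row-wise maxima tends
    # toward 1; diffuse or broken attention pushes it toward
    # 1 / encoder_steps.
    return alignment.max(dim=-1).values.mean().item()

# Toy check: a perfectly diagonal alignment scores 1.0.
print(diagonal_score_sketch(torch.eye(5)))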
In [ ]:
%load_ext autoreload
%autoreload 2
import os
import sys
import time

import torch
import numpy as np
from matplotlib import pylab as plt
%matplotlib inline
plt.rcParams["figure.figsize"] = (16, 5)

import librosa
import librosa.display

from TTS.tts.layers import *
from TTS.utils.audio import AudioProcessor
from TTS.tts.utils.generic_utils import setup_model
from TTS.tts.utils.io import load_config
from TTS.tts.utils.text import text_to_sequence
from TTS.tts.utils.synthesis import synthesis
from TTS.tts.utils.visual import plot_alignment
from TTS.tts.utils.measures import alignment_diagonal_score

import IPython
from IPython.display import Audio

os.environ['CUDA_VISIBLE_DEVICES'] = '1'


def tts(model, text, CONFIG, use_cuda, ap):
    t_1 = time.time()
    # run the model
    waveform, alignment, mel_spec, mel_postnet_spec, stop_tokens, inputs = synthesis(
        model, text, CONFIG, use_cuda, ap, speaker_id, None, False,
        CONFIG.enable_eos_bos_chars, use_gl)
    if CONFIG.model == "Tacotron" and not use_gl:
        mel_postnet_spec = ap.out_linear_to_mel(mel_postnet_spec.T).T
    # plotting
    attn_score = alignment_diagonal_score(torch.FloatTensor(alignment).unsqueeze(0))
    print(f" > {text}")
    IPython.display.display(IPython.display.Audio(waveform, rate=ap.sample_rate))
    fig = plot_alignment(alignment, fig_size=(8, 5))
    IPython.display.display(fig)
    # saving results
    os.makedirs(OUT_FOLDER, exist_ok=True)
    file_name = text[:200].replace(" ", "_").replace(".", "") + ".wav"
    out_path = os.path.join(OUT_FOLDER, file_name)
    ap.save_wav(waveform, out_path)
    return attn_score


# Set constants
ROOT_PATH = '/home/erogol/Models/LJSpeech/ljspeech-May-20-2020_12+29PM-1835628/'
MODEL_PATH = ROOT_PATH + '/best_model.pth'
CONFIG_PATH = ROOT_PATH + '/config.json'
OUT_FOLDER = './hard_sentences/'
CONFIG = load_config(CONFIG_PATH)
SENTENCES_PATH = 'sentences.txt'
use_cuda = True
use_gl = True  # use Griffin-Lim to invert the model output to a waveform

# Set some config fields manually for testing
# CONFIG.windowing = False
# CONFIG.prenet_dropout = False
# CONFIG.separate_stopnet = True
CONFIG.use_forward_attn = False
# CONFIG.forward_attn_mask = True
# CONFIG.stopnet = True
In [ ]:
# LOAD TTS MODEL
import json

from TTS.tts.utils.text.symbols import make_symbols, symbols, phonemes

# multi speaker
speaker_id = None  # set an id from speakers.json for multi-speaker models
if CONFIG.use_speaker_embedding:
    speakers = json.load(open(f"{ROOT_PATH}/speakers.json", 'r'))
    speakers_idx_to_id = {v: k for k, v in speakers.items()}
else:
    speakers = []

# if the vocabulary was passed, replace the default
if 'characters' in CONFIG.keys():
    symbols, phonemes = make_symbols(**CONFIG.characters)

# load the model
num_chars = len(phonemes) if CONFIG.use_phonemes else len(symbols)
model = setup_model(num_chars, len(speakers), CONFIG)

# load the audio processor
ap = AudioProcessor(**CONFIG.audio)

# load model state
if use_cuda:
    cp = torch.load(MODEL_PATH, weights_only=True)
else:
    cp = torch.load(MODEL_PATH, map_location=lambda storage, loc: storage, weights_only=True)

# load the model weights
model.load_state_dict(cp['model'])
if use_cuda:
    model.cuda()
model.eval()
print(cp['step'])

# set the model stepsize (reduction factor)
if 'r' in cp:
    print(cp['r'])
    model.decoder.set_r(cp['r'])
In [ ]:
model.decoder.max_decoder_steps = 3000
attn_scores = []
with open(SENTENCES_PATH, 'r') as f:
    for text in f:
        text = text.strip()  # drop the trailing newline before synthesis
        if not text:
            continue
        attn_score = tts(model, text, CONFIG, use_cuda, ap)
        attn_scores.append(attn_score)
In [ ]:
np.mean(attn_scores)
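Because the mean attention score is a single scalar per checkpoint, repeating the loop above for several checkpoints gives a simple model-selection criterion. A hypothetical sketch, with placeholder checkpoint paths:
In [ ]:
# Hypothetical: score several checkpoints and keep the best one.
checkpoint_paths = ['checkpoint_100000.pth', 'checkpoint_200000.pth']  # placeholders
results = {}
for ckpt_path in checkpoint_paths:
    cp = torch.load(ckpt_path, map_location='cpu', weights_only=True)
    model.load_state_dict(cp['model'])
    if 'r' in cp:
        model.decoder.set_r(cp['r'])
    model.eval()
    with open(SENTENCES_PATH, 'r') as f:
        scores = [tts(model, line.strip(), CONFIG, use_cuda, ap)
                  for line in f if line.strip()]
    results[ckpt_path] = np.mean(scores)
print(results)
print(" > best checkpoint:", max(results, key=results.get))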