This notebook tests the attention performance of a TTS model on a list of sentences taken from the DeepVoice paper.
Features of this notebook
- You can see visually how your model performs on each sentence and try to discern common problems.
- At the end, the final attention score is printed, showing the overall performance of your model. You can use this value for model selection (a sketch of the underlying score follows this list).
- You can change the list of sentences by providing a different sentence file.
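The attention score reported per sentence is a diagonality measure over the decoder's alignment matrix. Below is a minimal sketch of the idea, assuming an alignment of shape [decoder_steps, encoder_steps] whose rows sum to 1; the exact implementation lives in TTS.tts.utils.measures.alignment_diagonal_score and may differ in detail.
In [ ]:
import torch

def diagonal_score_sketch(alignment):
    # alignment: [decoder_steps, encoder_steps] attention weights.
    # A sharp, monotonic alignment concentrates each row's mass on a
    # single encoder step, so the mean of the row-wise maxima tends
    # toward 1; diffuse or broken attention pushes it toward
    # 1 / encoder_steps.
    return alignment.max(dim=-1).values.mean().item()

# Toy check: a perfectly diagonal alignment scores 1.0.
print(diagonal_score_sketch(torch.eye(5)))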
In [ ]:
%load_ext autoreload
%autoreload 2
import os
import sys
import time

import torch
import numpy as np
from matplotlib import pylab as plt
%matplotlib inline
plt.rcParams["figure.figsize"] = (16, 5)

import librosa
import librosa.display

from TTS.tts.layers import *
from TTS.utils.audio import AudioProcessor
from TTS.tts.utils.generic_utils import setup_model
from TTS.tts.utils.io import load_config
from TTS.tts.utils.text import text_to_sequence
from TTS.tts.utils.synthesis import synthesis
from TTS.tts.utils.visual import plot_alignment
from TTS.tts.utils.measures import alignment_diagonal_score

import IPython
from IPython.display import Audio

os.environ['CUDA_VISIBLE_DEVICES'] = '1'


def tts(model, text, CONFIG, use_cuda, ap):
    t_1 = time.time()
    # run the model
    waveform, alignment, mel_spec, mel_postnet_spec, stop_tokens, inputs = synthesis(
        model, text, CONFIG, use_cuda, ap, speaker_id, None, False,
        CONFIG.enable_eos_bos_chars, use_gl)
    if CONFIG.model == "Tacotron" and not use_gl:
        mel_postnet_spec = ap.out_linear_to_mel(mel_postnet_spec.T).T
    # plotting
    attn_score = alignment_diagonal_score(torch.FloatTensor(alignment).unsqueeze(0))
    print(f" > {text}")
    IPython.display.display(IPython.display.Audio(waveform, rate=ap.sample_rate))
    fig = plot_alignment(alignment, fig_size=(8, 5))
    IPython.display.display(fig)
    # saving results
    os.makedirs(OUT_FOLDER, exist_ok=True)
    file_name = text[:200].replace(" ", "_").replace(".", "") + ".wav"
    out_path = os.path.join(OUT_FOLDER, file_name)
    ap.save_wav(waveform, out_path)
    return attn_score


# Set constants
ROOT_PATH = '/home/erogol/Models/LJSpeech/ljspeech-May-20-2020_12+29PM-1835628/'
MODEL_PATH = ROOT_PATH + '/best_model.pth'
CONFIG_PATH = ROOT_PATH + '/config.json'
OUT_FOLDER = './hard_sentences/'
CONFIG = load_config(CONFIG_PATH)
SENTENCES_PATH = 'sentences.txt'
use_cuda = True
use_gl = True  # use Griffin-Lim to invert the model output to a waveform

# Set some config fields manually for testing
# CONFIG.windowing = False
# CONFIG.prenet_dropout = False
# CONFIG.separate_stopnet = True
CONFIG.use_forward_attn = False
# CONFIG.forward_attn_mask = True
# CONFIG.stopnet = True
In [ ]:
# LOAD TTS MODEL
import json

from TTS.tts.utils.text.symbols import make_symbols, symbols, phonemes

# multi speaker
speaker_id = None  # set an id from speakers.json for multi-speaker models
if CONFIG.use_speaker_embedding:
    speakers = json.load(open(f"{ROOT_PATH}/speakers.json", 'r'))
    speakers_idx_to_id = {v: k for k, v in speakers.items()}
else:
    speakers = []

# if the vocabulary was passed, replace the default
if 'characters' in CONFIG.keys():
    symbols, phonemes = make_symbols(**CONFIG.characters)

# load the model
num_chars = len(phonemes) if CONFIG.use_phonemes else len(symbols)
model = setup_model(num_chars, len(speakers), CONFIG)

# load the audio processor
ap = AudioProcessor(**CONFIG.audio)

# load model state
if use_cuda:
    cp = torch.load(MODEL_PATH, weights_only=True)
else:
    cp = torch.load(MODEL_PATH, map_location=lambda storage, loc: storage, weights_only=True)

# load the model weights
model.load_state_dict(cp['model'])
if use_cuda:
    model.cuda()
model.eval()
print(cp['step'])

# set the model stepsize (reduction factor)
if 'r' in cp:
    print(cp['r'])
    model.decoder.set_r(cp['r'])
In [ ]:
model.decoder.max_decoder_steps = 3000
attn_scores = []
with open(SENTENCES_PATH, 'r') as f:
    for text in f:
        text = text.strip()  # drop the trailing newline before synthesis
        if not text:
            continue
        attn_score = tts(model, text, CONFIG, use_cuda, ap)
        attn_scores.append(attn_score)
In [ ]:
np.mean(attn_scores)
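Because the mean attention score is a single scalar per checkpoint, repeating the loop above for several checkpoints gives a simple model-selection criterion. A hypothetical sketch, with placeholder checkpoint paths:
In [ ]:
# Hypothetical: score several checkpoints and keep the best one.
checkpoint_paths = ['checkpoint_100000.pth', 'checkpoint_200000.pth']  # placeholders
results = {}
for ckpt_path in checkpoint_paths:
    cp = torch.load(ckpt_path, map_location='cpu', weights_only=True)
    model.load_state_dict(cp['model'])
    if 'r' in cp:
        model.decoder.set_r(cp['r'])
    model.eval()
    with open(SENTENCES_PATH, 'r') as f:
        scores = [tts(model, line.strip(), CONFIG, use_cuda, ap)
                  for line in f if line.strip()]
    results[ckpt_path] = np.mean(scores)
print(results)
print(" > best checkpoint:", max(results, key=results.get))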