coqui-tts/Benchmark.ipynb at 6fd61e82b0965381cb6905b57ebf5048f641e71e

17 KiB

Raw Blame History

None <html lang="en"> <head> </head>

This is to test TTS models with benchmark sentences for speech synthesis.

Before running this script please DON'T FORGET:

to set the file paths.
to download the related model files for TTS and WaveRNN.
to check out the right commit versions (given next to the model) of TTS and WaveRNN.
to set the right paths in the cell below.

Repositories:

TTS: https://github.com/mozilla/TTS
WaveRNN: https://github.com/erogol/WaveRNN

In [ ]:

# Directory that contains the `TTS` package checkout; appended to sys.path so `import TTS...` resolves.
TTS_PATH = "/home/erogol/projects/"
# Directory that contains the `WaveRNN` package checkout; appended to sys.path so `import WaveRNN...` resolves.
WAVERNN_PATH ="/home/erogol/projects/"

In [ ]:

%load_ext autoreload
%autoreload 2
import os
import sys
import io
import torch 
import time
import json
import numpy as np
from collections import OrderedDict
from matplotlib import pylab as plt

%pylab inline
rcParams["figure.figsize"] = (16,5)

# Make the TTS and WaveRNN checkouts importable from this notebook.
sys.path.append(TTS_PATH) # set this if TTS is not installed globally
sys.path.append(WAVERNN_PATH) # set this if WaveRNN is not installed globally

import librosa
import librosa.display

from TTS.models.tacotron import Tacotron 
from TTS.layers import *
from TTS.utils.data import *
from TTS.utils.audio import AudioProcessor
from TTS.utils.generic_utils import load_config, setup_model
from TTS.utils.text import text_to_sequence
from TTS.utils.synthesis import synthesis
from TTS.utils.visual import visualize

import IPython
from IPython.display import Audio

import os
# Expose only the GPU with index 1 to this process.
# NOTE(review): presumably this has to run before the first CUDA call to take effect — confirm.
os.environ['CUDA_VISIBLE_DEVICES']='1'

In [ ]:

def tts(model, text, CONFIG, use_cuda, ap, use_gl, figures=True):
    """Synthesize `text`, play it inline and save it as a wav file in OUT_FOLDER.

    Besides its arguments, this function reads the notebook globals
    `speaker_id` and `OUT_FOLDER` and, when `use_gl` is False, `ap_vocoder`,
    `wavernn` and `batched_wavernn`. They must be defined by earlier cells
    before the first call.

    Args:
        model: TTS model handed to `synthesis`.
        text: Sentence to synthesize; also used to build the output file name.
        CONFIG: TTS config; `model`, `enable_eos_bos_chars` and
            `audio['sample_rate']` are read here.
        use_cuda: Forwarded to `synthesis`.
        ap: Audio processor that belongs to the TTS model.
        use_gl: If True, the waveform returned by `synthesis` is used as is.
            If False, the waveform is regenerated from the spectrogram with
            the WaveRNN vocoder.
        figures: If True, plot the alignment, spectrograms and stop tokens
            through `visualize`.

    Returns:
        Tuple `(alignment, mel_postnet_spec, stop_tokens, waveform)`. When
        `use_gl` is False, `mel_postnet_spec` is the spectrogram after it has
        been re-normalized for the vocoder.
    """
    t_1 = time.time()
    waveform, alignment, mel_spec, mel_postnet_spec, stop_tokens = synthesis(
        model, text, CONFIG, use_cuda, ap, speaker_id, False, CONFIG.enable_eos_bos_chars)
    if not use_gl:
        # Everything in this branch is vocoder-only. `ap_vocoder` and `wavernn`
        # exist only when the WaveRNN cell ran with use_gl == False, so touching
        # them on the Griffin-Lim path would raise a NameError.
        if CONFIG.model == "Tacotron":
            # Convert the postnet output with `ap.out_linear_to_mel` before vocoding.
            mel_postnet_spec = ap.out_linear_to_mel(mel_postnet_spec.T).T
        # Correct the normalization differences b/w TTS and the vocoder.
        mel_postnet_spec = ap._denormalize(mel_postnet_spec)
        mel_postnet_spec = ap_vocoder._normalize(mel_postnet_spec)
        # Feed the vocoder on whatever device its weights live on instead of
        # hard-coding CUDA, so the input always matches the model.
        vocoder_device = next(wavernn.parameters()).device
        vocoder_input = torch.FloatTensor(mel_postnet_spec.T).unsqueeze(0).to(vocoder_device)
        waveform = wavernn.generate(vocoder_input, batched=batched_wavernn, target=8000, overlap=400)

    print(" >  Run-time: {}".format(time.time() - t_1))
    if figures:
        visualize(alignment, mel_postnet_spec, stop_tokens, text, ap.hop_length, CONFIG, mel_spec)
    IPython.display.display(Audio(waveform, rate=CONFIG.audio['sample_rate']))
    os.makedirs(OUT_FOLDER, exist_ok=True)
    # The file name is derived from the sentence: spaces become underscores
    # and periods are dropped; other punctuation is kept as is.
    file_name = text.replace(" ", "_").replace(".","") + ".wav"
    out_path = os.path.join(OUT_FOLDER, file_name)
    ap.save_wav(waveform, out_path)
    return alignment, mel_postnet_spec, stop_tokens, waveform

In [ ]:

# Set constants
# Directory of the TTS training run; the checkpoint and config.json are read from it.
ROOT_PATH = '/media/erogol/data_ssd/Models/libri_tts/5099/'
# ROOT_PATH already ends with a separator, so join the parts instead of
# concatenating '/...' (which produced a doubled '//' in the path).
MODEL_PATH = os.path.join(ROOT_PATH, 'best_model.pth.tar')
CONFIG_PATH = os.path.join(ROOT_PATH, 'config.json')
# Folder where tts() writes the synthesized wav files.
OUT_FOLDER = '/home/erogol/Dropbox/AudioSamples/benchmark_samples/'
CONFIG = load_config(CONFIG_PATH)
VOCODER_MODEL_PATH = "/media/erogol/data_ssd/Models/wavernn/ljspeech/mold_ljspeech_best_model/checkpoint_433000.pth.tar"
VOCODER_CONFIG_PATH = "/media/erogol/data_ssd/Models/wavernn/ljspeech/mold_ljspeech_best_model/config.json"
VOCODER_CONFIG = load_config(VOCODER_CONFIG_PATH)
# Controls where the TTS model is loaded and run; it is also forwarded to synthesis().
use_cuda = False

# Set some config fields manually for testing
# CONFIG.windowing = False
# CONFIG.prenet_dropout = False
# CONFIG.separate_stopnet = True
CONFIG.use_forward_attn = True
# CONFIG.forward_attn_mask = True
# CONFIG.stopnet = True

# Set the vocoder
use_gl = False # use GL if True
batched_wavernn = True    # use batched wavernn inference if True

In [ ]:

# LOAD TTS MODEL
# Import through the `TTS` package like every other TTS import in this notebook;
# a bare `utils...` import only resolves if the TTS folder itself is on sys.path.
from TTS.utils.text.symbols import symbols, phonemes

# multi speaker
if CONFIG.use_speaker_embedding:
    # Close the file deterministically instead of leaking the handle.
    with open(f"{ROOT_PATH}/speakers.json", 'r') as speakers_file:
        speakers = json.load(speakers_file)
    speakers_idx_to_id = {v: k for k, v in speakers.items()}
else:
    speakers = []
    speaker_id = None

# build the model; the input vocabulary depends on whether phonemes are used
num_chars = len(phonemes) if CONFIG.use_phonemes else len(symbols)
model = setup_model(num_chars, len(speakers), CONFIG)

# load the audio processor
ap = AudioProcessor(**CONFIG.audio)


# load model state
if use_cuda:
    cp = torch.load(MODEL_PATH)
else:
    # keep every tensor on the CPU when CUDA is not requested
    cp = torch.load(MODEL_PATH, map_location=lambda storage, loc: storage)

# restore the weights and switch to inference mode
model.load_state_dict(cp['model'])
if use_cuda:
    model.cuda()
model.eval()
print(cp['step'])

# set model stepsize
# Only checkpoints that store 'r' carry a reduction factor, so print and apply
# it under the same guard instead of raising KeyError on the others.
if 'r' in cp:
    print(cp['r'])
    model.decoder.set_r(cp['r'])

In [ ]:

# LOAD WAVERNN
# Same truthiness test that tts() uses, so both agree on when the vocoder is needed.
if not use_gl:
    from WaveRNN.models.wavernn import Model
    from WaveRNN.utils.audio import AudioProcessor as AudioProcessorVocoder
    bits = 10
    # Audio processor built from the vocoder's own audio settings; tts() uses
    # it to re-normalize spectrograms before vocoding.
    ap_vocoder = AudioProcessorVocoder(**VOCODER_CONFIG.audio)
    # NOTE(review): the vocoder is always placed on the GPU here, independent
    # of `use_cuda`; a CUDA device is therefore required for this cell.
    wavernn = Model(
            rnn_dims=512,
            fc_dims=512,
            mode=VOCODER_CONFIG.mode,
            mulaw=VOCODER_CONFIG.mulaw,
            pad=VOCODER_CONFIG.pad,
            upsample_factors=VOCODER_CONFIG.upsample_factors,
            feat_dims=VOCODER_CONFIG.audio["num_mels"],
            compute_dims=128,
            res_out_dims=128,
            res_blocks=10,
            hop_length=ap_vocoder.hop_length,
            sample_rate=ap_vocoder.sample_rate,
            use_upsample_net = True,
            use_aux_net = True
        ).cuda()

    check = torch.load(VOCODER_MODEL_PATH)
    # strict=False: keys that do not match between checkpoint and model are ignored.
    wavernn.load_state_dict(check['model'], strict=False)
    if use_cuda:
        wavernn.cuda()
    wavernn.eval()
    print(check['step'])

Comparison with https://mycroft.ai/blog/available-voices/¶

In [ ]:

# Put the model in evaluation mode and set the decoder step limit to 2000.
model.eval()
model.decoder.max_decoder_steps = 2000
# tts() reads `speaker_id` as a notebook global and forwards it to synthesis().
speaker_id = None
sentence =  "Bill got in the habit of asking himself “Is that thought true?” and if he wasn’t absolutely certain it was, he just let it go."
align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)

In [ ]:

# NOTE(review): this cell synthesizes the same sentence as the previous cell;
# presumably kept to compare two runs — confirm or remove.
model.eval()
model.decoder.max_decoder_steps = 2000
sentence =  "Bill got in the habit of asking himself “Is that thought true?” and if he wasn’t absolutely certain it was, he just let it go."
align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)

In [ ]:

sentence = "Be a voice, not an echo."  # 'echo' is not in training set. 
align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)

In [ ]:

sentence = "The human voice is the most perfect instrument of all."
align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)

In [ ]:

sentence = "I'm sorry Dave. I'm afraid I can't do that."
align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)

In [ ]:

sentence = "This cake is great. It's so delicious and moist."
align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)

Comparison with https://keithito.github.io/audio-samples/¶

In [ ]:

sentence = "Generative adversarial network or variational auto-encoder."
align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)

In [ ]:

sentence = "Scientists at the CERN laboratory say they have discovered a new particle."
align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)

In [ ]:

sentence = "Here’s a way to measure the acute emotional intelligence that has never gone out of style."
align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)

In [ ]:

sentence = "President Trump met with other leaders at the Group of 20 conference."
align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)

In [ ]:

sentence = "The buses aren't the problem, they actually provide a solution."
align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)

Comparison with https://google.github.io/tacotron/publications/tacotron/index.html ¶

In [ ]:

sentence = "Generative adversarial network or variational auto-encoder."
align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)

In [ ]:

sentence = "Basilar membrane and otolaryngology are not auto-correlations."
align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)

In [ ]:

sentence = " He has read the whole thing."
align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)

In [ ]:

sentence = "He reads books."
align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)

In [ ]:

sentence = "Thisss isrealy awhsome."
align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)

In [ ]:

sentence = "This is your internet browser, Firefox."
align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)

In [ ]:

sentence = "This is your internet browser Firefox."
align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)

In [ ]:

sentence = "The quick brown fox jumps over the lazy dog."
align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)

In [ ]:

sentence = "Does the quick brown fox jump over the lazy dog?"
align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)

In [ ]:

sentence = "Eren, how are you?"
align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)

Hard Sentences¶

In [ ]:

sentence = "Encouraged, he started with a minute a day."
align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)

In [ ]:

sentence = "His meditation consisted of “body scanning” which involved focusing his mind and energy on each section of the body from head to toe ."
align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)

In [ ]:

sentence = "Recent research at Harvard has shown meditating for as little as 8 weeks can actually increase the grey matter in the parts of the brain responsible for emotional regulation and learning . "
align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)

In [ ]:

sentence = "If he decided to watch TV he really watched it."
align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)

In [ ]:

sentence = "Often we try to bring about change through sheer effort and we put all of our energy into a new initiative ."
align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)

In [ ]:

# for twb dataset
sentence = "In our preparation for Easter, God in his providence offers us each year the season of Lent as a sacramental sign of our conversion."
align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=True)

</html>

17 KiB Raw Blame History Unescape Escape

Comparison with https://mycroft.ai/blog/available-voices/¶

Comparison with https://keithito.github.io/audio-samples/¶

Comparison with https://google.github.io/tacotron/publications/tacotron/index.html¶

Hard Sentences¶

17 KiB

Raw Blame History

Comparison with https://google.github.io/tacotron/publications/tacotron/index.html ¶