coqui-tts/Benchmark.ipynb at a757c6240e56b1002fdbdc0f02d52f774cd9f65e

13 KiB

Raw Blame History

None <html lang="en"> <head> </head>

This is to test TTS models with benchmark sentences for speech synthesis.

Before running this script please DON'T FORGET:

to set file paths.
to download related model files from TTS and WaveRNN.
to check out the right commit versions (given next to each model) of TTS and WaveRNN.
to set the right paths in the cell below.

Repositories:

TTS: https://github.com/mozilla/TTS
WaveRNN: https://github.com/erogol/WaveRNN

In [ ]:

# Directories appended to sys.path by the import cell. The notebook imports
# `TTS....` and `WaveRNN....` as packages, so each path has to be the folder
# that CONTAINS the respective checkout, not the checkout itself.
TTS_PATH = "/home/erogol/projects/"
WAVERNN_PATH = "/home/erogol/projects/"

In [ ]:

%load_ext autoreload
%autoreload 2
import os
import sys
import io
import torch 
import time
import json
import numpy as np
from collections import OrderedDict
from matplotlib import pylab as plt

%pylab inline
rcParams["figure.figsize"] = (16,5)

# add libraries into environment
sys.path.append(TTS_PATH) # set this if TTS is not installed globally
sys.path.append(WAVERNN_PATH) # set this if WaveRNN is not installed globally

import librosa
import librosa.display

from TTS.models.tacotron import Tacotron 
from TTS.layers import *
from TTS.utils.data import *
from TTS.utils.audio import AudioProcessor
from TTS.utils.generic_utils import load_config, setup_model
from TTS.utils.text import text_to_sequence
from TTS.utils.synthesis import synthesis
from TTS.utils.visual import visualize

import IPython
from IPython.display import Audio

import os

# Expose only the GPU with index 1 to this process.
# NOTE(review): this only takes effect if it runs before CUDA is first
# initialised in the kernel - confirm the cell order when re-running.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

In [ ]:

def tts(model, text, CONFIG, use_cuda, ap, use_gl, figures=True):
    """Synthesize ``text``, play it inline and save it as a wav file.

    Besides its arguments this function reads notebook-level globals:
    ``speaker_id`` and ``OUT_FOLDER`` always, and ``wavernn``, ``ap_vocoder``
    and ``batched_wavernn`` when ``use_gl`` is false. The cells defining them
    must have been executed before calling this function.

    Args:
        model: TTS model handed to ``synthesis``.
        text: Sentence to synthesize; also used to build the output file name.
        CONFIG: Loaded TTS config; ``model``, ``enable_eos_bos_chars`` and
            ``audio['sample_rate']`` are read from it.
        use_cuda: Forwarded to ``synthesis``.
        ap: Audio processor of the TTS model.
        use_gl: If true, keep the waveform returned by ``synthesis``;
            otherwise re-generate the waveform with the WaveRNN vocoder.
        figures: If true, plot alignment / spectrograms via ``visualize``.

    Returns:
        Tuple ``(alignment, mel_postnet_spec, stop_tokens, waveform)``.
    """
    t_1 = time.time()
    # NOTE(review): the positional `False` is passed through to `synthesis`
    # unchanged; its meaning is defined by that function - confirm there.
    waveform, alignment, mel_spec, mel_postnet_spec, stop_tokens = synthesis(
        model, text, CONFIG, use_cuda, ap, speaker_id, False,
        CONFIG.enable_eos_bos_chars)
    if not use_gl:
        if CONFIG.model == "Tacotron":
            # correct the normalization differences b/w TTS and the Vocoder.
            mel_postnet_spec = ap.out_linear_to_mel(mel_postnet_spec.T).T
            mel_postnet_spec = ap._denormalize(mel_postnet_spec)
            mel_postnet_spec = ap_vocoder._normalize(mel_postnet_spec)
        # Put the vocoder input on whatever device the vocoder lives on
        # instead of forcing CUDA, so model and input can never disagree.
        vocoder_device = next(wavernn.parameters()).device
        vocoder_input = torch.FloatTensor(mel_postnet_spec.T).unsqueeze(0).to(vocoder_device)
        waveform = wavernn.generate(vocoder_input, batched=batched_wavernn, target=11000, overlap=550)

    # Reported time covers synthesis plus (if used) the vocoder pass.
    print(" >  Run-time: {}".format(time.time() - t_1))
    if figures:
        visualize(alignment, mel_postnet_spec, stop_tokens, text, ap.hop_length, CONFIG, mel_spec)
    IPython.display.display(Audio(waveform, rate=CONFIG.audio['sample_rate']))
    os.makedirs(OUT_FOLDER, exist_ok=True)
    # The file name is derived from the sentence: spaces -> "_", dots dropped.
    # Path separators are replaced too, otherwise a "/" in the text would
    # point the output into a sub-directory that was never created.
    file_name = text.replace(" ", "_").replace(".", "").replace(os.sep, "_") + ".wav"
    out_path = os.path.join(OUT_FOLDER, file_name)
    ap.save_wav(waveform, out_path)
    return alignment, mel_postnet_spec, stop_tokens, waveform

In [ ]:

# Set constants
# ROOT_PATH is the training-run folder holding the TTS checkpoint and config.
ROOT_PATH = '/media/erogol/data_ssd/Models/libri_tts/5049/'
# os.path.join avoids the doubled "/" that plain concatenation produced,
# because ROOT_PATH already ends with a separator.
MODEL_PATH = os.path.join(ROOT_PATH, 'best_model.pth.tar')
CONFIG_PATH = os.path.join(ROOT_PATH, 'config.json')
# Folder where tts() writes the generated wav files.
OUT_FOLDER = '/home/erogol/Dropbox/AudioSamples/benchmark_samples/'
CONFIG = load_config(CONFIG_PATH)
VOCODER_MODEL_PATH = "/media/erogol/data_ssd/Models/wavernn/universal/4910/best_model_16K.pth.tar"
VOCODER_CONFIG_PATH = "/media/erogol/data_ssd/Models/wavernn/universal/4910/config_16K.json"
VOCODER_CONFIG = load_config(VOCODER_CONFIG_PATH)
# Controls whether the TTS model (and its checkpoint) is placed on the GPU.
use_cuda = False

# Set some config fields manually for testing
# CONFIG.windowing = False
# CONFIG.prenet_dropout = False
# CONFIG.separate_stopnet = True
# CONFIG.use_forward_attn = True
# CONFIG.forward_attn_mask = True
# CONFIG.stopnet = True

# Set the vocoder
use_gl = True # use GL if True
batched_wavernn = True    # use batched wavernn inference if True

In [ ]:

# LOAD TTS MODEL
# Import through the `TTS` package like every other TTS import in this
# notebook; a bare `utils...` import only resolves when the TTS checkout
# itself (not its parent folder) happens to be on sys.path.
from TTS.utils.text.symbols import symbols, phonemes

# multi speaker
if CONFIG.use_speaker_embedding:
    # Context manager so the file handle is closed right after parsing.
    with open(f"{ROOT_PATH}/speakers.json", 'r') as speakers_file:
        speakers = json.load(speakers_file)
    speakers_idx_to_id = {v: k for k, v in speakers.items()}
    # NOTE: `speaker_id` is NOT set in this branch; it must be assigned
    # (see the benchmark cells) before tts() is called.
else:
    speakers = []
    speaker_id = None

# build the model
num_chars = len(phonemes) if CONFIG.use_phonemes else len(symbols)
model = setup_model(num_chars, len(speakers), CONFIG)

# load the audio processor
ap = AudioProcessor(**CONFIG.audio)

# load model state; without CUDA, remap all stored tensors onto the CPU
if use_cuda:
    cp = torch.load(MODEL_PATH)
else:
    cp = torch.load(MODEL_PATH, map_location=lambda storage, loc: storage)

# restore the weights and switch to inference mode
model.load_state_dict(cp['model'])
if use_cuda:
    model.cuda()
model.eval()
print(cp['step'])

# set model stepsize
# The print now sits inside the guard: checkpoints without an 'r' entry used
# to raise KeyError on the print before the guard was ever reached.
if 'r' in cp:
    print(cp['r'])
    model.decoder.set_r(cp['r'])

In [ ]:

# LOAD WAVERNN
# Only needed when the vocoder (rather than the waveform returned by
# synthesis) is used, i.e. when `use_gl` is false.
if not use_gl:
    from WaveRNN.models.wavernn import Model
    from WaveRNN.utils.audio import AudioProcessor as AudioProcessorVocoder
    # NOTE(review): `bits` is not referenced anywhere in the visible cells;
    # kept in case other cells rely on it.
    bits = 10
    ap_vocoder = AudioProcessorVocoder(**VOCODER_CONFIG.audio)
    # The vocoder is always placed on the GPU, independent of `use_cuda`.
    wavernn = Model(
            rnn_dims=512,
            fc_dims=512,
            mode=VOCODER_CONFIG.mode,
            mulaw=VOCODER_CONFIG.mulaw,
            pad=VOCODER_CONFIG.pad,
            upsample_factors=VOCODER_CONFIG.upsample_factors,
            feat_dims=VOCODER_CONFIG.audio["num_mels"],
            compute_dims=128,
            res_out_dims=128,
            res_blocks=10,
            hop_length=ap_vocoder.hop_length,
            sample_rate=ap_vocoder.sample_rate,
            use_upsample_net = True,
            use_aux_net = True
        ).cuda()

    # Deserialize onto the CPU first: the checkpoint may reference a CUDA
    # device index that is not visible in this process (the notebook limits
    # CUDA_VISIBLE_DEVICES). load_state_dict then copies the weights into the
    # model's own (GPU) parameters.
    check = torch.load(VOCODER_MODEL_PATH, map_location="cpu")
    # NOTE(review): strict=False silently ignores missing/unexpected keys.
    wavernn.load_state_dict(check['model'], strict=False)
    wavernn.eval()
    print(check['step'])

Comparison with https://mycroft.ai/blog/available-voices/¶

In [ ]:

# Inference mode plus a raised decoder step limit for long sentences.
# NOTE(review): `max_decoder_steps` is presumably the upper bound on decoder
# iterations - confirm in the TTS decoder.
model.eval()
model.decoder.max_decoder_steps = 2000
# Read by tts() as a notebook global; stays in effect for all later cells.
speaker_id = 500
sentence = "Bill got in the habit of asking himself “Is that thought true?” and if he wasn’t absolutely certain it was, he just let it go."
align, spec, stop_tokens, wav = tts(
    model, sentence, CONFIG, use_cuda, ap,
    use_gl=use_gl, figures=True,
)

In [ ]:

# Re-applies inference mode and the decoder step limit so this cell also
# works when it is run on its own.
model.eval()
model.decoder.max_decoder_steps = 2000
sentence = "Seine Fuerenden Berater hatten Donald Trump seit Wochen beschworen, berichteten US-Medien: Lassen Sie das mit den Zoellen bleiben."
align, spec, stop_tokens, wav = tts(
    model, sentence, CONFIG, use_cuda, ap,
    use_gl=use_gl, figures=True,
)

In [ ]:

# German benchmark sentence; results are bound to align/spec/stop_tokens/wav.
sentence = "Der Klimawandel bedroht die Gletscher im Himalaya."
align, spec, stop_tokens, wav = tts(
    model, sentence, CONFIG, use_cuda, ap,
    use_gl=use_gl, figures=True,
)

In [ ]:

# NOTE(review): this cell used to carry the remark "'echo' is not in training
# set", but the sentence below does not contain that word - the remark looks
# like a leftover from an earlier sentence.
sentence = "Zwei Unternehmen verlieren einem Medienbericht zufolge ihre Verträge als Maut-Inkasso-Manager."
align, spec, stop_tokens, wav = tts(
    model, sentence, CONFIG, use_cuda, ap,
    use_gl=use_gl, figures=True,
)

In [ ]:

# Long German sentence containing a dash and umlauts.
sentence = "Eine Ausländermaut nach dem Geschmack der CSU wird es nicht geben - das bedauert außerhalb der Partei fast niemand."
align, spec, stop_tokens, wav = tts(
    model, sentence, CONFIG, use_cuda, ap,
    use_gl=use_gl, figures=True,
)

In [ ]:

# Short German sentence with a proper name.
sentence = "Angela Merkel ist als Klimakanzlerin gestartet."
align, spec, stop_tokens, wav = tts(
    model, sentence, CONFIG, use_cuda, ap,
    use_gl=use_gl, figures=True,
)

Comparison with https://keithito.github.io/audio-samples/¶

In [ ]:

# Very short German sentence.
sentence = "Dann vernachlässigte sie das Thema."
align, spec, stop_tokens, wav = tts(
    model, sentence, CONFIG, use_cuda, ap,
    use_gl=use_gl, figures=True,
)

In [ ]:

# German sentence with several comma-separated clauses.
sentence = "Nun, kurz vor dem Ende, will sie damit noch einmal neu anfangen."
align, spec, stop_tokens, wav = tts(
    model, sentence, CONFIG, use_cuda, ap,
    use_gl=use_gl, figures=True,
)

In [ ]:

# German compound sentence joined by "und".
sentence = "Nun ist der Spieltempel pleite, und manchen Dorfbewohnern fehlt das Geld zum Essen."
align, spec, stop_tokens, wav = tts(
    model, sentence, CONFIG, use_cuda, ap,
    use_gl=use_gl, figures=True,
)

In [ ]:

# German sentence with a proper name and a long compound noun.
sentence = "Andrea Nahles will in der Fraktion die Vertrauensfrage stellen."
align, spec, stop_tokens, wav = tts(
    model, sentence, CONFIG, use_cuda, ap,
    use_gl=use_gl, figures=True,
)

In [ ]:

# German sentence with umlauts.
sentence = "Die Erfolge der Grünen bringen eine Reihe Unerfahrener in die Parlamente."
align, spec, stop_tokens, wav = tts(
    model, sentence, CONFIG, use_cuda, ap,
    use_gl=use_gl, figures=True,
)

Comparison with https://google.github.io/tacotron/publications/tacotron/index.html ¶

In [ ]:

# German sentence containing an alphanumeric token ("CO2-neutral").
sentence = "Die Luftfahrtbranche arbeitet daran, CO2-neutral zu werden."
align, spec, stop_tokens, wav = tts(
    model, sentence, CONFIG, use_cuda, ap,
    use_gl=use_gl, figures=True,
)

In [ ]:

# German sentence with a proper name and an infinitive clause.
sentence = "Michael Kretschmer versucht seit Monaten, die Bürger zu umgarnen."
align, spec, stop_tokens, wav = tts(
    model, sentence, CONFIG, use_cuda, ap,
    use_gl=use_gl, figures=True,
)

In [ ]:

# !zip benchmark_samples/samples.zip benchmark_samples/*

</html>

13 KiB Raw Blame History Unescape Escape

Comparison with https://mycroft.ai/blog/available-voices/¶

Comparison with https://keithito.github.io/audio-samples/¶

Comparison with https://google.github.io/tacotron/publications/tacotron/index.html¶

13 KiB

Raw Blame History

Comparison with https://google.github.io/tacotron/publications/tacotron/index.html ¶