coqui-tts/notebooks/ReadArticle.ipynb


This notebook reads a given article by feeding each sentence to the network individually, with no state passed between sentences. You can also compare different checkpoints below.

In [1]:
%load_ext autoreload
%autoreload 2
import os
import sys
import io
import torch 
import time
import numpy as np
from collections import OrderedDict
from matplotlib import pylab as plt

%pylab inline
rcParams["figure.figsize"] = (16,5)
sys.path.append('/home/erogol/Projects/')  # change this if you didn't install TTS via setup.py

import librosa
import librosa.display

from TTS.models.tacotron import Tacotron 
from TTS.layers import *
from TTS.utils.data import *
from TTS.utils.audio import AudioProcessor
from TTS.utils.generic_utils import load_config
from TTS.utils.text import text_to_sequence

import IPython
from IPython.display import Audio
from synthesis import *
Populating the interactive namespace from numpy and matplotlib
/home/erogol/miniconda3/lib/python3.6/site-packages/IPython/core/magics/pylab.py:160: UserWarning: pylab import has clobbered these variables: ['plt']
`%matplotlib` prevents importing * from pylab and numpy
  "\n`%matplotlib` prevents importing * from pylab and numpy"
In [2]:
def tts(model, text, CONFIG, use_cuda, ap, figures=True):
    # synthesize a single sentence; create_speech also returns the alignment,
    # spectrogram and stop tokens, but only the waveform is used here
    waveform, alignment, spectrogram, stop_tokens = create_speech(model, text, CONFIG, use_cuda, ap)
    return waveform

def text2audio(text, model, CONFIG, use_cuda, ap):
    wavs = []
    # naive sentence split on '.'; see the more robust splitter sketched below
    for sen in text.split('.'):
        if len(sen) < 3:
            continue
        sen = sen.strip() + '.'
        print(sen)
        wav = tts(model, sen, CONFIG, use_cuda, ap)
        wavs.append(wav)
        # pad a short silence (10000 samples) between sentences
        wavs.append(np.zeros(10000))
    return wavs
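Splitting on '.' alone drops question and exclamation marks and trips over abbreviations. A minimal regex-based splitter that keeps the terminating punctuation could look like this (the helper name and pattern are illustrative, not part of TTS):

import re

def split_sentences(text):
    # split after '.', '!' or '?' followed by whitespace, keeping the punctuation
    return [s.strip() for s in re.split(r'(?<=[.!?])\s+', text) if len(s.strip()) > 2]

split_sentences("Voice is natural, voice is human. Is it? Yes!")
# ['Voice is natural, voice is human.', 'Is it?', 'Yes!']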
In [3]:
# Set constants
ROOT_PATH = '/home/erogol/Projects/runs/local_runs/September-26-2018_06+55PM-TTS-attn-smoothing-bgs-sigmoid-wd-231607a/'
MODEL_PATH_TMP = os.path.join(ROOT_PATH, 'checkpoint_{}.pth.tar')
# MODEL_PATH_TMP = os.path.join(ROOT_PATH, 'best_model.pth.tar')
CONFIG_PATH = os.path.join(ROOT_PATH, 'config.json')
OUT_FOLDER = os.path.join(ROOT_PATH, 'test')
CONFIG = load_config(CONFIG_PATH)
use_cuda = True
In [4]:
# Try different checkpoints
check_idxs = [150000, 200000, 250000, 300000, 350000, 400000]
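Instead of hard-coding the steps, the available checkpoints can be discovered from disk. A sketch assuming the checkpoint_<step>.pth.tar naming used above:

import os
import glob
import re

def list_checkpoint_steps(root_path):
    # collect the training step of every checkpoint_<step>.pth.tar under root_path
    steps = []
    for path in glob.glob(os.path.join(root_path, 'checkpoint_*.pth.tar')):
        match = re.search(r'checkpoint_(\d+)\.pth\.tar$', path)
        if match:
            steps.append(int(match.group(1)))
    return sorted(steps)

# check_idxs = list_checkpoint_steps(ROOT_PATH)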
In [5]:
# build the model (weights are loaded per checkpoint below)
model = Tacotron(CONFIG.embedding_size, CONFIG.num_freq, CONFIG.num_mels, CONFIG.r)

# load the audio processor
ap = AudioProcessor(CONFIG.sample_rate, CONFIG.num_mels, CONFIG.min_level_db,
                    CONFIG.frame_shift_ms, CONFIG.frame_length_ms,
                    CONFIG.ref_level_db, CONFIG.num_freq, CONFIG.power,
                    CONFIG.preemphasis, 60)  # 60 Griffin-Lim iterations

for idx in check_idxs:
    MODEL_PATH = MODEL_PATH_TMP.format(idx)
    print(MODEL_PATH)
    
    # load model state
    if use_cuda:
        cp = torch.load(MODEL_PATH)
    else:
        cp = torch.load(MODEL_PATH, map_location=lambda storage, loc: storage)

    # load the checkpoint weights into the model
    model.load_state_dict(cp['model'])
    if use_cuda:
        model.cuda()
    model.eval()

    model.decoder.max_decoder_steps = 400  # cap decoder steps to avoid run-away generation
    text = "Voice is natural, voice is human. That's why we are fascinated with creating usable voice technology for our machines. But to create voice systems, an extremely large amount of voice data is required. Most of the data used by large companies isn't available to the majority of people. We think that stifles innovation. So we've launched Project Common Voice, a project to help make voice recognition open to everyone."
#     text = "Does the quick brown fox jump over the lazy dog?"
    wavs = text2audio(text, model, CONFIG, use_cuda, ap)

    audio = np.concatenate(wavs)
    IPython.display.display(Audio(audio, rate=CONFIG.sample_rate))  
    ap.save_wav(audio, 'benchmark_samples/CommonVoice_{}.wav'.format(idx))  # one file per checkpoint
 | > Number of characters : 149
 > Setting up Audio Processor...
 | > fft size: 2048, hop length: 275, win length: 1100
/home/erogol/Projects/runs/local_runs/September-26-2018_06+55PM-TTS-attn-smoothing-bgs-sigmoid-wd-231607a//best_model.pth.tar
Voice is natural, voice is human.
That's why we are fascinated with creating usable voice technology for our machines.
But to create voice systems, an extremely large amount of voice data is required.
Most of the data used by large companies isn't available to the majority of people.
We think that stifles innovation.
So we've launched Project Common Voice, a project to help make voice recognition open to everyone.
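The tts helper above discards the alignment and spectrogram that create_speech returns; plotting them is a quick way to judge attention quality when comparing checkpoints. A minimal sketch reusing the names defined above (the array orientation may need a transpose depending on the model's output shapes):

waveform, alignment, spectrogram, stop_tokens = create_speech(
    model, "Voice is natural, voice is human.", CONFIG, use_cuda, ap)

plt.figure(figsize=(16, 5))
plt.subplot(1, 2, 1)
plt.imshow(alignment.T, aspect='auto', origin='lower', interpolation='none')
plt.title('attention alignment')
plt.xlabel('decoder step')
plt.ylabel('encoder step')
plt.subplot(1, 2, 2)
plt.imshow(spectrogram.T, aspect='auto', origin='lower')
plt.title('predicted spectrogram')
plt.tight_layout()
plt.show()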