coqui-tts/notebooks/ReadArticle.ipynb


This notebook reads a given article by feeding each sentence to the network individually, with no state passed between sentences. You can also compare different checkpoints below.

In [1]:
%load_ext autoreload
%autoreload 2
import os
import sys
import io
import torch 
import time
import numpy as np
from collections import OrderedDict
from matplotlib import pylab as plt

%pylab inline
rcParams["figure.figsize"] = (16,5)
sys.path.append('/home/erogol/Projects/')  # change this if you didn't install TTS via setup.py

import librosa
import librosa.display

from TTS.models.tacotron import Tacotron 
from TTS.layers import *
from TTS.utils.data import *
from TTS.utils.audio import AudioProcessor
from TTS.utils.generic_utils import load_config
from TTS.utils.text import text_to_sequence

import IPython
from IPython.display import Audio
from synthesis import *
Populating the interactive namespace from numpy and matplotlib
/home/erogol/miniconda3/lib/python3.6/site-packages/IPython/core/magics/pylab.py:160: UserWarning: pylab import has clobbered these variables: ['plt']
`%matplotlib` prevents importing * from pylab and numpy
  "\n`%matplotlib` prevents importing * from pylab and numpy"
In [2]:
def tts(model, text, CONFIG, use_cuda, ap, figures=True):
    # synthesize a single sentence; create_speech also returns the alignment,
    # spectrogram and stop tokens, but only the waveform is used here
    waveform, alignment, spectrogram, stop_tokens = create_speech(model, text, CONFIG, use_cuda, ap)
    return waveform

def text2audio(text, model, CONFIG, use_cuda, ap):
    wavs = []
    # naive sentence split on '.'; see the more robust splitter sketched below
    for sen in text.split('.'):
        if len(sen) < 3:
            continue
        sen = sen.strip() + '.'
        print(sen)
        wav = tts(model, sen, CONFIG, use_cuda, ap)
        wavs.append(wav)
        # pad a short silence (10000 samples) between sentences
        wavs.append(np.zeros(10000))
    return wavs
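Splitting on '.' alone drops question and exclamation marks and trips over abbreviations. A minimal regex-based splitter that keeps the terminating punctuation could look like this (the helper name and pattern are illustrative, not part of TTS):

import re

def split_sentences(text):
    # split after '.', '!' or '?' followed by whitespace, keeping the punctuation
    return [s.strip() for s in re.split(r'(?<=[.!?])\s+', text) if len(s.strip()) > 2]

split_sentences("Voice is natural, voice is human. Is it? Yes!")
# ['Voice is natural, voice is human.', 'Is it?', 'Yes!']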
In [3]:
# Set constants
ROOT_PATH = '/home/erogol/Projects/runs/local_runs/September-26-2018_06+55PM-TTS-attn-smoothing-bgs-sigmoid-wd-231607a/'
MODEL_PATH_TMP = os.path.join(ROOT_PATH, 'checkpoint_{}.pth.tar')
# MODEL_PATH_TMP = os.path.join(ROOT_PATH, 'best_model.pth.tar')
CONFIG_PATH = os.path.join(ROOT_PATH, 'config.json')
OUT_FOLDER = os.path.join(ROOT_PATH, 'test')
CONFIG = load_config(CONFIG_PATH)
use_cuda = True
In [4]:
# Try different checkpoints
check_idxs = [150000, 200000, 250000, 300000, 350000, 400000]
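Instead of hard-coding the steps, the available checkpoints can be discovered from disk. A sketch assuming the checkpoint_<step>.pth.tar naming used above:

import os
import glob
import re

def list_checkpoint_steps(root_path):
    # collect the training step of every checkpoint_<step>.pth.tar under root_path
    steps = []
    for path in glob.glob(os.path.join(root_path, 'checkpoint_*.pth.tar')):
        match = re.search(r'checkpoint_(\d+)\.pth\.tar$', path)
        if match:
            steps.append(int(match.group(1)))
    return sorted(steps)

# check_idxs = list_checkpoint_steps(ROOT_PATH)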
In [5]:
# build the model (weights are loaded per checkpoint below)
model = Tacotron(CONFIG.embedding_size, CONFIG.num_freq, CONFIG.num_mels, CONFIG.r)

# load the audio processor
ap = AudioProcessor(CONFIG.sample_rate, CONFIG.num_mels, CONFIG.min_level_db,
                    CONFIG.frame_shift_ms, CONFIG.frame_length_ms,
                    CONFIG.ref_level_db, CONFIG.num_freq, CONFIG.power,
                    CONFIG.preemphasis, 60)  # 60 Griffin-Lim iterations

for idx in check_idxs:
    MODEL_PATH = MODEL_PATH_TMP.format(idx)
    print(MODEL_PATH)
    
    # load model state
    if use_cuda:
        cp = torch.load(MODEL_PATH)
    else:
        cp = torch.load(MODEL_PATH, map_location=lambda storage, loc: storage)

    # load the checkpoint weights into the model
    model.load_state_dict(cp['model'])
    if use_cuda:
        model.cuda()
    model.eval()

    model.decoder.max_decoder_steps = 400  # cap decoder steps to avoid run-away generation
    text = "Voice is natural, voice is human. That's why we are fascinated with creating usable voice technology for our machines. But to create voice systems, an extremely large amount of voice data is required. Most of the data used by large companies isn't available to the majority of people. We think that stifles innovation. So we've launched Project Common Voice, a project to help make voice recognition open to everyone."
#     text = "Does the quick brown fox jump over the lazy dog?"
    wavs = text2audio(text, model, CONFIG, use_cuda, ap)

    audio = np.concatenate(wavs)
    IPython.display.display(Audio(audio, rate=CONFIG.sample_rate))  
    ap.save_wav(audio, 'benchmark_samples/CommonVoice_{}.wav'.format(idx))  # one file per checkpoint
 | > Number of characters : 149
 > Setting up Audio Processor...
 | > fft size: 2048, hop length: 275, win length: 1100
/home/erogol/Projects/runs/local_runs/September-26-2018_06+55PM-TTS-attn-smoothing-bgs-sigmoid-wd-231607a//best_model.pth.tar
Voice is natural, voice is human.
That's why we are fascinated with creating usable voice technology for our machines.
But to create voice systems, an extremely large amount of voice data is required.
Most of the data used by large companies isn't available to the majority of people.
We think that stifles innovation.
So we've launched Project Common Voice, a project to help make voice recognition open to everyone.
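The tts helper above discards the alignment and spectrogram that create_speech returns; plotting them is a quick way to judge attention quality when comparing checkpoints. A minimal sketch reusing the names defined above (the array orientation may need a transpose depending on the model's output shapes):

waveform, alignment, spectrogram, stop_tokens = create_speech(
    model, "Voice is natural, voice is human.", CONFIG, use_cuda, ap)

plt.figure(figsize=(16, 5))
plt.subplot(1, 2, 1)
plt.imshow(alignment.T, aspect='auto', origin='lower', interpolation='none')
plt.title('attention alignment')
plt.xlabel('decoder step')
plt.ylabel('encoder step')
plt.subplot(1, 2, 2)
plt.imshow(spectrogram.T, aspect='auto', origin='lower')
plt.title('predicted spectrogram')
plt.tight_layout()
plt.show()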