coqui-tts/notebooks/ReadArticle.ipynb

In [ ]:
%load_ext autoreload
%autoreload 2
import os
import sys
import io
import torch 
import time
import numpy as np
from collections import OrderedDict
from matplotlib import pylab as plt

%pylab inline
rcParams["figure.figsize"] = (16,5)
sys.path.append('/home/erogol/projects/')

import librosa
import librosa.display

from TTS.models.tacotron import Tacotron 
from TTS.layers import *
from TTS.utils.data import *
from TTS.utils.audio import AudioProcessor
from TTS.utils.generic_utils import load_config
from TTS.utils.text import text_to_sequence

import IPython
from IPython.display import Audio
from utils import *
In [ ]:
ls /data/shared/erogol_models/May-22-2018_03:24PM-loc-sen-attn-e6112f7
In [ ]:
def tts(model, text, CONFIG, use_cuda, ap, figures=True):
    # Synthesize a single sentence and return only the waveform.
    waveform, alignment, spectrogram, stop_tokens = create_speech(model, text, CONFIG, use_cuda, ap)
    return waveform

def text2audio(text, model, CONFIG, use_cuda, ap):
    # Split the article into sentences, synthesize each one, and
    # append a short block of silence between sentences.
    wavs = []
    for sen in text.split('.'):
        sen = sen.strip()
        if len(sen) < 3:
            continue
        sen += '.'
        print(sen)
        wav = tts(model, sen, CONFIG, use_cuda, ap)
        wavs.append(wav)
        wavs.append(np.zeros(10000))  # silence padding between sentences
    return wavs
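The `tts` wrapper above discards the alignment, spectrogram, and stop tokens that `create_speech` returns, and the `figures` flag is never used. As a rough sketch (assuming `create_speech` returns the alignment and spectrogram as 2-D numpy arrays), they could be visualized like this:
In [ ]:
def plot_tts_outputs(model, text, CONFIG, use_cuda, ap):
    # Hypothetical helper: synthesize one sentence and plot the
    # attention alignment and predicted spectrogram side by side.
    waveform, alignment, spectrogram, stop_tokens = create_speech(model, text, CONFIG, use_cuda, ap)
    fig, axes = plt.subplots(1, 2, figsize=(16, 5))
    axes[0].imshow(alignment.T, aspect='auto', origin='lower', interpolation='none')
    axes[0].set_title('attention alignment')
    axes[1].imshow(spectrogram.T, aspect='auto', origin='lower', interpolation='none')
    axes[1].set_title('predicted spectrogram')
    plt.show()
    return waveform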
In [ ]:
# Set constants
ROOT_PATH = '/data/shared/erogol_models/May-22-2018_03:24PM-loc-sen-attn-e6112f7'
MODEL_PATH_TMP = ROOT_PATH + '/checkpoint_{}.pth.tar'
CONFIG_PATH = ROOT_PATH + '/config.json'
OUT_FOLDER = ROOT_PATH + '/test/'
CONFIG = load_config(CONFIG_PATH)
use_cuda = True
In [ ]:
# check_idxs = [50008, 100016, 200032, 266208]
check_idxs = [274480]
In [ ]:
# instantiate the model
model = Tacotron(CONFIG.embedding_size, CONFIG.num_freq, CONFIG.num_mels, CONFIG.r)

# instantiate the audio processor
ap = AudioProcessor(CONFIG.sample_rate, CONFIG.num_mels, CONFIG.min_level_db,
                    CONFIG.frame_shift_ms, CONFIG.frame_length_ms, CONFIG.preemphasis,
                    CONFIG.ref_level_db, CONFIG.num_freq, CONFIG.power, griffin_lim_iters=30)


for idx in check_idxs:
    MODEL_PATH = MODEL_PATH_TMP.format(idx)
    print(MODEL_PATH)
    
    # load model state
    if use_cuda:
        cp = torch.load(MODEL_PATH)
    else:
        cp = torch.load(MODEL_PATH, map_location=lambda storage, loc: storage)

    # load the checkpoint weights into the model
    model.load_state_dict(cp['model'])
    if use_cuda:
        model.cuda()
    model.eval()

    model.decoder.max_decoder_steps = 400  # allow longer outputs for full sentences
    text = "Voice is natural, voice is human. That's why we are fascinated with creating usable voice technology for our machines. But to create voice systems, an extremely large amount of voice data is required. Most of the data used by large companies isn't available to the majority of people. We think that stifles innovation. So we've launched Project Common Voice, a project to help make voice recognition open to everyone."
    wavs = text2audio(text, model, CONFIG, use_cuda, ap)

    audio = np.concatenate(wavs)
    IPython.display.display(Audio(audio, rate=CONFIG.sample_rate))  
    os.makedirs('benchmark_samples', exist_ok=True)
    ap.save_wav(audio, 'benchmark_samples/CommonVoice.wav')
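To keep each sentence as its own file instead of only the concatenated clip, the `wavs` list can be written out individually. A minimal sketch, assuming `ap.save_wav` accepts any 1-D float array and that the all-zero entries are the silence padding inserted by `text2audio`:
In [ ]:
# Hypothetical: save every synthesized sentence separately,
# skipping the all-zero silence arrays added between sentences.
for i, wav in enumerate(wavs):
    if wav.any():
        ap.save_wav(wav, 'benchmark_samples/CommonVoice_sentence_{}.wav'.format(i))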