coqui-tts/notebooks/ReadArticle.ipynb

In [ ]:
%load_ext autoreload
%autoreload 2
import os
import sys
import io
import torch 
import time
import numpy as np
from collections import OrderedDict
from matplotlib import pylab as plt

%pylab inline
rcParams["figure.figsize"] = (16,5)
sys.path.append('/home/erogol/projects/')

import librosa
import librosa.display

from TTS.models.tacotron import Tacotron 
from TTS.layers import *
from TTS.utils.data import *
from TTS.utils.audio import AudioProcessor
from TTS.utils.generic_utils import load_config
from TTS.utils.text import text_to_sequence

import IPython
from IPython.display import Audio
from utils import *
In [ ]:
ls /data/shared/erogol_models/May-22-2018_03:24PM-loc-sen-attn-e6112f7
In [ ]:
def tts(model, text, CONFIG, use_cuda, ap, figures=True):
    # Synthesize a single sentence and return only the waveform.
    waveform, alignment, spectrogram, stop_tokens = create_speech(model, text, CONFIG, use_cuda, ap)
    return waveform

def text2audio(text, model, CONFIG, use_cuda, ap):
    # Split the article into sentences, synthesize each one, and
    # append a short block of silence between sentences.
    wavs = []
    for sen in text.split('.'):
        sen = sen.strip()
        if len(sen) < 3:
            continue
        sen += '.'
        print(sen)
        wav = tts(model, sen, CONFIG, use_cuda, ap)
        wavs.append(wav)
        wavs.append(np.zeros(10000))  # silence padding between sentences
    return wavs
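The `tts` wrapper above discards the alignment, spectrogram, and stop tokens that `create_speech` returns, and the `figures` flag is never used. As a rough sketch (assuming `create_speech` returns the alignment and spectrogram as 2-D numpy arrays), they could be visualized like this:
In [ ]:
def plot_tts_outputs(model, text, CONFIG, use_cuda, ap):
    # Hypothetical helper: synthesize one sentence and plot the
    # attention alignment and predicted spectrogram side by side.
    waveform, alignment, spectrogram, stop_tokens = create_speech(model, text, CONFIG, use_cuda, ap)
    fig, axes = plt.subplots(1, 2, figsize=(16, 5))
    axes[0].imshow(alignment.T, aspect='auto', origin='lower', interpolation='none')
    axes[0].set_title('attention alignment')
    axes[1].imshow(spectrogram.T, aspect='auto', origin='lower', interpolation='none')
    axes[1].set_title('predicted spectrogram')
    plt.show()
    return waveform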
In [ ]:
# Set constants
ROOT_PATH = '/data/shared/erogol_models/May-22-2018_03:24PM-loc-sen-attn-e6112f7'
MODEL_PATH_TMP = ROOT_PATH + '/checkpoint_{}.pth.tar'
CONFIG_PATH = ROOT_PATH + '/config.json'
OUT_FOLDER = ROOT_PATH + '/test/'
CONFIG = load_config(CONFIG_PATH)
use_cuda = True
In [ ]:
# check_idxs = [50008, 100016, 200032, 266208]
check_idxs = [274480]
In [ ]:
# instantiate the model
model = Tacotron(CONFIG.embedding_size, CONFIG.num_freq, CONFIG.num_mels, CONFIG.r)

# instantiate the audio processor
ap = AudioProcessor(CONFIG.sample_rate, CONFIG.num_mels, CONFIG.min_level_db,
                    CONFIG.frame_shift_ms, CONFIG.frame_length_ms, CONFIG.preemphasis,
                    CONFIG.ref_level_db, CONFIG.num_freq, CONFIG.power, griffin_lim_iters=30)


for idx in check_idxs:
    MODEL_PATH = MODEL_PATH_TMP.format(idx)
    print(MODEL_PATH)
    
    # load model state
    if use_cuda:
        cp = torch.load(MODEL_PATH)
    else:
        cp = torch.load(MODEL_PATH, map_location=lambda storage, loc: storage)

    # load the checkpoint weights into the model
    model.load_state_dict(cp['model'])
    if use_cuda:
        model.cuda()
    model.eval()

    model.decoder.max_decoder_steps = 400  # allow longer outputs for full sentences
    text = "Voice is natural, voice is human. That's why we are fascinated with creating usable voice technology for our machines. But to create voice systems, an extremely large amount of voice data is required. Most of the data used by large companies isn't available to the majority of people. We think that stifles innovation. So we've launched Project Common Voice, a project to help make voice recognition open to everyone."
    wavs = text2audio(text, model, CONFIG, use_cuda, ap)

    audio = np.concatenate(wavs)
    IPython.display.display(Audio(audio, rate=CONFIG.sample_rate))  
    os.makedirs('benchmark_samples', exist_ok=True)
    ap.save_wav(audio, 'benchmark_samples/CommonVoice.wav')
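To keep each sentence as its own file instead of only the concatenated clip, the `wavs` list can be written out individually. A minimal sketch, assuming `ap.save_wav` accepts any 1-D float array and that the all-zero entries are the silence padding inserted by `text2audio`:
In [ ]:
# Hypothetical: save every synthesized sentence separately,
# skipping the all-zero silence arrays added between sentences.
for i, wav in enumerate(wavs):
    if wav.any():
        ap.save_wav(wav, 'benchmark_samples/CommonVoice_sentence_{}.wav'.format(i))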