coqui-tts/notebooks/ReadArticle.ipynb

1.7 MiB
Raw Blame History

None <html lang="en"> <head> </head>
In [1]:
%load_ext autoreload
%autoreload 2
import os
import sys
import io
import torch 
import time
import numpy as np
from collections import OrderedDict
from matplotlib import pylab as plt

%pylab inline
rcParams["figure.figsize"] = (16,5)
sys.path.append('/home/erogol/projects/')

import librosa
import librosa.display

from TTS.models.tacotron import Tacotron 
from TTS.layers import *
from TTS.utils.data import *
from TTS.utils.audio import AudioProcessor
from TTS.utils.generic_utils import load_config
from TTS.utils.text import text_to_sequence

import IPython
from IPython.display import Audio
from utils import *
Populating the interactive namespace from numpy and matplotlib
/home/erogol/miniconda3/envs/pytorch4/lib/python3.6/site-packages/IPython/core/magics/pylab.py:160: UserWarning: pylab import has clobbered these variables: ['plt']
`%matplotlib` prevents importing * from pylab and numpy
  "\n`%matplotlib` prevents importing * from pylab and numpy"
In [2]:
ls /data/shared/erogol_models/May-23-2018_06:20AM-loc-sen-attn-bug-fix-debug
best_model.pth.tar        checkpoint_41360.pth.tar
checkpoint_10152.pth.tar  checkpoint_41736.pth.tar
checkpoint_10528.pth.tar  checkpoint_42112.pth.tar
checkpoint_10904.pth.tar  checkpoint_42488.pth.tar
checkpoint_1128.pth.tar   checkpoint_42864.pth.tar
checkpoint_11280.pth.tar  checkpoint_43240.pth.tar
checkpoint_11656.pth.tar  checkpoint_43616.pth.tar
checkpoint_12032.pth.tar  checkpoint_43992.pth.tar
checkpoint_12408.pth.tar  checkpoint_44368.pth.tar
checkpoint_12784.pth.tar  checkpoint_44744.pth.tar
checkpoint_13160.pth.tar  checkpoint_4512.pth.tar
checkpoint_13536.pth.tar  checkpoint_45120.pth.tar
checkpoint_13912.pth.tar  checkpoint_45496.pth.tar
checkpoint_14288.pth.tar  checkpoint_45872.pth.tar
checkpoint_14664.pth.tar  checkpoint_46248.pth.tar
checkpoint_1504.pth.tar   checkpoint_46624.pth.tar
checkpoint_15040.pth.tar  checkpoint_47000.pth.tar
checkpoint_15416.pth.tar  checkpoint_47376.pth.tar
checkpoint_15792.pth.tar  checkpoint_47752.pth.tar
checkpoint_16168.pth.tar  checkpoint_48128.pth.tar
checkpoint_16544.pth.tar  checkpoint_48504.pth.tar
checkpoint_16920.pth.tar  checkpoint_4888.pth.tar
checkpoint_17296.pth.tar  checkpoint_48880.pth.tar
checkpoint_17672.pth.tar  checkpoint_49256.pth.tar
checkpoint_18048.pth.tar  checkpoint_49632.pth.tar
checkpoint_18424.pth.tar  checkpoint_50008.pth.tar
checkpoint_1880.pth.tar   checkpoint_50384.pth.tar
checkpoint_18800.pth.tar  checkpoint_50760.pth.tar
checkpoint_19176.pth.tar  checkpoint_51136.pth.tar
checkpoint_19552.pth.tar  checkpoint_51512.pth.tar
checkpoint_19928.pth.tar  checkpoint_51888.pth.tar
checkpoint_20304.pth.tar  checkpoint_52264.pth.tar
checkpoint_20680.pth.tar  checkpoint_5264.pth.tar
checkpoint_21056.pth.tar  checkpoint_52640.pth.tar
checkpoint_21432.pth.tar  checkpoint_53016.pth.tar
checkpoint_21808.pth.tar  checkpoint_53392.pth.tar
checkpoint_22184.pth.tar  checkpoint_53768.pth.tar
checkpoint_2256.pth.tar   checkpoint_54144.pth.tar
checkpoint_22560.pth.tar  checkpoint_54520.pth.tar
checkpoint_22936.pth.tar  checkpoint_54896.pth.tar
checkpoint_23312.pth.tar  checkpoint_55272.pth.tar
checkpoint_23688.pth.tar  checkpoint_55648.pth.tar
checkpoint_24064.pth.tar  checkpoint_56024.pth.tar
checkpoint_24440.pth.tar  checkpoint_5640.pth.tar
checkpoint_24816.pth.tar  checkpoint_56400.pth.tar
checkpoint_25192.pth.tar  checkpoint_56776.pth.tar
checkpoint_25568.pth.tar  checkpoint_57152.pth.tar
checkpoint_25944.pth.tar  checkpoint_57528.pth.tar
checkpoint_2632.pth.tar   checkpoint_57904.pth.tar
checkpoint_26320.pth.tar  checkpoint_58280.pth.tar
checkpoint_26696.pth.tar  checkpoint_58656.pth.tar
checkpoint_27072.pth.tar  checkpoint_59032.pth.tar
checkpoint_27448.pth.tar  checkpoint_59408.pth.tar
checkpoint_27824.pth.tar  checkpoint_59784.pth.tar
checkpoint_28200.pth.tar  checkpoint_6016.pth.tar
checkpoint_28576.pth.tar  checkpoint_60160.pth.tar
checkpoint_28952.pth.tar  checkpoint_60536.pth.tar
checkpoint_29328.pth.tar  checkpoint_60912.pth.tar
checkpoint_29704.pth.tar  checkpoint_61288.pth.tar
checkpoint_3008.pth.tar   checkpoint_61664.pth.tar
checkpoint_30080.pth.tar  checkpoint_62040.pth.tar
checkpoint_30456.pth.tar  checkpoint_62416.pth.tar
checkpoint_30832.pth.tar  checkpoint_62792.pth.tar
checkpoint_31208.pth.tar  checkpoint_63168.pth.tar
checkpoint_31584.pth.tar  checkpoint_63544.pth.tar
checkpoint_31960.pth.tar  checkpoint_6392.pth.tar
checkpoint_32336.pth.tar  checkpoint_63920.pth.tar
checkpoint_32712.pth.tar  checkpoint_64296.pth.tar
checkpoint_33088.pth.tar  checkpoint_64672.pth.tar
checkpoint_33464.pth.tar  checkpoint_65048.pth.tar
checkpoint_3384.pth.tar   checkpoint_65424.pth.tar
checkpoint_33840.pth.tar  checkpoint_65800.pth.tar
checkpoint_34216.pth.tar  checkpoint_66176.pth.tar
checkpoint_34592.pth.tar  checkpoint_66552.pth.tar
checkpoint_34968.pth.tar  checkpoint_66928.pth.tar
checkpoint_35344.pth.tar  checkpoint_67304.pth.tar
checkpoint_35720.pth.tar  checkpoint_6768.pth.tar
checkpoint_36096.pth.tar  checkpoint_67680.pth.tar
checkpoint_36472.pth.tar  checkpoint_68056.pth.tar
checkpoint_36848.pth.tar  checkpoint_68432.pth.tar
checkpoint_37224.pth.tar  checkpoint_68808.pth.tar
checkpoint_376.pth.tar    checkpoint_69184.pth.tar
checkpoint_3760.pth.tar   checkpoint_7144.pth.tar
checkpoint_37600.pth.tar  checkpoint_752.pth.tar
checkpoint_37976.pth.tar  checkpoint_7520.pth.tar
checkpoint_38352.pth.tar  checkpoint_7896.pth.tar
checkpoint_38728.pth.tar  checkpoint_8272.pth.tar
checkpoint_39104.pth.tar  checkpoint_8648.pth.tar
checkpoint_39480.pth.tar  checkpoint_9024.pth.tar
checkpoint_39856.pth.tar  checkpoint_9400.pth.tar
checkpoint_40232.pth.tar  checkpoint_9776.pth.tar
checkpoint_40608.pth.tar  checkpoints/
checkpoint_40984.pth.tar  config.json
checkpoint_4136.pth.tar   events.out.tfevents.1527081612.mlc1
In [3]:
def tts(model, text, CONFIG, use_cuda, ap, figures=True):
    """Synthesize one sentence and return only its waveform.

    `figures` is accepted for interface compatibility but unused here.
    NOTE(review): relies on `create_speech` from the star-imported `utils`
    module; assumed return order is (waveform, alignment, spectrogram,
    stop_tokens) — confirm against utils.
    """
    waveform, _, _, _ = create_speech(model, text, CONFIG, use_cuda, ap)
    return waveform

def text2audio(text, model, CONFIG, use_cuda, ap):
    """Split `text` into sentences on '.' and synthesize each one.

    Returns a list of numpy arrays: one waveform per sentence, each
    followed by 10000 samples of silence as an inter-sentence spacer.
    The caller is expected to ``np.concatenate`` the list before playback.
    """
    wavs = []
    for sen in text.split('.'):
        # Strip BEFORE the length check: the original checked length first,
        # so a whitespace-only fragment of >= 3 chars slipped through and
        # was synthesized as a bare ".".
        sen = sen.strip()
        if len(sen) < 3:
            continue
        sen += '.'  # restore the terminator removed by split()
        print(sen)
        wav = tts(model, sen, CONFIG, use_cuda, ap)
        wavs.append(wav)
        wavs.append(np.zeros(10000))  # silence spacer between sentences
    return wavs
In [4]:
# --- Experiment constants -------------------------------------------------
# NOTE(review): absolute path into a shared model store; adjust per host.
ROOT_PATH = '/data/shared/erogol_models/May-23-2018_06:20AM-loc-sen-attn-bug-fix-debug'
# Fill the placeholder with a training-iteration number to get a checkpoint.
MODEL_PATH_TMP = os.path.join(ROOT_PATH, 'checkpoint_{}.pth.tar')
CONFIG_PATH = os.path.join(ROOT_PATH, 'config.json')
OUT_FOLDER = os.path.join(ROOT_PATH, 'test/')
CONFIG = load_config(CONFIG_PATH)
use_cuda = True
In [5]:
# Checkpoint iteration(s) to audition; add more ids to compare training
# stages, e.g. [50008, 100016, 200032, 266208].
check_idxs = [69184]
In [6]:
# Build an untrained Tacotron shell; weights are loaded per checkpoint below.
model = Tacotron(CONFIG.embedding_size, CONFIG.num_freq, CONFIG.num_mels, CONFIG.r)

# Audio processor: spectrogram <-> waveform conversion.
# griffin_lim_iters=30 trades synthesis speed for phase-reconstruction quality.
ap = AudioProcessor(CONFIG.sample_rate, CONFIG.num_mels, CONFIG.min_level_db,
                    CONFIG.frame_shift_ms, CONFIG.frame_length_ms, CONFIG.preemphasis,
                    CONFIG.ref_level_db, CONFIG.num_freq, CONFIG.power, griffin_lim_iters=30)


# Synthesize the sample paragraph with every selected checkpoint so the
# audio quality of different training stages can be compared by ear.
for idx in check_idxs:
    MODEL_PATH = MODEL_PATH_TMP.format(idx)
    print(MODEL_PATH)

    # Load the checkpoint dict; map tensors onto CPU when CUDA is unavailable,
    # since the checkpoints were saved from a GPU run.
    if use_cuda:
        cp = torch.load(MODEL_PATH)
    else:
        cp = torch.load(MODEL_PATH, map_location=lambda storage, loc: storage)

    # Restore weights and switch to inference mode (disables dropout etc.).
    model.load_state_dict(cp['model'])
    if use_cuda:
        model.cuda()
    model.eval()

    # Cap autoregressive decoding so a failed stop-token never loops forever.
    model.decoder.max_decoder_steps = 400
    text = "Voice is natural, voice is human. Thats why we are fascinated with creating usable voice technology for our machines. But to create voice systems, an extremely large amount of voice data is required. Most of the data used by large companies isnt available to the majority of people. We think that stifles innovation. So weve launched Project Common Voice, a project to help make voice recognition open to everyone."
    wavs = text2audio(text, model, CONFIG, use_cuda, ap)

    # Join per-sentence waveforms (with their silence spacers) and render an
    # inline audio player for this checkpoint.
    audio = np.concatenate(wavs)
    IPython.display.display(Audio(audio, rate=CONFIG.sample_rate))
 | > Number of characters : 149
/data/shared/erogol_models/May-23-2018_06:20AM-loc-sen-attn-bug-fix-debug/checkpoint_69184.pth.tar
Voice is natural, voice is human.
Thats why we are fascinated with creating usable voice technology for our machines.
But to create voice systems, an extremely large amount of voice data is required.
Most of the data used by large companies isnt available to the majority of people.
We think that stifles innovation.
So weve launched Project Common Voice, a project to help make voice recognition open to everyone.
Your browser does not support the audio element.
</html>