coqui-tts/notebooks/PlayGround.ipynb

In [2]:
%load_ext autoreload
%autoreload 2
import os
import sys
import io
import torch 
import numpy as np
from collections import OrderedDict

%pylab inline
rcParams["figure.figsize"] = (16,5)
sys.path.append('/home/erogol/projects/')

import librosa
import librosa.display

from TTS.models.tacotron import Tacotron 
from TTS.layers import *
from TTS.utils.data import *
from TTS.utils.audio import AudioProcessor
from TTS.utils.generic_utils import load_config
from TTS.utils.text import text_to_sequence

import IPython
from IPython.display import Audio
from utils import *
Populating the interactive namespace from numpy and matplotlib
In [3]:
def tts(model, text, CONFIG, use_cuda, ap, figures=True):                                                                
    waveform, alignment, spectrogram = create_speech(model, text, CONFIG, use_cuda, ap) 
    if figures:                                                                                                         
        visualize(alignment, spectrogram, CONFIG)                                                                       
    IPython.display.display(Audio(waveform, rate=CONFIG.sample_rate))  
    return alignment
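The helper only plays the waveform inline; a minimal sketch (my addition, not part of the original notebook) for also writing it to disk as 16-bit PCM, assuming the waveform is a float numpy array in [-1, 1]:

from scipy.io import wavfile

def save_wav(waveform, path, sample_rate):
    # clip to the valid range and convert to int16 before writing
    wav = np.clip(waveform, -1.0, 1.0)
    wavfile.write(path, sample_rate, (wav * 32767).astype(np.int16))

# e.g. save_wav(waveform, OUT_FOLDER + 'sample.wav', CONFIG.sample_rate)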
In [4]:
ROOT_PATH = '../result/January-26-2018_09:10AM/'
MODEL_PATH = os.path.join(ROOT_PATH, 'checkpoint_27600.pth.tar')
CONFIG_PATH = os.path.join(ROOT_PATH, 'config.json')
OUT_FOLDER = os.path.join(ROOT_PATH, 'test/')
CONFIG = load_config(CONFIG_PATH)
use_cuda = False
In [5]:
# load the model
model = Tacotron(CONFIG.embedding_size, CONFIG.hidden_size,
        CONFIG.num_mels, CONFIG.num_freq, CONFIG.r)
ap = AudioProcessor(CONFIG.sample_rate, CONFIG.num_mels, CONFIG.min_level_db,
                    CONFIG.frame_shift_ms, CONFIG.frame_length_ms, CONFIG.preemphasis,
                    CONFIG.ref_level_db, CONFIG.num_freq, CONFIG.power, griffin_lim_iters=80)         

# load the checkpoint on CPU so it works with or without a GPU
cp = torch.load(MODEL_PATH, map_location=lambda storage, loc: storage)

if use_cuda:
    model = torch.nn.DataParallel(model.cuda())
else:
    # remove the DataParallel wrapper: strip the `module.` prefix from parameter names
    new_state_dict = OrderedDict()
    for k, v in cp['model'].items():
        name = k[7:]  # remove `module.`
        new_state_dict[name] = v
    cp['model'] = new_state_dict
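A slightly more defensive variant (my sketch, not in the original) strips the `module.` prefix only when a key actually starts with it:

cp['model'] = OrderedDict(
    (k[len('module.'):] if k.startswith('module.') else k, v)
    for k, v in cp['model'].items())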
In [6]:
model.load_state_dict(cp['model'])
# model.decoder.eval(); ##
model.encoder.eval();
model.postnet.eval();
# model.eval();
In [7]:
sentences = [
    "I try to speak my friend.",
    "I speak more than binary any more.",
    "I try ti implement a new TTS system."
]
In [8]:
# tts(model, sentences[2], CONFIG, use_cuda, ap)
In [9]:
import pandas as pd
df = pd.read_csv('/data/shared/KeithIto/LJSpeech-1.0/metadata.csv', delimiter='|')
print(df.shape)
(13099, 3)
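LJSpeech's metadata.csv has no header row, so the call above silently consumes the first transcript line as column names (which would explain the 13099-row count); a sketch with explicit, made-up column names:

df = pd.read_csv('/data/shared/KeithIto/LJSpeech-1.0/metadata.csv', sep='|',
                 header=None, names=['id', 'text', 'normalized_text'])
print(df.shape)  # one more row than above, since nothing is treated as a header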
In [28]:
a = torch.randn(32, 10)
b = torch.nn.functional.softmax(torch.autograd.Variable(a), dim=-1)
b.size()
Out[28]:
torch.Size([32, 10])
In [33]:
b.sum(1)
Out[33]:
Variable containing:
 1.0000
 (1.0000 repeated for all 32 rows)
[torch.FloatTensor of size 32]
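The per-row sums are all 1.0, as expected for a softmax over the last dimension; a compact check (my addition) instead of scanning the printed column:

np.allclose(b.sum(1).data.numpy(), 1.0)  # True when every row sums to one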
In [13]:
align = tts(model, df.iloc[1, 1], CONFIG, use_cuda, ap)
print(df.iloc[1, 1])
torch.Size([1, 156])
torch.Size([1, 156])
For although the Chinese took impressions from wood blocks engraved in relief for centuries before the woodcutters of the Netherlands, by a similar process
[figure: attention alignment and spectrogram plots]
In [39]:
np.unique(align)
Out[39]:
array([ 1.], dtype=float32)
In [33]:
# import IPython
# import glob

# # wav_files = glob.glob(OUT_FOLDER+'/**/*.wav', recursive=True)
# # assert len(wav_files) > 0
# # IPython.display.Audio(wav_files[1])
# IPython.display.display(IPython.display.Audio(wav, rate=CONFIG.sample_rate))