coqui-tts/notebooks/PlayGround.ipynb

In [2]:
%load_ext autoreload
%autoreload 2
import os
import sys
import io
import torch 
import numpy as np
from collections import OrderedDict

%pylab inline
rcParams["figure.figsize"] = (16,5)
sys.path.append('/home/erogol/projects/')

import librosa
import librosa.display

from TTS.models.tacotron import Tacotron 
from TTS.layers import *
from TTS.utils.data import *
from TTS.utils.audio import AudioProcessor
from TTS.utils.generic_utils import load_config
from TTS.utils.text import text_to_sequence

import IPython
from IPython.display import Audio
from utils import *
Populating the interactive namespace from numpy and matplotlib
In [3]:
def tts(model, text, CONFIG, use_cuda, ap, figures=True):                                                                
    waveform, alignment, spectrogram = create_speech(model, text, CONFIG, use_cuda, ap) 
    if figures:                                                                                                         
        visualize(alignment, spectrogram, CONFIG)                                                                       
    IPython.display.display(Audio(waveform, rate=CONFIG.sample_rate))  
    return alignment
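The helper only plays the waveform inline; a minimal sketch (my addition, not part of the original notebook) for also writing it to disk as 16-bit PCM, assuming the waveform is a float numpy array in [-1, 1]:

from scipy.io import wavfile

def save_wav(waveform, path, sample_rate):
    # clip to the valid range and convert to int16 before writing
    wav = np.clip(waveform, -1.0, 1.0)
    wavfile.write(path, sample_rate, (wav * 32767).astype(np.int16))

# e.g. save_wav(waveform, OUT_FOLDER + 'sample.wav', CONFIG.sample_rate)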
In [4]:
ROOT_PATH = '../result/January-26-2018_09:10AM/'
MODEL_PATH = os.path.join(ROOT_PATH, 'checkpoint_27600.pth.tar')
CONFIG_PATH = os.path.join(ROOT_PATH, 'config.json')
OUT_FOLDER = os.path.join(ROOT_PATH, 'test/')
CONFIG = load_config(CONFIG_PATH)
use_cuda = False
In [5]:
# load the model
model = Tacotron(CONFIG.embedding_size, CONFIG.hidden_size,
        CONFIG.num_mels, CONFIG.num_freq, CONFIG.r)
ap = AudioProcessor(CONFIG.sample_rate, CONFIG.num_mels, CONFIG.min_level_db,
                    CONFIG.frame_shift_ms, CONFIG.frame_length_ms, CONFIG.preemphasis,
                    CONFIG.ref_level_db, CONFIG.num_freq, CONFIG.power, griffin_lim_iters=80)         

# load the checkpoint on CPU so it works with or without a GPU
cp = torch.load(MODEL_PATH, map_location=lambda storage, loc: storage)

if use_cuda:
    model = torch.nn.DataParallel(model.cuda())
else:
    # remove the DataParallel wrapper: strip the `module.` prefix from parameter names
    new_state_dict = OrderedDict()
    for k, v in cp['model'].items():
        name = k[7:]  # remove `module.`
        new_state_dict[name] = v
    cp['model'] = new_state_dict
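A slightly more defensive variant (my sketch, not in the original) strips the `module.` prefix only when a key actually starts with it:

cp['model'] = OrderedDict(
    (k[len('module.'):] if k.startswith('module.') else k, v)
    for k, v in cp['model'].items())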
In [6]:
model.load_state_dict(cp['model'])
# model.decoder.eval(); ##
model.encoder.eval();
model.postnet.eval();
# model.eval();
In [7]:
sentences = [
    "I try to speak my friend.",
    "I speak more than binary any more.",
    "I try ti implement a new TTS system."
]
In [8]:
# tts(model, sentences[2], CONFIG, use_cuda, ap)
In [9]:
import pandas as pd
df = pd.read_csv('/data/shared/KeithIto/LJSpeech-1.0/metadata.csv', delimiter='|')
print(df.shape)
(13099, 3)
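LJSpeech's metadata.csv has no header row, so the call above silently consumes the first transcript line as column names (which would explain the 13099-row count); a sketch with explicit, made-up column names:

df = pd.read_csv('/data/shared/KeithIto/LJSpeech-1.0/metadata.csv', sep='|',
                 header=None, names=['id', 'text', 'normalized_text'])
print(df.shape)  # one more row than above, since nothing is treated as a header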
In [28]:
a = torch.randn(32, 10)
b = torch.nn.functional.softmax(torch.autograd.Variable(a), dim=-1)
b.size()
Out[28]:
torch.Size([32, 10])
In [33]:
b.sum(1)
Out[33]:
Variable containing:
 1.0000
 (1.0000 repeated for all 32 rows)
[torch.FloatTensor of size 32]
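The per-row sums are all 1.0, as expected for a softmax over the last dimension; a compact check (my addition) instead of scanning the printed column:

np.allclose(b.sum(1).data.numpy(), 1.0)  # True when every row sums to one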
In [13]:
align = tts(model, df.iloc[1, 1], CONFIG, use_cuda, ap)
print(df.iloc[1, 1])
torch.Size([1, 156])
torch.Size([1, 156])
For although the Chinese took impressions from wood blocks engraved in relief for centuries before the woodcutters of the Netherlands, by a similar process
[figure: attention alignment and spectrogram plots]
In [39]:
np.unique(align)
Out[39]:
array([ 1.], dtype=float32)
In [33]:
# import IPython
# import glob

# # wav_files = glob.glob(OUT_FOLDER+'/**/*.wav', recursive=True)
# # assert len(wav_files) > 0
# # IPython.display.Audio(wav_files[1])
# IPython.display.display(IPython.display.Audio(wav, rate=CONFIG.sample_rate))