In [1]:
%load_ext autoreload
%autoreload 2
import os
import sys
import io
import torch
import numpy as np
from collections import OrderedDict
%pylab inline
rcParams["figure.figsize"] = (16, 5)

sys.path.append('/home/erogol/projects/')
import librosa
import librosa.display

from TTS.models.tacotron import Tacotron
from TTS.layers import *
from TTS.utils.data import *
from TTS.utils.audio import AudioProcessor
from TTS.utils.generic_utils import load_config
from TTS.utils.text import text_to_sequence

import IPython
from IPython.display import Audio

from utils import *
Populating the interactive namespace from numpy and matplotlib
In [2]:
def tts(model, text, CONFIG, use_cuda, ap, figures=True):
    waveform, alignment, spectrogram = create_speech(model, text, CONFIG, use_cuda, ap)
    if figures:
        visualize(alignment, spectrogram, CONFIG)
    IPython.display.display(Audio(waveform, rate=CONFIG.sample_rate))
    return alignment, spectrogram
In [12]:
ROOT_PATH = '../result/February-03-2018_08:32PM/'
MODEL_PATH = ROOT_PATH + '/checkpoint_3800.pth.tar'
CONFIG_PATH = ROOT_PATH + '/config.json'
OUT_FOLDER = ROOT_PATH + '/test/'
CONFIG = load_config(CONFIG_PATH)
use_cuda = False
In [13]:
# load the model
model = Tacotron(CONFIG.embedding_size, CONFIG.hidden_size,
                 CONFIG.num_mels, CONFIG.num_freq, CONFIG.r)
ap = AudioProcessor(CONFIG.sample_rate, CONFIG.num_mels, CONFIG.min_level_db,
                    CONFIG.frame_shift_ms, CONFIG.frame_length_ms, CONFIG.preemphasis,
                    CONFIG.ref_level_db, CONFIG.num_freq, CONFIG.power,
                    griffin_lim_iters=80)

if use_cuda:
    model = torch.nn.DataParallel(model.cuda())
else:
    cp = torch.load(MODEL_PATH, map_location=lambda storage, loc: storage)
    # remove DataParallel wrapper
    new_state_dict = OrderedDict()
    for k, v in cp['model'].items():
        name = k[7:]  # remove 'module.' prefix
        new_state_dict[name] = v
    cp['model'] = new_state_dict
| > Embedding dim : 149
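The checkpoint was saved from a model wrapped in torch.nn.DataParallel, so every key in its state dict carries a 'module.' prefix that must be stripped before it can be loaded into an unwrapped model. A small reusable sketch of that step (the helper name is mine, not from this repo):

from collections import OrderedDict

def strip_data_parallel_prefix(state_dict, prefix='module.'):
    # return a copy of the state dict with the DataParallel prefix removed
    stripped = OrderedDict()
    for key, value in state_dict.items():
        stripped[key[len(prefix):] if key.startswith(prefix) else key] = value
    return stripped

# equivalent to the loop above:
# cp['model'] = strip_data_parallel_prefix(cp['model'])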
In [14]:
model.load_state_dict(cp['model'])
# model.decoder.eval();
# model.encoder.eval();
# model.postnet.eval();
# model.eval();
In [15]:
model.decoder.prenet.dropout.training
Out[15]:
True
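The True above confirms that the prenet dropout is still in training mode, since model.eval() was left commented out; Tacotron-style decoders are often run with prenet dropout active at synthesis time. If the rest of the network should be in eval mode while only that dropout stays stochastic, one way to do it (assuming the attribute path used in the cell above) is:

model.eval()                                  # put the whole model into inference mode
model.decoder.prenet.dropout.train()          # re-enable dropout sampling in the prenet only
print(model.decoder.prenet.dropout.training)  # -> True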
In [16]:
sentences = [
    "I try to speak my friend.",
    "I speak more than binary any more.",
    "I try to implement a new TTS system."
]
In [8]:
# tts(model, sentences[2], CONFIG, use_cuda, ap)
In [17]:
import pandas as pd
df = pd.read_csv('/data/shared/KeithIto/LJSpeech-1.0/metadata.csv', delimiter='|')
print(df.shape)
(13099, 3)
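LJSpeech's metadata.csv has no header row, so the read_csv call above promotes the first utterance to the header, which is why the shape reports 13,099 rows instead of the dataset's 13,100. A sketch that keeps every row (the column names are my own labels, not part of the dataset):

df = pd.read_csv('/data/shared/KeithIto/LJSpeech-1.0/metadata.csv',
                 delimiter='|', header=None,
                 names=['id', 'text', 'normalized_text'])
print(df.shape)  # expected: (13100, 3)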
In [18]:
align = tts(model, df.iloc[1, 1], CONFIG, use_cuda, ap)
print(df.iloc[1, 1])
Warning! doesn't seems to be converged
(audio player output)
For although the Chinese took impressions from wood blocks engraved in relief for centuries before the woodcutters of the Netherlands, by a similar process
In [11]:
ap._denormalize(align[1]).min()
Out[11]:
-100.0
In [12]:
ap._normalize(align[1]).min()
Out[12]:
0.0
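The two minima line up with a linear dB normalization: the spectrogram is mapped from [min_level_db, 0] dB onto [0, 1] with clipping, so _denormalize bottoms out at min_level_db (-100 dB in this config) and _normalize bottoms out at 0. A standalone sketch of that mapping; the exact formula inside AudioProcessor is an assumption on my part, not copied from the repo:

import numpy as np

MIN_LEVEL_DB = -100  # CONFIG.min_level_db in this setup

def normalize(S_db):
    # map [min_level_db, 0] dB -> [0, 1], clipping anything below the floor
    return np.clip((S_db - MIN_LEVEL_DB) / -MIN_LEVEL_DB, 0, 1)

def denormalize(S_norm):
    # inverse mapping: [0, 1] -> [min_level_db, 0] dB
    return np.clip(S_norm, 0, 1) * -MIN_LEVEL_DB + MIN_LEVEL_DB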
In [13]:
# import IPython
# import glob
# wav_files = glob.glob(OUT_FOLDER + '/**/*.wav', recursive=True)
# assert len(wav_files) > 0
# IPython.display.Audio(wav_files[1])
# IPython.display.display(IPython.display.Audio(wav, rate=CONFIG.sample_rate))
In [ ]:
In [ ]: