In [1]:
%load_ext autoreload
%autoreload 2
import os
import sys
import io
import time
from collections import OrderedDict

import numpy as np
import torch

%pylab inline
rcParams["figure.figsize"] = (16, 5)

sys.path.append('/home/erogol/projects/')
import librosa
import librosa.display

from TTS.models.tacotron import Tacotron
from TTS.layers import *
from TTS.utils.data import *
from TTS.utils.audio import AudioProcessor
from TTS.utils.generic_utils import load_config
from TTS.utils.text import text_to_sequence

import IPython
from IPython.display import Audio

from utils import *
Populating the interactive namespace from numpy and matplotlib
In [2]:
def tts(model, text, CONFIG, use_cuda, ap, figures=True):
    t_1 = time.time()
    # synthesize: run the model and vocode the predicted spectrogram
    waveform, alignment, spectrogram = create_speech(model, text, CONFIG, use_cuda, ap)
    print(" > Run-time: {}".format(time.time() - t_1))
    if figures:
        # plot the attention alignment and the predicted spectrogram
        visualize(alignment, spectrogram, CONFIG)
    IPython.display.display(Audio(waveform, rate=CONFIG.sample_rate))
    return alignment, spectrogram
In [3]:
# Set constants
ROOT_PATH = '/home/erogol/projects/models/LJSpeech/April-13-2018_07:06PM-e00bc66/'
MODEL_PATH = ROOT_PATH + '/checkpoint_172960.pth.tar'
CONFIG_PATH = ROOT_PATH + '/config.json'
OUT_FOLDER = ROOT_PATH + '/test/'
CONFIG = load_config(CONFIG_PATH)
use_cuda = False
In [4]:
# build the model
model = Tacotron(CONFIG.embedding_size, CONFIG.num_freq, CONFIG.num_mels, CONFIG.r)

# load the audio processor
ap = AudioProcessor(CONFIG.sample_rate, CONFIG.num_mels, CONFIG.min_level_db,
                    CONFIG.frame_shift_ms, CONFIG.frame_length_ms, CONFIG.preemphasis,
                    CONFIG.ref_level_db, CONFIG.num_freq, CONFIG.power,
                    griffin_lim_iters=80)

# load model state; map tensors to CPU when CUDA is not used
if use_cuda:
    cp = torch.load(MODEL_PATH)
else:
    cp = torch.load(MODEL_PATH, map_location=lambda storage, loc: storage)

# set the weights and switch to inference mode
model.load_state_dict(cp['model'])
if use_cuda:
    model.cuda()
model.eval()
| > Number of characters : 149
Out[4]:
Tacotron(
  (embedding): Embedding(149, 256)
  (encoder): Encoder(
    (prenet): Prenet(
      (layers): ModuleList(
        (0): Linear(in_features=256, out_features=256)
        (1): Linear(in_features=256, out_features=128)
      )
      (relu): ReLU()
      (dropout): Dropout(p=0.5)
    )
    (cbhg): CBHG(
      (relu): ReLU()
      (conv1d_banks): ModuleList(
        (0): BatchNormConv1d((conv1d): Conv1d(128, 128, kernel_size=(1,), stride=(1,), bias=False) (bn): BatchNorm1d(128, eps=0.001, momentum=0.99, affine=True) (activation): ReLU())
        (1): BatchNormConv1d((conv1d): Conv1d(128, 128, kernel_size=(2,), stride=(1,), padding=(1,), bias=False) (bn): BatchNorm1d(128, eps=0.001, momentum=0.99, affine=True) (activation): ReLU())
        (2): BatchNormConv1d((conv1d): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(1,), bias=False) (bn): BatchNorm1d(128, eps=0.001, momentum=0.99, affine=True) (activation): ReLU())
        (3): BatchNormConv1d((conv1d): Conv1d(128, 128, kernel_size=(4,), stride=(1,), padding=(2,), bias=False) (bn): BatchNorm1d(128, eps=0.001, momentum=0.99, affine=True) (activation): ReLU())
        (4): BatchNormConv1d((conv1d): Conv1d(128, 128, kernel_size=(5,), stride=(1,), padding=(2,), bias=False) (bn): BatchNorm1d(128, eps=0.001, momentum=0.99, affine=True) (activation): ReLU())
        (5): BatchNormConv1d((conv1d): Conv1d(128, 128, kernel_size=(6,), stride=(1,), padding=(3,), bias=False) (bn): BatchNorm1d(128, eps=0.001, momentum=0.99, affine=True) (activation): ReLU())
        (6): BatchNormConv1d((conv1d): Conv1d(128, 128, kernel_size=(7,), stride=(1,), padding=(3,), bias=False) (bn): BatchNorm1d(128, eps=0.001, momentum=0.99, affine=True) (activation): ReLU())
        (7): BatchNormConv1d((conv1d): Conv1d(128, 128, kernel_size=(8,), stride=(1,), padding=(4,), bias=False) (bn): BatchNorm1d(128, eps=0.001, momentum=0.99, affine=True) (activation): ReLU())
        (8): BatchNormConv1d((conv1d): Conv1d(128, 128, kernel_size=(9,), stride=(1,), padding=(4,), bias=False) (bn): BatchNorm1d(128, eps=0.001, momentum=0.99, affine=True) (activation): ReLU())
        (9): BatchNormConv1d((conv1d): Conv1d(128, 128, kernel_size=(10,), stride=(1,), padding=(5,), bias=False) (bn): BatchNorm1d(128, eps=0.001, momentum=0.99, affine=True) (activation): ReLU())
        (10): BatchNormConv1d((conv1d): Conv1d(128, 128, kernel_size=(11,), stride=(1,), padding=(5,), bias=False) (bn): BatchNorm1d(128, eps=0.001, momentum=0.99, affine=True) (activation): ReLU())
        (11): BatchNormConv1d((conv1d): Conv1d(128, 128, kernel_size=(12,), stride=(1,), padding=(6,), bias=False) (bn): BatchNorm1d(128, eps=0.001, momentum=0.99, affine=True) (activation): ReLU())
        (12): BatchNormConv1d((conv1d): Conv1d(128, 128, kernel_size=(13,), stride=(1,), padding=(6,), bias=False) (bn): BatchNorm1d(128, eps=0.001, momentum=0.99, affine=True) (activation): ReLU())
        (13): BatchNormConv1d((conv1d): Conv1d(128, 128, kernel_size=(14,), stride=(1,), padding=(7,), bias=False) (bn): BatchNorm1d(128, eps=0.001, momentum=0.99, affine=True) (activation): ReLU())
        (14): BatchNormConv1d((conv1d): Conv1d(128, 128, kernel_size=(15,), stride=(1,), padding=(7,), bias=False) (bn): BatchNorm1d(128, eps=0.001, momentum=0.99, affine=True) (activation): ReLU())
        (15): BatchNormConv1d((conv1d): Conv1d(128, 128, kernel_size=(16,), stride=(1,), padding=(8,), bias=False) (bn): BatchNorm1d(128, eps=0.001, momentum=0.99, affine=True) (activation): ReLU())
      )
      (max_pool1d): MaxPool1d(kernel_size=2, stride=1, padding=1, dilation=1, ceil_mode=False)
      (conv1d_projections): ModuleList(
        (0): BatchNormConv1d((conv1d): Conv1d(2048, 128, kernel_size=(3,), stride=(1,), padding=(1,), bias=False) (bn): BatchNorm1d(128, eps=0.001, momentum=0.99, affine=True) (activation): ReLU())
        (1): BatchNormConv1d((conv1d): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(1,), bias=False) (bn): BatchNorm1d(128, eps=0.001, momentum=0.99, affine=True))
      )
      (pre_highway): Linear(in_features=128, out_features=128)
      (highways): ModuleList(
        (0): Highway((H): Linear(in_features=128, out_features=128) (T): Linear(in_features=128, out_features=128) (relu): ReLU() (sigmoid): Sigmoid())
        (1): Highway((H): Linear(in_features=128, out_features=128) (T): Linear(in_features=128, out_features=128) (relu): ReLU() (sigmoid): Sigmoid())
        (2): Highway((H): Linear(in_features=128, out_features=128) (T): Linear(in_features=128, out_features=128) (relu): ReLU() (sigmoid): Sigmoid())
        (3): Highway((H): Linear(in_features=128, out_features=128) (T): Linear(in_features=128, out_features=128) (relu): ReLU() (sigmoid): Sigmoid())
      )
      (gru): GRU(128, 128, batch_first=True, bidirectional=True)
    )
  )
  (decoder): Decoder(
    (prenet): Prenet(
      (layers): ModuleList(
        (0): Linear(in_features=400, out_features=256)
        (1): Linear(in_features=256, out_features=128)
      )
      (relu): ReLU()
      (dropout): Dropout(p=0.5)
    )
    (attention_rnn): AttentionRNN(
      (rnn_cell): GRUCell(384, 256)
      (alignment_model): BahdanauAttention(
        (query_layer): Linear(in_features=256, out_features=256)
        (annot_layer): Linear(in_features=256, out_features=256)
        (v): Linear(in_features=256, out_features=1)
      )
    )
    (project_to_decoder_in): Linear(in_features=512, out_features=256)
    (decoder_rnns): ModuleList(
      (0): GRUCell(256, 256)
      (1): GRUCell(256, 256)
    )
    (proj_to_mel): Linear(in_features=256, out_features=400)
  )
  (postnet): CBHG(
    (relu): ReLU()
    (conv1d_banks): ModuleList(
      (0): BatchNormConv1d((conv1d): Conv1d(80, 80, kernel_size=(1,), stride=(1,), bias=False) (bn): BatchNorm1d(80, eps=0.001, momentum=0.99, affine=True) (activation): ReLU())
      (1): BatchNormConv1d((conv1d): Conv1d(80, 80, kernel_size=(2,), stride=(1,), padding=(1,), bias=False) (bn): BatchNorm1d(80, eps=0.001, momentum=0.99, affine=True) (activation): ReLU())
      (2): BatchNormConv1d((conv1d): Conv1d(80, 80, kernel_size=(3,), stride=(1,), padding=(1,), bias=False) (bn): BatchNorm1d(80, eps=0.001, momentum=0.99, affine=True) (activation): ReLU())
      (3): BatchNormConv1d((conv1d): Conv1d(80, 80, kernel_size=(4,), stride=(1,), padding=(2,), bias=False) (bn): BatchNorm1d(80, eps=0.001, momentum=0.99, affine=True) (activation): ReLU())
      (4): BatchNormConv1d((conv1d): Conv1d(80, 80, kernel_size=(5,), stride=(1,), padding=(2,), bias=False) (bn): BatchNorm1d(80, eps=0.001, momentum=0.99, affine=True) (activation): ReLU())
      (5): BatchNormConv1d((conv1d): Conv1d(80, 80, kernel_size=(6,), stride=(1,), padding=(3,), bias=False) (bn): BatchNorm1d(80, eps=0.001, momentum=0.99, affine=True) (activation): ReLU())
      (6): BatchNormConv1d((conv1d): Conv1d(80, 80, kernel_size=(7,), stride=(1,), padding=(3,), bias=False) (bn): BatchNorm1d(80, eps=0.001, momentum=0.99, affine=True) (activation): ReLU())
      (7): BatchNormConv1d((conv1d): Conv1d(80, 80, kernel_size=(8,), stride=(1,), padding=(4,), bias=False) (bn): BatchNorm1d(80, eps=0.001, momentum=0.99, affine=True) (activation): ReLU())
    )
    (max_pool1d): MaxPool1d(kernel_size=2, stride=1, padding=1, dilation=1, ceil_mode=False)
    (conv1d_projections): ModuleList(
      (0): BatchNormConv1d((conv1d): Conv1d(640, 256, kernel_size=(3,), stride=(1,), padding=(1,), bias=False) (bn): BatchNorm1d(256, eps=0.001, momentum=0.99, affine=True) (activation): ReLU())
      (1): BatchNormConv1d((conv1d): Conv1d(256, 80, kernel_size=(3,), stride=(1,), padding=(1,), bias=False) (bn): BatchNorm1d(80, eps=0.001, momentum=0.99, affine=True))
    )
    (pre_highway): Linear(in_features=80, out_features=80)
    (highways): ModuleList(
      (0): Highway((H): Linear(in_features=80, out_features=80) (T): Linear(in_features=80, out_features=80) (relu): ReLU() (sigmoid): Sigmoid())
      (1): Highway((H): Linear(in_features=80, out_features=80) (T): Linear(in_features=80, out_features=80) (relu): ReLU() (sigmoid): Sigmoid())
      (2): Highway((H): Linear(in_features=80, out_features=80) (T): Linear(in_features=80, out_features=80) (relu): ReLU() (sigmoid): Sigmoid())
      (3): Highway((H): Linear(in_features=80, out_features=80) (T): Linear(in_features=80, out_features=80) (relu): ReLU() (sigmoid): Sigmoid())
    )
    (gru): GRU(80, 80, batch_first=True, bidirectional=True)
  )
  (last_linear): Linear(in_features=160, out_features=1025)
)
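One aside before the examples: model.eval() above disables dropout and batch-norm updates, but autograd still tracks operations by default. A minimal sketch, assuming a PyTorch version that provides torch.no_grad() (0.4+), of running synthesis without that overhead; create_speech is the helper imported from utils, and the sentence is a made-up test input:

# Sketch: inference without autograd bookkeeping. eval() above handles
# dropout/batch-norm; no_grad() additionally skips graph construction,
# saving memory and time during synthesis.
with torch.no_grad():
    waveform, alignment, spectrogram = create_speech(
        model, "A quick smoke test.", CONFIG, use_cuda, ap)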
EXAMPLES FROM TRAINING SET
In [5]:
import pandas as pd

df = pd.read_csv('/data/shared/KeithIto/LJSpeech-1.0/metadata_val.csv', delimiter='|')
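One caveat: the stock LJSpeech metadata.csv ships without a header row, so pandas' default would consume the first sample as column names. If metadata_val.csv follows the same layout (an assumption; this custom validation split may differ), a safer read looks like:

# Assumption: same pipe-delimited, headerless three-column layout as
# LJSpeech's metadata.csv (id | raw text | normalized text).
df = pd.read_csv('/data/shared/KeithIto/LJSpeech-1.0/metadata_val.csv',
                 delimiter='|', header=None,
                 names=['id', 'text', 'normalized_text'])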
In [6]:
sentence = df.iloc[2, 1]
print(sentence)
model.decoder.max_decoder_steps = 250
align, spec = tts(model, sentence, CONFIG, use_cuda, ap)
Latona's findings were also confirmed by Ronald G. Wittmus, another FBI fingerprint expert.
 > Run-time: 11.577292203903198
Comparison with https://mycroft.ai/blog/available-voices/
In [7]:
sentence = "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent." model.decoder.max_decoder_steps = 250 alignment = tts(model, sentence, CONFIG, use_cuda, ap)
> Run-time: 11.755356550216675
In [8]:
sentence = "Be a voice, not an echo." # 'echo' is not in training set. alignment = tts(model, sentence, CONFIG, use_cuda, ap)
> Run-time: 4.867138147354126
In [9]:
sentence = "The human voice is the most perfect instrument of all." alignment = tts(model, sentence, CONFIG, use_cuda, ap)
!! Decoder stopped with 'max_decoder_steps'. Something is probably wrong.
 > Run-time: 29.618252754211426
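The warning above means the decoder hit the 250-step cap before the attention reached the end of the input. A quick workaround, as a sketch, is to raise the cap set earlier and re-run the same sentence; this only lengthens the frame budget and does not fix the underlying alignment failure:

model.decoder.max_decoder_steps = 500  # double the frame budget (sketch)
alignment = tts(model, sentence, CONFIG, use_cuda, ap)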
In [10]:
sentence = "I'm sorry Dave. I'm afraid I can't do that." alignment = tts(model, sentence, CONFIG, use_cuda, ap)
> Run-time: 9.181858539581299
In [11]:
sentence = "This cake is great. It's so delicious and moist." alignment = tts(model, sentence, CONFIG, use_cuda, ap)
> Run-time: 10.925827741622925
Comparison with https://keithito.github.io/audio-samples/
In [12]:
sentence = "Generative adversarial network or variational auto-encoder." alignment = tts(model, sentence, CONFIG, use_cuda, ap)
> Run-time: 8.598266839981079
In [13]:
sentence = "Scientists at the CERN laboratory say they have discovered a new particle." alignment = tts(model, sentence, CONFIG, use_cuda, ap)
> Run-time: 14.10420560836792
In [14]:
sentence = "here’s a way to measure the acute emotional intelligence that has never gone out of style." alignment = tts(model, sentence, CONFIG, use_cuda, ap)
> Run-time: 12.206540584564209
In [29]:
sentence = "President Trump met with other leaders at the Group of 20 conference." alignment = tts(model, sentence, CONFIG, use_cuda, ap)
> Run-time: 9.559785842895508
In [16]:
sentence = "The buses aren't the problem, they actually provide a solution." alignment = tts(model, sentence, CONFIG, use_cuda, ap)
> Run-time: 8.320220232009888
In [31]:
len("President Trump met with other leaders at the Group of 20 conference.")
Out[31]:
69
In [32]:
alignment[0].shape
Out[32]:
(67, 74)
In [37]:
# peak attention weight at each decoder step
for i in range(alignment[0].shape[0]):
    a = alignment[0][i].max()
    print(a)
0.5278257 0.5526603 0.9558787 0.92132413 0.81367326 0.4858333 0.5570528 0.63596785 0.8613073 0.62610185 0.5429014 0.67617726 0.6459313 0.55958015 0.3836815 0.6041045 0.6867788 0.8378071 0.79800814 0.7517049 0.72250843 0.7627181 0.95597124 0.61695683 0.7325122 0.74086845 0.8736562 0.5194566 0.920689 0.5987127 0.82588756 0.63186014 0.7214943 0.8175821 0.66797817 0.7466511 0.48579764 0.5320349 0.43590546 0.36981428 0.45671675 0.6435481 0.84083676 0.61167735 0.44248602 0.8262451 0.8528732 0.6941957 0.7164757 0.58785707 0.6581132 0.66425616 0.62861 0.9175123 0.7020908 0.53642195 0.51358885 0.60365206 0.8154945 0.94948417 0.57743055 0.92082053 0.44779623 0.6089538 0.44988328 0.60405654 0.46145117
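The same numbers, vectorized (a sketch assuming alignment[0] is a NumPy array, which the printed float32 values suggest): each row of the alignment matrix is one decoder step, so the row-wise max is the per-step attention peak. Peaks well below 1.0 indicate the attention was spread across many input characters.

peaks = alignment[0].max(axis=1)  # per-decoder-step attention peak
print(peaks.min(), peaks.mean(), peaks.max())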
In [33]:
plt.imshow(alignment[0])
Out[33]:
<matplotlib.image.AxesImage at 0x7fdb86305630>
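For reference, a slightly more readable version of the plot above, using only standard matplotlib: given the (67, 74) shape, rows are decoder steps and columns are encoder steps, so labeled axes make the diagonal easier to judge.

plt.figure(figsize=(16, 5))
plt.imshow(alignment[0], aspect='auto', origin='lower', interpolation='none')
plt.xlabel('encoder step (input characters)')
plt.ylabel('decoder step (output frames)')
plt.colorbar()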