mirror of https://github.com/coqui-ai/TTS.git

pep8 check

This commit is contained in:
parent b4a4377875
commit a9eadd1b8a
@@ -14,10 +14,10 @@ from TTS.utils.data import (prepare_data, pad_per_step,
 class LJSpeechDataset(Dataset):

     def __init__(self, csv_file, root_dir, outputs_per_step, sample_rate,
                  text_cleaner, num_mels, min_level_db, frame_shift_ms,
                  frame_length_ms, preemphasis, ref_level_db, num_freq, power,
                  min_seq_len=0):

         with open(csv_file, "r") as f:
             self.frames = [line.split('|') for line in f]
         self.root_dir = root_dir
@@ -41,11 +41,11 @@ class LJSpeechDataset(Dataset):
     def _sort_frames(self):
         r"""Sort sequences in ascending order"""
         lengths = np.array([len(ins[1]) for ins in self.frames])

         print(" | > Max length sequence {}".format(np.max(lengths)))
         print(" | > Min length sequence {}".format(np.min(lengths)))
         print(" | > Avg length sequence {}".format(np.mean(lengths)))

         idxs = np.argsort(lengths)
         new_frames = []
         ignored = []
@@ -55,9 +55,10 @@ class LJSpeechDataset(Dataset):
                 ignored.append(idx)
             else:
                 new_frames.append(self.frames[idx])
-        print(" | > {} instances are ignored by min_seq_len ({})".format(len(ignored), self.min_seq_len))
+        print(" | > {} instances are ignored by min_seq_len ({})".format(
+            len(ignored), self.min_seq_len))
         self.frames = new_frames

     def __len__(self):
         return len(self.frames)
@@ -65,7 +66,8 @@ class LJSpeechDataset(Dataset):
         wav_name = os.path.join(self.root_dir,
                                 self.frames[idx][0]) + '.wav'
         text = self.frames[idx][1]
-        text = np.asarray(text_to_sequence(text, [self.cleaners]), dtype=np.int32)
+        text = np.asarray(text_to_sequence(
+            text, [self.cleaners]), dtype=np.int32)
         wav = np.asarray(self.load_wav(wav_name)[0], dtype=np.float32)
         sample = {'text': text, 'wav': wav, 'item_idx': self.frames[idx][0]}
         return sample
@@ -96,13 +98,15 @@ class LJSpeechDataset(Dataset):

             linear = [self.ap.spectrogram(w).astype('float32') for w in wav]
             mel = [self.ap.melspectrogram(w).astype('float32') for w in wav]
             mel_lengths = [m.shape[1] + 1 for m in mel]  # +1 for zero-frame

             # compute 'stop token' targets
-            stop_targets = [np.array([0.]*(mel_len-1)) for mel_len in mel_lengths]
+            stop_targets = [np.array([0.]*(mel_len-1))
+                            for mel_len in mel_lengths]

             # PAD stop targets
-            stop_targets = prepare_stop_target(stop_targets, self.outputs_per_step)
+            stop_targets = prepare_stop_target(
+                stop_targets, self.outputs_per_step)

             # PAD sequences with largest length of the batch
             text = prepare_data(text).astype(np.int32)
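
Aside: the stop-target construction above is easier to see on a concrete batch. A minimal sketch, assuming prepare_stop_target (from TTS.utils.data, not shown in this diff) right-pads each stop vector with ones up to a multiple of outputs_per_step:

import numpy as np

def prepare_stop_target(inputs, out_steps):
    # Hypothetical stand-in for TTS.utils.data.prepare_stop_target:
    # right-pad each 1-D stop vector with 1.0 ("stop" once the target
    # ends) so every row length is a multiple of out_steps, then stack.
    max_len = max(x.shape[0] for x in inputs)
    remainder = max_len % out_steps
    pad_len = max_len + (out_steps - remainder) if remainder > 0 else max_len
    return np.stack([np.pad(x, (0, pad_len - x.shape[0]),
                            mode='constant', constant_values=1.0)
                     for x in inputs])

mel_lengths = [4, 6]  # frame counts, zero-frame already included
stop_targets = [np.array([0.] * (l - 1)) for l in mel_lengths]
print(prepare_stop_target(stop_targets, out_steps=5).shape)  # (2, 5)
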
@@ -112,7 +116,7 @@ class LJSpeechDataset(Dataset):
             linear = prepare_tensor(linear, self.outputs_per_step)
             mel = prepare_tensor(mel, self.outputs_per_step)
             assert mel.shape[2] == linear.shape[2]
             timesteps = mel.shape[2]

             # B x T x D
             linear = linear.transpose(0, 2, 1)
@@ -125,7 +129,7 @@ class LJSpeechDataset(Dataset):
             mel = torch.FloatTensor(mel)
             mel_lengths = torch.LongTensor(mel_lengths)
             stop_targets = torch.FloatTensor(stop_targets)

             return text, text_lenghts, linear, mel, mel_lengths, stop_targets, item_idxs[0]

         raise TypeError(("batch must contain tensors, numbers, dicts or lists;\
@@ -1,29 +1,29 @@
 {
     "num_mels": 80,
     "num_freq": 1024,
     "sample_rate": 20000,
     "frame_length_ms": 50.0,
     "frame_shift_ms": 12.5,
     "preemphasis": 0.97,
     "min_level_db": -100,
     "ref_level_db": 20,
     "hidden_size": 128,
     "embedding_size": 256,
     "text_cleaner": "english_cleaners",

     "epochs": 200,
     "lr": 0.01,
     "lr_patience": 2,
     "lr_decay": 0.5,
     "batch_size": 32,
     "griffinf_lim_iters": 60,
     "power": 1.5,
     "r": 5,

     "num_loader_workers": 16,

-    "save_step":1 ,
+    "save_step": 1,
     "data_path": "/data/shared/KeithIto/LJSpeech-1.0",
     "output_path": "result",
     "log_dir": "/home/erogol/projects/TTS/logs/"
 }
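
For context, this config is consumed by the load_config helper diffed further down (an AttrDict filled from the JSON), so keys become attributes on the config object. A minimal sketch; the __getattr__ body is an assumption, since only the class header appears in this commit:

import json

class AttrDict(dict):
    # assumed behavior of utils.generic_utils.AttrDict: attribute access on keys
    def __getattr__(self, name):
        return self[name]

def load_config(config_path):
    config = AttrDict()
    config.update(json.load(open(config_path, "r")))
    return config

c = load_config("config.json")
print(c.num_mels, c.r, c.sample_rate)  # 80 5 20000
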
@@ -25,7 +25,8 @@ class BahdanauAttention(nn.Module):
         processed_annots = self.annot_layer(annots)

         # (batch, max_time, 1)
-        alignment = self.v(nn.functional.tanh(processed_query + processed_annots))
+        alignment = self.v(nn.functional.tanh(
+            processed_query + processed_annots))

         # (batch, max_time)
         return alignment.squeeze(-1)
@@ -57,11 +58,11 @@ class AttentionRNN(nn.Module):

         if annotations_lengths is not None and mask is None:
             mask = get_mask_from_lengths(annotations, annotations_lengths)

         # Concat input query and previous context context
         rnn_input = torch.cat((memory, context), -1)
         #rnn_input = rnn_input.unsqueeze(1)

         # Feed it to RNN
         # s_i = f(y_{i-1}, c_{i}, s_{i-1})
         rnn_output = self.rnn_cell(rnn_input, rnn_state)
@@ -85,5 +86,3 @@ class AttentionRNN(nn.Module):
         context = torch.bmm(alignment.unsqueeze(1), annotations)
         context = context.squeeze(1)
         return rnn_output, context, alignment
-
-
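
The two attention hunks above are standard Bahdanau-style scoring, e = v^T tanh(W_q q + W_a a), softmaxed over encoder time steps, followed by a batched matmul to pool the annotations into a context vector. A minimal shape check of that last step (dimensions assumed from the 256-unit layers in this diff):

import torch

B, T, D = 4, 17, 256
alignment = torch.softmax(torch.rand(B, T), dim=-1)  # (batch, max_time)
annotations = torch.rand(B, T, D)                    # encoder outputs
# context as in the diff: bmm over time, then drop the singleton dim
context = torch.bmm(alignment.unsqueeze(1), annotations).squeeze(1)
print(context.shape)  # torch.Size([4, 256])
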
@@ -11,16 +11,16 @@ from torch import nn
 #     in_features (int): size of the input vector
 #     out_features (int or list): size of each output vector. aka number
 #         of predicted frames.
 # """

 # def __init__(self, in_features, out_features):
 #     super(StopProjection, self).__init__()
 #     self.linear = nn.Linear(in_features, out_features)
 #     self.dropout = nn.Dropout(0.5)
 #     self.sigmoid = nn.Sigmoid()

 # def forward(self, inputs):
 #     out = self.dropout(inputs)
 #     out = self.linear(out)
 #     out = self.sigmoid(out)
 #     return out
@@ -1,4 +1,4 @@
 import torch
 from torch.nn import functional
 from torch.autograd import Variable
 from torch import nn
@@ -20,10 +20,10 @@ def _sequence_mask(sequence_length, max_len=None):


 class L1LossMasked(nn.Module):

     def __init__(self):
         super(L1LossMasked, self).__init__()

     def forward(self, input, target, length):
         """
         Args:
@@ -51,7 +51,8 @@ class L1LossMasked(nn.Module):
         # losses: (batch, max_len, dim)
         losses = losses_flat.view(*target.size())
         # mask: (batch, max_len, 1)
-        mask = _sequence_mask(sequence_length=length, max_len=target.size(1)).unsqueeze(2)
+        mask = _sequence_mask(sequence_length=length,
+                              max_len=target.size(1)).unsqueeze(2)
         losses = losses * mask.float()
         loss = losses.sum() / (length.float().sum() * float(target.shape[2]))
         return loss
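
On a toy batch, the mask above zeroes every position past each sequence length, and the sum is normalized by the count of valid frames times the feature dimension. A minimal sketch of the same masking logic with modern tensors:

import torch

def sequence_mask(length, max_len):
    # (batch, max_len) boolean mask, True on valid positions
    return torch.arange(max_len).unsqueeze(0) < length.unsqueeze(1)

pred = torch.ones(2, 4, 3)
target = torch.zeros(2, 4, 3)
length = torch.tensor([2, 4])
mask = sequence_mask(length, target.size(1)).unsqueeze(2).float()
losses = torch.abs(pred - target) * mask
loss = losses.sum() / (length.float().sum() * float(target.shape[2]))
print(loss)  # tensor(1.) -- every valid frame contributes |1 - 0| = 1
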
@@ -6,6 +6,7 @@ from torch import nn
 from .attention import AttentionRNN
 from .attention import get_mask_from_lengths

+
 class Prenet(nn.Module):
     r""" Prenet as explained at https://arxiv.org/abs/1703.10135.
     It creates as many layers as given by 'out_features'
@@ -14,7 +15,7 @@ class Prenet(nn.Module):
         in_features (int): size of the input vector
         out_features (int or list): size of each output sample.
             If it is a list, for each value, there is created a new layer.
     """

     def __init__(self, in_features, out_features=[256, 128]):
         super(Prenet, self).__init__()
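
Per the docstring above, Prenet stacks one Linear + ReLU + Dropout block per entry of out_features. A sketch of that construction consistent with the diffed signature (the 0.5 dropout rate follows the Tacotron paper and is an assumption here):

import torch
from torch import nn

class Prenet(nn.Module):
    def __init__(self, in_features, out_features=[256, 128]):
        super(Prenet, self).__init__()
        in_sizes = [in_features] + out_features[:-1]
        self.layers = nn.ModuleList(
            [nn.Linear(n_in, n_out)
             for n_in, n_out in zip(in_sizes, out_features)])
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.5)  # assumed rate

    def forward(self, inputs):
        for linear in self.layers:
            inputs = self.dropout(self.relu(linear(inputs)))
        return inputs

print(Prenet(80)(torch.rand(4, 80)).shape)  # torch.Size([4, 128])
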
@@ -60,7 +61,7 @@ class BatchNormConv1d(nn.Module):
         self.activation = activation

     def forward(self, x):
         x = self.conv1d(x)
         if self.activation is not None:
             x = self.activation(x)
         return self.bn(x)
@@ -116,7 +117,7 @@ class CBHG(nn.Module):
         self.max_pool1d = nn.MaxPool1d(kernel_size=2, stride=1, padding=1)

         out_features = [K * in_features] + projections[:-1]
         activations = [self.relu] * (len(projections) - 1)
         activations += [None]

         # setup conv1d projection layers
@@ -179,7 +180,7 @@ class CBHG(nn.Module):

         # (B, T_in, in_features*2)
         # TODO: replace GRU with convolution as in Deep Voice 3
         self.gru.flatten_parameters()
         outputs, _ = self.gru(x)
         return outputs
@@ -214,6 +215,7 @@ class Decoder(nn.Module):
         r (int): number of outputs per time step.
         eps (float): threshold for detecting the end of a sentence.
     """
+
     def __init__(self, in_features, memory_dim, r, eps=0.05, mode='train'):
         super(Decoder, self).__init__()
         self.mode = mode
@@ -251,23 +253,18 @@ class Decoder(nn.Module):
             - memory: batch x #mels_pecs x mel_spec_dim
         """
         B = inputs.size(0)
-
         # Run greedy decoding if memory is None
         greedy = not self.training
-
         if memory is not None:
-
             # Grouping multiple frames if necessary
             if memory.size(-1) == self.memory_dim:
                 memory = memory.view(B, memory.size(1) // self.r, -1)
             " !! Dimension mismatch {} vs {} * {}".format(memory.size(-1),
                                                           self.memory_dim, self.r)
             T_decoder = memory.size(1)
-
         # go frame - 0 frames tarting the sequence
         initial_memory = Variable(
             inputs.data.new(B, self.memory_dim * self.r).zero_())
-
         # Init decoder states
         attention_rnn_hidden = Variable(
             inputs.data.new(B, 256).zero_())
@@ -276,14 +273,11 @@ class Decoder(nn.Module):
                                for _ in range(len(self.decoder_rnns))]
         current_context_vec = Variable(
             inputs.data.new(B, 256).zero_())
-
         # Time first (T_decoder, B, memory_dim)
         if memory is not None:
             memory = memory.transpose(0, 1)
-
         outputs = []
         alignments = []
-
         t = 0
         memory_input = initial_memory
         while True:
@@ -291,6 +285,7 @@ class Decoder(nn.Module):
             if greedy:
                 memory_input = outputs[-1]
             else:
+                # TODO: try sampled teacher forcing
                 # combine prev. model output and prev. real target
                 # memory_input = torch.div(outputs[-1] + memory[t-1], 2.0)
                 # add a random noise
|
@ -298,36 +293,26 @@ class Decoder(nn.Module):
|
||||||
# memory_input.data.new(memory_input.size()).normal_(0.0, 0.5))
|
# memory_input.data.new(memory_input.size()).normal_(0.0, 0.5))
|
||||||
# memory_input = memory_input + noise
|
# memory_input = memory_input + noise
|
||||||
memory_input = memory[t-1]
|
memory_input = memory[t-1]
|
||||||
|
|
||||||
# Prenet
|
# Prenet
|
||||||
processed_memory = self.prenet(memory_input)
|
processed_memory = self.prenet(memory_input)
|
||||||
|
|
||||||
# Attention RNN
|
# Attention RNN
|
||||||
attention_rnn_hidden, current_context_vec, alignment = self.attention_rnn(
|
attention_rnn_hidden, current_context_vec, alignment = self.attention_rnn(
|
||||||
processed_memory, current_context_vec, attention_rnn_hidden, inputs)
|
processed_memory, current_context_vec, attention_rnn_hidden, inputs)
|
||||||
|
|
||||||
# Concat RNN output and attention context vector
|
# Concat RNN output and attention context vector
|
||||||
decoder_input = self.project_to_decoder_in(
|
decoder_input = self.project_to_decoder_in(
|
||||||
torch.cat((attention_rnn_hidden, current_context_vec), -1))
|
torch.cat((attention_rnn_hidden, current_context_vec), -1))
|
||||||
|
|
||||||
# Pass through the decoder RNNs
|
# Pass through the decoder RNNs
|
||||||
for idx in range(len(self.decoder_rnns)):
|
for idx in range(len(self.decoder_rnns)):
|
||||||
decoder_rnn_hiddens[idx] = self.decoder_rnns[idx](
|
decoder_rnn_hiddens[idx] = self.decoder_rnns[idx](
|
||||||
decoder_input, decoder_rnn_hiddens[idx])
|
decoder_input, decoder_rnn_hiddens[idx])
|
||||||
# Residual connectinon
|
# Residual connectinon
|
||||||
decoder_input = decoder_rnn_hiddens[idx] + decoder_input
|
decoder_input = decoder_rnn_hiddens[idx] + decoder_input
|
||||||
|
|
||||||
output = decoder_input
|
output = decoder_input
|
||||||
|
|
||||||
|
|
||||||
# predict mel vectors from decoder vectors
|
# predict mel vectors from decoder vectors
|
||||||
output = self.proj_to_mel(output)
|
output = self.proj_to_mel(output)
|
||||||
|
|
||||||
outputs += [output]
|
outputs += [output]
|
||||||
alignments += [alignment]
|
alignments += [alignment]
|
||||||
|
|
||||||
t += 1
|
t += 1
|
||||||
|
|
||||||
if (not greedy and self.training) or (greedy and memory is not None):
|
if (not greedy and self.training) or (greedy and memory is not None):
|
||||||
if t >= T_decoder:
|
if t >= T_decoder:
|
||||||
break
|
break
|
||||||
|
@@ -338,15 +323,12 @@ class Decoder(nn.Module):
                 print(" !! Decoder stopped with 'max_decoder_steps'. \
                     Something is probably wrong.")
                 break
-
         assert greedy or len(outputs) == T_decoder
-
         # Back to batch first
         alignments = torch.stack(alignments).transpose(0, 1)
         outputs = torch.stack(outputs).transpose(0, 1).contiguous()
-
         return outputs, alignments


-def is_end_of_frames(output, eps=0.2): #0.2
+def is_end_of_frames(output, eps=0.2):  # 0.2
     return (output.data <= eps).all()
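
The memory.view(B, memory.size(1) // self.r, -1) line above is the r-frames-per-step trick from the Tacotron paper: with r=5 and mel_dim=80 (this repo's defaults), a (B, T, 80) mel target becomes (B, T/5, 400), so the decoder predicts five frames per step. A quick shape check:

import torch

B, T, mel_dim, r = 2, 30, 80, 5
memory = torch.rand(B, T, mel_dim)
grouped = memory.view(B, memory.size(1) // r, -1)
print(grouped.shape)  # torch.Size([2, 6, 400])
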
@@ -9,7 +9,6 @@ from TTS.layers.tacotron import Prenet, Encoder, Decoder, CBHG
 class Tacotron(nn.Module):
     def __init__(self, embedding_dim=256, linear_dim=1025, mel_dim=80,
                  r=5, padding_idx=None):
-
         super(Tacotron, self).__init__()
         self.r = r
         self.mel_dim = mel_dim
@@ -17,34 +16,23 @@ class Tacotron(nn.Module):
         self.embedding = nn.Embedding(len(symbols), embedding_dim,
                                       padding_idx=padding_idx)
         print(" | > Embedding dim : {}".format(len(symbols)))
-
-        # Trying smaller std
         self.embedding.weight.data.normal_(0, 0.3)
         self.encoder = Encoder(embedding_dim)
         self.decoder = Decoder(256, mel_dim, r)
-
         self.postnet = CBHG(mel_dim, K=8, projections=[256, mel_dim])
         self.last_linear = nn.Linear(mel_dim * 2, linear_dim)

     def forward(self, characters, mel_specs=None):
-
         B = characters.size(0)
-
         inputs = self.embedding(characters)
-        # (B, T', in_dim)
+        # batch x time x dim
         encoder_outputs = self.encoder(inputs)
-        # (B, T', mel_dim*r)
+        # batch x time x dim*r
         mel_outputs, alignments = self.decoder(
             encoder_outputs, mel_specs)
-
-        # Post net processing below
-
         # Reshape
-        # (B, T, mel_dim)
+        # batch x time x dim
         mel_outputs = mel_outputs.view(B, -1, self.mel_dim)
-
         linear_outputs = self.postnet(mel_outputs)
         linear_outputs = self.last_linear(linear_outputs)

         return mel_outputs, linear_outputs, alignments
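
Putting the forward pass above together: characters (B, T_in) -> embedding (B, T_in, 256) -> encoder (B, T_in, 256) -> decoder (B, T_out/r, 80*r) -> reshape (B, T_out, 80) -> CBHG postnet (B, T_out, 160) -> last_linear (B, T_out, 1025). A sketch of the final reshape-and-project step, assuming the postnet's bidirectional GRU doubles the feature dimension (which is why last_linear takes mel_dim * 2):

import torch
from torch import nn

B, steps, r, mel_dim, linear_dim = 2, 6, 5, 80, 1025
decoder_out = torch.rand(B, steps, mel_dim * r)   # r frames per decoder step
mel_outputs = decoder_out.view(B, -1, mel_dim)    # (B, 30, 80)
postnet_out = torch.rand(B, mel_outputs.size(1), mel_dim * 2)  # CBHG stand-in
linear_outputs = nn.Linear(mel_dim * 2, linear_dim)(postnet_out)
print(mel_outputs.shape, linear_outputs.shape)
# torch.Size([2, 30, 80]) torch.Size([2, 30, 1025])
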
@@ -288,7 +288,8 @@ class AttentionDecoder(nn.Module):
         bf_out = gru2_input + gru2_hidden

         # Output
-        output = self.out(bf_out).view(-1, self.num_mels, self.outputs_per_step)
+        output = self.out(bf_out).view(-1, self.num_mels,
+                                       self.outputs_per_step)

         return output, d_t, gru1_hidden, gru2_hidden
@@ -7,20 +7,23 @@ from matplotlib import pylab as plt

 hop_length = 250


 def create_speech(m, s, CONFIG, use_cuda, ap):
     text_cleaner = [CONFIG.text_cleaner]
     seq = np.array(text_to_sequence(s, text_cleaner))

     # mel = np.zeros([seq.shape[0], CONFIG.num_mels, 1], dtype=np.float32)

     if use_cuda:
-        chars_var = torch.autograd.Variable(torch.from_numpy(seq), volatile=True).unsqueeze(0).cuda()
-        # mel_var = torch.autograd.Variable(torch.from_numpy(mel).type(torch.cuda.FloatTensor), volatile=True).cuda()
+        chars_var = torch.autograd.Variable(
+            torch.from_numpy(seq), volatile=True).unsqueeze(0).cuda()
+        # mel_var = torch.autograd.Variable(torch.from_numpy(mel).type(torch.cuda.FloatTensor), volatile=True).cuda()
     else:
-        chars_var = torch.autograd.Variable(torch.from_numpy(seq), volatile=True).unsqueeze(0)
+        chars_var = torch.autograd.Variable(
+            torch.from_numpy(seq), volatile=True).unsqueeze(0)
         # mel_var = torch.autograd.Variable(torch.from_numpy(mel).type(torch.FloatTensor), volatile=True)

-    mel_out, linear_out, alignments =m.forward(chars_var)
+    mel_out, linear_out, alignments = m.forward(chars_var)
     linear_out = linear_out[0].data.cpu().numpy()
     alignment = alignments[0].cpu().data.numpy()
     spec = ap._denormalize(linear_out)
@@ -33,19 +36,18 @@ def create_speech(m, s, CONFIG, use_cuda, ap):

 def visualize(alignment, spectrogram, CONFIG):
     label_fontsize = 16
-    plt.figure(figsize=(16,16))
+    plt.figure(figsize=(16, 16))

-    plt.subplot(2,1,1)
+    plt.subplot(2, 1, 1)
     plt.imshow(alignment.T, aspect="auto", origin="lower", interpolation=None)
     plt.xlabel("Decoder timestamp", fontsize=label_fontsize)
     plt.ylabel("Encoder timestamp", fontsize=label_fontsize)
     plt.colorbar()

-    plt.subplot(2,1,2)
+    plt.subplot(2, 1, 2)
     librosa.display.specshow(spectrogram.T, sr=CONFIG.sample_rate,
                              hop_length=hop_length, x_axis="time", y_axis="linear")
     plt.xlabel("Time", fontsize=label_fontsize)
     plt.ylabel("Hz", fontsize=label_fontsize)
     plt.tight_layout()
     plt.colorbar()
-
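
The volatile=True flag in create_speech above is the pre-0.4 PyTorch idiom for gradient-free inference. A sketch of the same text-to-tensor preparation with the modern equivalent; text_to_sequence below is a hypothetical stand-in for TTS.utils.text.text_to_sequence:

import numpy as np
import torch

def text_to_sequence(text, cleaner_names):
    # stand-in only: map characters to integer ids
    return [ord(ch) for ch in text]

seq = np.array(text_to_sequence("Hello world.", ["english_cleaners"]))
with torch.no_grad():  # modern replacement for Variable(..., volatile=True)
    chars_var = torch.from_numpy(seq).unsqueeze(0)
print(chars_var.shape)  # torch.Size([1, 12])
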
@@ -1,4 +1,4 @@
-#-*- coding: utf-8 -*-
+# -*- coding: utf-8 -*-

 from network import *
 from data import inv_spectrogram, find_endpoint, save_wav, spectrogram
@@ -6,6 +6,7 @@ from TTS.layers.tacotron import Prenet, CBHG, Decoder, Encoder

 OUT_PATH = '/tmp/test.pth.tar'

+
 class ModelSavingTests(unittest.TestCase):

     def save_checkpoint_test(self):
@@ -20,7 +20,7 @@ class PrenetTests(unittest.TestCase):
 class CBHGTests(unittest.TestCase):

     def test_in_out(self):
-        layer = CBHG(128, K= 6, projections=[128, 128], num_highways=2)
+        layer = CBHG(128, K=6, projections=[128, 128], num_highways=2)
         dummy_input = T.autograd.Variable(T.rand(4, 8, 128))

         print(layer)
@@ -38,11 +38,11 @@ class DecoderTests(unittest.TestCase):
         dummy_memory = T.autograd.Variable(T.rand(4, 2, 80))

         output, alignment = layer(dummy_input, dummy_memory)

         assert output.shape[0] == 4
         assert output.shape[1] == 1, "size not {}".format(output.shape[1])
         assert output.shape[2] == 80 * 2, "size not {}".format(output.shape[2])


 class EncoderTests(unittest.TestCase):
@@ -56,10 +56,10 @@ class EncoderTests(unittest.TestCase):
         assert output.shape[0] == 4
         assert output.shape[1] == 8
         assert output.shape[2] == 256  # 128 * 2 BiRNN


 class L1LossMaskedTests(unittest.TestCase):

     def test_in_out(self):
         layer = L1LossMasked()
         dummy_input = T.autograd.Variable(T.ones(4, 8, 128).float())
@@ -69,7 +69,7 @@ class L1LossMaskedTests(unittest.TestCase):
         assert output.shape[0] == 0
         assert len(output.shape) == 1
         assert output.data[0] == 0.0

         dummy_input = T.autograd.Variable(T.ones(4, 8, 128).float())
         dummy_target = T.autograd.Variable(T.zeros(4, 8, 128).float())
         dummy_length = T.autograd.Variable((T.ones(4) * 8).long())
@@ -78,7 +78,8 @@ class L1LossMaskedTests(unittest.TestCase):

         dummy_input = T.autograd.Variable(T.ones(4, 8, 128).float())
         dummy_target = T.autograd.Variable(T.zeros(4, 8, 128).float())
-        dummy_length = T.autograd.Variable((T.arange(5,9)).long())
-        mask = ((_sequence_mask(dummy_length).float() - 1.0) * 100.0).unsqueeze(2)
+        dummy_length = T.autograd.Variable((T.arange(5, 9)).long())
+        mask = ((_sequence_mask(dummy_length).float() - 1.0)
+                * 100.0).unsqueeze(2)
         output = layer(dummy_input + mask, dummy_target, dummy_length)
         assert output.data[0] == 1.0, "1.0 vs {}".format(output.data[0])
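
The test above hides padded positions by adding a large negative offset: (_sequence_mask(length).float() - 1.0) * 100.0 is 0 inside each sequence and -100 outside, so the assertion holds only if the masked loss really ignores padding. A worked sketch of just the offset:

import torch

def _sequence_mask(length, max_len=None):
    max_len = max_len or int(length.max())
    return torch.arange(max_len).unsqueeze(0) < length.unsqueeze(1)

length = torch.arange(5, 9).long()  # lengths 5..8 in a batch of 4
offset = ((_sequence_mask(length).float() - 1.0) * 100.0).unsqueeze(2)
print(offset.min().item(), offset.max().item())  # -100.0 0.0
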
@@ -10,6 +10,7 @@ from TTS.datasets.LJSpeech import LJSpeechDataset
 file_path = os.path.dirname(os.path.realpath(__file__))
 c = load_config(os.path.join(file_path, 'test_config.json'))

+
 class TestDataset(unittest.TestCase):

     def __init__(self, *args, **kwargs):
@@ -30,7 +31,7 @@ class TestDataset(unittest.TestCase):
                                 c.ref_level_db,
                                 c.num_freq,
                                 c.power
                                 )

         dataloader = DataLoader(dataset, batch_size=2,
                                 shuffle=True, collate_fn=dataset.collate_fn,
@@ -46,7 +47,7 @@ class TestDataset(unittest.TestCase):
             mel_lengths = data[4]
             stop_target = data[5]
             item_idx = data[6]

             neg_values = text_input[text_input < 0]
             check_count = len(neg_values)
             assert check_count == 0, \
@@ -70,7 +71,7 @@ class TestDataset(unittest.TestCase):
                                 c.ref_level_db,
                                 c.num_freq,
                                 c.power
                                 )

         # Test for batch size 1
         dataloader = DataLoader(dataset, batch_size=1,
@@ -98,8 +99,8 @@ class TestDataset(unittest.TestCase):
             assert stop_target.sum() == 1
             assert len(mel_lengths.shape) == 1
             assert mel_lengths[0] == mel_input[0].shape[0]

         # Test for batch size 2
         dataloader = DataLoader(dataset, batch_size=2,
                                 shuffle=False, collate_fn=dataset.collate_fn,
                                 drop_last=False, num_workers=c.num_loader_workers)
@@ -115,11 +116,11 @@ class TestDataset(unittest.TestCase):
             stop_target = data[5]
             item_idx = data[6]

             if mel_lengths[0] > mel_lengths[1]:
                 idx = 0
             else:
                 idx = 1

             # check the first item in the batch
             assert mel_input[idx, -1].sum() == 0
             assert mel_input[idx, -2].sum() != 0, mel_input
@@ -130,17 +131,13 @@ class TestDataset(unittest.TestCase):
             assert stop_target[idx].sum() == 1
             assert len(mel_lengths.shape) == 1
             assert mel_lengths[idx] == mel_input[idx].shape[0]

             # check the second itme in the batch
             assert mel_input[1-idx, -1].sum() == 0
             assert linear_input[1-idx, -1].sum() == 0
             assert stop_target[1-idx, -1] == 1
             assert len(mel_lengths.shape) == 1

             # check batch conditions
             assert (mel_input * stop_target.unsqueeze(2)).sum() == 0
             assert (linear_input * stop_target.unsqueeze(2)).sum() == 0
-
-
-
-

train.py: 143 changed lines
@@ -35,7 +35,7 @@ parser = argparse.ArgumentParser()
 parser.add_argument('--restore_path', type=str,
                     help='Folder path to checkpoints', default=0)
 parser.add_argument('--config_path', type=str,
                     help='path to config file for training',)
 args = parser.parse_args()

 # setup output paths and read configs
@@ -69,7 +69,7 @@ def train(model, criterion, data_loader, optimizer, epoch):
     epoch_time = 0
     avg_linear_loss = 0
     avg_mel_loss = 0

     print(" | > Epoch {}/{}".format(epoch, c.epochs))
     progbar = Progbar(len(data_loader.dataset) / c.batch_size)
     n_priority_freq = int(3000 / (c.sample_rate * 0.5) * c.num_freq)
@@ -82,8 +82,9 @@ def train(model, criterion, data_loader, optimizer, epoch):
         linear_input = data[2]
         mel_input = data[3]
         mel_lengths = data[4]

-        current_step = num_iter + args.restore_step + epoch * len(data_loader) + 1
+        current_step = num_iter + args.restore_step + \
+            epoch * len(data_loader) + 1

         # setup lr
         current_lr = lr_decay(c.lr, current_step, c.warmup_steps)
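
lr_decay itself is not part of this diff; given the call signature lr_decay(c.lr, current_step, c.warmup_steps), a plausible Noam-style schedule (linear warmup, then inverse-square-root decay) would look like the following — an assumption, not the repo's actual definition:

def lr_decay(init_lr, global_step, warmup_steps=4000.0):
    # assumed form: warm up for warmup_steps, then decay as 1/sqrt(step)
    step = max(float(global_step), 1.0)
    return init_lr * warmup_steps**0.5 * min(
        step * warmup_steps**-1.5, step**-0.5)

print(lr_decay(0.01, 4000))  # peaks at 0.01 when warmup ends
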
@@ -108,16 +109,16 @@ def train(model, criterion, data_loader, optimizer, epoch):
         # forward pass
         mel_output, linear_output, alignments =\
             model.forward(text_input_var, mel_spec_var)

         # loss computation
         mel_loss = criterion(mel_output, mel_spec_var, mel_lengths_var)
         linear_loss = 0.5 * criterion(linear_output, linear_spec_var, mel_lengths_var) \
             + 0.5 * criterion(linear_output[:, :, :n_priority_freq],
-                              linear_spec_var[: ,: ,:n_priority_freq],
+                              linear_spec_var[:, :, :n_priority_freq],
                               mel_lengths_var)
         loss = mel_loss + linear_loss

         # backpass and check the grad norm
         loss.backward()
         grad_norm, skip_flag = check_update(model, 0.5, 100)
         if skip_flag:
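
The n_priority_freq term above gives the lowest linear-spectrogram bins double weight in the loss. With this commit's config (sample_rate 20000, num_freq 1024), the cutoff works out to:

sample_rate, num_freq = 20000, 1024
n_priority_freq = int(3000 / (sample_rate * 0.5) * num_freq)
print(n_priority_freq)  # 307 -- bins below ~3 kHz get the extra 0.5 weight
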
@@ -129,9 +130,10 @@ def train(model, criterion, data_loader, optimizer, epoch):
         step_time = time.time() - start_time
         epoch_time += step_time

         # update
         progbar.update(num_iter+1, values=[('total_loss', loss.data[0]),
-                                           ('linear_loss', linear_loss.data[0]),
+                                           ('linear_loss',
+                                            linear_loss.data[0]),
                                            ('mel_loss', mel_loss.data[0]),
                                            ('grad_norm', grad_norm)])
@@ -167,7 +169,8 @@ def train(model, criterion, data_loader, optimizer, epoch):
             # Sample audio
             audio_signal = linear_output[0].data.cpu().numpy()
             data_loader.dataset.ap.griffin_lim_iters = 60
-            audio_signal = data_loader.dataset.ap.inv_spectrogram(audio_signal.T)
+            audio_signal = data_loader.dataset.ap.inv_spectrogram(
+                audio_signal.T)
             try:
                 tb.add_audio('SampleAudio', audio_signal, current_step,
                              sample_rate=c.sample_rate)
@@ -176,30 +179,30 @@ def train(model, criterion, data_loader, optimizer, epoch):
                 # print(audio_signal.max())
                 # print(audio_signal.min())
                 pass

-
     avg_linear_loss /= (num_iter + 1)
     avg_mel_loss /= (num_iter + 1)
     avg_total_loss = avg_mel_loss + avg_linear_loss

     # Plot Training Epoch Stats
     tb.add_scalar('TrainEpochLoss/TotalLoss', loss.data[0], current_step)
-    tb.add_scalar('TrainEpochLoss/LinearLoss', linear_loss.data[0], current_step)
+    tb.add_scalar('TrainEpochLoss/LinearLoss',
+                  linear_loss.data[0], current_step)
     tb.add_scalar('TrainEpochLoss/MelLoss', mel_loss.data[0], current_step)
     tb.add_scalar('Time/EpochTime', epoch_time, epoch)
     epoch_time = 0

     return avg_linear_loss, current_step


 def evaluate(model, criterion, data_loader, current_step):
     model = model.eval()
     epoch_time = 0

     print(" | > Validation")
     n_priority_freq = int(3000 / (c.sample_rate * 0.5) * c.num_freq)
     progbar = Progbar(len(data_loader.dataset) / c.eval_batch_size)

     avg_linear_loss = 0
     avg_mel_loss = 0
|
@ -227,24 +230,26 @@ def evaluate(model, criterion, data_loader, current_step):
|
||||||
linear_spec_var = linear_spec_var.cuda()
|
linear_spec_var = linear_spec_var.cuda()
|
||||||
|
|
||||||
# forward pass
|
# forward pass
|
||||||
mel_output, linear_output, alignments = model.forward(text_input_var, mel_spec_var)
|
mel_output, linear_output, alignments = model.forward(
|
||||||
|
text_input_var, mel_spec_var)
|
||||||
|
|
||||||
# loss computation
|
# loss computation
|
||||||
mel_loss = criterion(mel_output, mel_spec_var, mel_lengths_var)
|
mel_loss = criterion(mel_output, mel_spec_var, mel_lengths_var)
|
||||||
linear_loss = 0.5 * criterion(linear_output, linear_spec_var, mel_lengths_var) \
|
linear_loss = 0.5 * criterion(linear_output, linear_spec_var, mel_lengths_var) \
|
||||||
+ 0.5 * criterion(linear_output[:, :, :n_priority_freq],
|
+ 0.5 * criterion(linear_output[:, :, :n_priority_freq],
|
||||||
linear_spec_var[: ,: ,:n_priority_freq],
|
linear_spec_var[:, :, :n_priority_freq],
|
||||||
mel_lengths_var)
|
mel_lengths_var)
|
||||||
loss = mel_loss + linear_loss
|
loss = mel_loss + linear_loss
|
||||||
|
|
||||||
step_time = time.time() - start_time
|
step_time = time.time() - start_time
|
||||||
epoch_time += step_time
|
epoch_time += step_time
|
||||||
|
|
||||||
# update
|
# update
|
||||||
progbar.update(num_iter+1, values=[('total_loss', loss.data[0]),
|
progbar.update(num_iter+1, values=[('total_loss', loss.data[0]),
|
||||||
('linear_loss', linear_loss.data[0]),
|
('linear_loss',
|
||||||
|
linear_loss.data[0]),
|
||||||
('mel_loss', mel_loss.data[0])])
|
('mel_loss', mel_loss.data[0])])
|
||||||
|
|
||||||
avg_linear_loss += linear_loss.data[0]
|
avg_linear_loss += linear_loss.data[0]
|
||||||
avg_mel_loss += mel_loss.data[0]
|
avg_mel_loss += mel_loss.data[0]
|
||||||
|
|
||||||
|
@@ -257,7 +262,7 @@ def evaluate(model, criterion, data_loader, current_step):
         const_spec = plot_spectrogram(const_spec, data_loader.dataset.ap)
         gt_spec = plot_spectrogram(gt_spec, data_loader.dataset.ap)
         align_img = plot_alignment(align_img)

         tb.add_image('ValVisual/Reconstruction', const_spec, current_step)
         tb.add_image('ValVisual/GroundTruth', gt_spec, current_step)
         tb.add_image('ValVisual/ValidationAlignment', align_img, current_step)
@@ -274,61 +279,61 @@ def evaluate(model, criterion, data_loader, current_step):
                 # print(audio_signal.max())
                 # print(audio_signal.min())
                 pass

     # compute average losses
     avg_linear_loss /= (num_iter + 1)
     avg_mel_loss /= (num_iter + 1)
     avg_total_loss = avg_mel_loss + avg_linear_loss

     # Plot Learning Stats
     tb.add_scalar('ValEpochLoss/TotalLoss', avg_total_loss, current_step)
     tb.add_scalar('ValEpochLoss/LinearLoss', avg_linear_loss, current_step)
     tb.add_scalar('ValEpochLoss/MelLoss', avg_mel_loss, current_step)
     return avg_linear_loss


 def main(args):

     # Setup the dataset
     train_dataset = LJSpeechDataset(os.path.join(c.data_path, 'metadata_train.csv'),
                                     os.path.join(c.data_path, 'wavs'),
                                     c.r,
                                     c.sample_rate,
                                     c.text_cleaner,
                                     c.num_mels,
                                     c.min_level_db,
                                     c.frame_shift_ms,
                                     c.frame_length_ms,
                                     c.preemphasis,
                                     c.ref_level_db,
                                     c.num_freq,
                                     c.power,
                                     min_seq_len=c.min_seq_len
                                     )

     train_loader = DataLoader(train_dataset, batch_size=c.batch_size,
                               shuffle=False, collate_fn=train_dataset.collate_fn,
                               drop_last=False, num_workers=c.num_loader_workers,
                               pin_memory=True)

     val_dataset = LJSpeechDataset(os.path.join(c.data_path, 'metadata_val.csv'),
                                   os.path.join(c.data_path, 'wavs'),
                                   c.r,
                                   c.sample_rate,
                                   c.text_cleaner,
                                   c.num_mels,
                                   c.min_level_db,
                                   c.frame_shift_ms,
                                   c.frame_length_ms,
                                   c.preemphasis,
                                   c.ref_level_db,
                                   c.num_freq,
                                   c.power
                                   )

     val_loader = DataLoader(val_dataset, batch_size=c.eval_batch_size,
                             shuffle=False, collate_fn=val_dataset.collate_fn,
-                            drop_last=False, num_workers= 4,
+                            drop_last=False, num_workers=4,
                             pin_memory=True)

     model = Tacotron(c.embedding_size,
@@ -337,11 +342,11 @@ def main(args):
                      c.r)

     optimizer = optim.Adam(model.parameters(), lr=c.lr)

     if use_cuda:
         criterion = L1LossMasked().cuda()
     else:
         criterion = L1LossMasked()

     if args.restore_path:
         checkpoint = torch.load(args.restore_path)
|
||||||
|
|
||||||
num_params = count_parameters(model)
|
num_params = count_parameters(model)
|
||||||
print(" | > Model has {} parameters".format(num_params))
|
print(" | > Model has {} parameters".format(num_params))
|
||||||
|
|
||||||
if not os.path.exists(CHECKPOINT_PATH):
|
if not os.path.exists(CHECKPOINT_PATH):
|
||||||
os.mkdir(CHECKPOINT_PATH)
|
os.mkdir(CHECKPOINT_PATH)
|
||||||
|
|
||||||
if 'best_loss' not in locals():
|
if 'best_loss' not in locals():
|
||||||
best_loss = float('inf')
|
best_loss = float('inf')
|
||||||
|
|
||||||
for epoch in range(0, c.epochs):
|
for epoch in range(0, c.epochs):
|
||||||
train_loss, current_step = train(model, criterion, train_loader, optimizer, epoch)
|
train_loss, current_step = train(
|
||||||
|
model, criterion, train_loader, optimizer, epoch)
|
||||||
val_loss = evaluate(model, criterion, val_loader, current_step)
|
val_loss = evaluate(model, criterion, val_loader, current_step)
|
||||||
best_loss = save_best_model(model, optimizer, val_loss,
|
best_loss = save_best_model(model, optimizer, val_loss,
|
||||||
best_loss, OUT_PATH,
|
best_loss, OUT_PATH,
|
||||||
current_step, epoch)
|
current_step, epoch)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
signal.signal(signal.SIGINT, signal_handler)
|
signal.signal(signal.SIGINT, signal_handler)
|
||||||
main(args)
|
main(args)
|
||||||
|
|
|
@@ -10,8 +10,8 @@ _mel_basis = None
 class AudioProcessor(object):

     def __init__(self, sample_rate, num_mels, min_level_db, frame_shift_ms,
                  frame_length_ms, preemphasis, ref_level_db, num_freq, power,
                  griffin_lim_iters=None):
         self.sample_rate = sample_rate
         self.num_mels = num_mels
         self.min_level_db = min_level_db
@@ -23,61 +23,49 @@ class AudioProcessor(object):
         self.power = power
         self.griffin_lim_iters = griffin_lim_iters

-
     def save_wav(self, wav, path):
         wav *= 32767 / max(0.01, np.max(np.abs(wav)))
         librosa.output.write_wav(path, wav.astype(np.int16), self.sample_rate)

-
     def _linear_to_mel(self, spectrogram):
         global _mel_basis
         if _mel_basis is None:
             _mel_basis = self._build_mel_basis()
         return np.dot(_mel_basis, spectrogram)

-
     def _build_mel_basis(self, ):
         n_fft = (self.num_freq - 1) * 2
         return librosa.filters.mel(self.sample_rate, n_fft, n_mels=self.num_mels)

-
     def _normalize(self, S):
         return np.clip((S - self.min_level_db) / -self.min_level_db, 0, 1)

-
     def _denormalize(self, S):
         return (np.clip(S, 0, 1) * -self.min_level_db) + self.min_level_db

-
     def _stft_parameters(self, ):
         n_fft = (self.num_freq - 1) * 2
         hop_length = int(self.frame_shift_ms / 1000 * self.sample_rate)
         win_length = int(self.frame_length_ms / 1000 * self.sample_rate)
         return n_fft, hop_length, win_length

-
     def _amp_to_db(self, x):
         return 20 * np.log10(np.maximum(1e-5, x))

-
     def _db_to_amp(self, x):
         return np.power(10.0, x * 0.05)

-
     def apply_preemphasis(self, x):
         return signal.lfilter([1, -self.preemphasis], [1], x)

-
     def apply_inv_preemphasis(self, x):
         return signal.lfilter([1], [1, -self.preemphasis], x)

-
     def spectrogram(self, y):
         D = self._stft(self.apply_preemphasis(y))
         S = self._amp_to_db(np.abs(D)) - self.ref_level_db
         return self._normalize(S)

-
     def inv_spectrogram(self, spectrogram):
         '''Converts spectrogram to waveform using librosa'''
         S = self._denormalize(spectrogram)
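
With the values from config.json in this commit (num_freq 1024, frame_shift 12.5 ms, frame_length 50 ms, sample_rate 20000), _stft_parameters above evaluates to n_fft 2046, hop 250, win 1000 — consistent with the hop_length = 250 hard-coded in the synthesis helper earlier in this diff:

num_freq, frame_shift_ms, frame_length_ms, sample_rate = 1024, 12.5, 50.0, 20000
n_fft = (num_freq - 1) * 2
hop_length = int(frame_shift_ms / 1000 * sample_rate)
win_length = int(frame_length_ms / 1000 * sample_rate)
print(n_fft, hop_length, win_length)  # 2046 250 1000
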
@@ -85,7 +73,6 @@ class AudioProcessor(object):
         # Reconstruct phase
         return self.apply_inv_preemphasis(self._griffin_lim(S ** self.power))

-
     def _griffin_lim(self, S):
         '''librosa implementation of Griffin-Lim
         Based on https://github.com/librosa/librosa/issues/434
@@ -98,13 +85,11 @@ class AudioProcessor(object):
             y = self._istft(S_complex * angles)
         return y

-
     def melspectrogram(self, y):
         D = self._stft(self.apply_preemphasis(y))
         S = self._amp_to_db(self._linear_to_mel(np.abs(D))) - self.ref_level_db
         return self._normalize(S)

-
     def _stft(self, y):
         n_fft, hop_length, win_length = self._stft_parameters()
         return librosa.stft(y=y, n_fft=n_fft, hop_length=hop_length, win_length=win_length)
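
_griffin_lim is only partially visible here; the librosa-issue-434 loop it cites alternates STFT analysis and inverse-STFT synthesis while pinning the magnitude to the target. A sketch under that assumption, written against the class's own _stft/_istft wrappers:

import numpy as np

def griffin_lim(S, stft, istft, n_iters=60):
    # S: target magnitude spectrogram; only the phase is iterated
    angles = np.exp(2j * np.pi * np.random.rand(*S.shape))
    S_complex = np.abs(S).astype(complex)
    y = istft(S_complex * angles)
    for _ in range(n_iters):
        angles = np.exp(1j * np.angle(stft(y)))
        y = istft(S_complex * angles)
    return y

# usage sketch: y = griffin_lim(S, ap._stft, ap._istft, n_iters=ap.griffin_lim_iters)
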
@@ -113,7 +98,6 @@ class AudioProcessor(object):
         _, hop_length, win_length = self._stft_parameters()
         return librosa.istft(y, hop_length=hop_length, win_length=win_length)

-
     def find_endpoint(self, wav, threshold_db=-40, min_silence_sec=0.8):
         window_length = int(self.sample_rate * min_silence_sec)
         hop_length = int(window_length / 4)
|
@ -17,11 +17,13 @@ def prepare_data(inputs):
|
||||||
def _pad_tensor(x, length):
|
def _pad_tensor(x, length):
|
||||||
_pad = 0
|
_pad = 0
|
||||||
assert x.ndim == 2
|
assert x.ndim == 2
|
||||||
x = np.pad(x, [[0, 0], [0, length - x.shape[1]]], mode='constant', constant_values=_pad)
|
x = np.pad(x, [[0, 0], [0, length - x.shape[1]]],
|
||||||
|
mode='constant', constant_values=_pad)
|
||||||
return x
|
return x
|
||||||
|
|
||||||
|
|
||||||
def prepare_tensor(inputs, out_steps):
|
def prepare_tensor(inputs, out_steps):
|
||||||
max_len = max((x.shape[1] for x in inputs)) + 1 # zero-frame
|
max_len = max((x.shape[1] for x in inputs)) + 1 # zero-frame
|
||||||
remainder = max_len % out_steps
|
remainder = max_len % out_steps
|
||||||
pad_len = max_len + (out_steps - remainder) if remainder > 0 else max_len
|
pad_len = max_len + (out_steps - remainder) if remainder > 0 else max_len
|
||||||
return np.stack([_pad_tensor(x, pad_len) for x in inputs])
|
return np.stack([_pad_tensor(x, pad_len) for x in inputs])
|
||||||
|
|
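
A worked example of the padding math above: with spectrograms of 7 and 9 frames, max_len becomes 10 (zero-frame included); out_steps=5 divides 10 evenly, so both items are right-padded to 10 frames:

import numpy as np

def _pad_tensor(x, length):
    _pad = 0
    assert x.ndim == 2
    return np.pad(x, [[0, 0], [0, length - x.shape[1]]],
                  mode='constant', constant_values=_pad)

def prepare_tensor(inputs, out_steps):
    max_len = max((x.shape[1] for x in inputs)) + 1  # zero-frame
    remainder = max_len % out_steps
    pad_len = max_len + (out_steps - remainder) if remainder > 0 else max_len
    return np.stack([_pad_tensor(x, pad_len) for x in inputs])

batch = [np.ones((80, 7)), np.ones((80, 9))]
print(prepare_tensor(batch, out_steps=5).shape)  # (2, 80, 10)
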
|
@ -19,7 +19,7 @@ class AttrDict(dict):
|
||||||
def load_config(config_path):
|
def load_config(config_path):
|
||||||
config = AttrDict()
|
config = AttrDict()
|
||||||
config.update(json.load(open(config_path, "r")))
|
config.update(json.load(open(config_path, "r")))
|
||||||
return config
|
return config
|
||||||
|
|
||||||
|
|
||||||
def create_experiment_folder(root_path):
|
def create_experiment_folder(root_path):
|
||||||
|
@@ -56,7 +56,7 @@ def _trim_model_state_dict(state_dict):

     new_state_dict = OrderedDict()
     for k, v in state_dict.items():
         name = k[7:]  # remove `module.`
         new_state_dict[name] = v
     return new_state_dict
@@ -90,7 +90,8 @@ def save_best_model(model, optimizer, model_loss, best_loss, out_path,
         best_loss = model_loss
         bestmodel_path = 'best_model.pth.tar'
         bestmodel_path = os.path.join(out_path, bestmodel_path)
-        print("\n | > Best model saving with loss {0:.2f} : {1:}".format(model_loss, bestmodel_path))
+        print("\n | > Best model saving with loss {0:.2f} : {1:}".format(
+            model_loss, bestmodel_path))
         torch.save(state, bestmodel_path)
         return best_loss
@@ -1,4 +1,4 @@
-#-*- coding: utf-8 -*-
+# -*- coding: utf-8 -*-

 import re
 from TTS.utils.text import cleaners
@@ -1,4 +1,4 @@
-#-*- coding: utf-8 -*-
+# -*- coding: utf-8 -*-


 '''
@@ -1,4 +1,4 @@
-#-*- coding: utf-8 -*-
+# -*- coding: utf-8 -*-


 import re
@@ -1,4 +1,4 @@
-#-*- coding: utf-8 -*-
+# -*- coding: utf-8 -*-

 import inflect
 import re
@@ -1,4 +1,4 @@
-#-*- coding: utf-8 -*-
+# -*- coding: utf-8 -*-


 '''
@@ -5,7 +5,7 @@ import matplotlib.pyplot as plt


 def plot_alignment(alignment, info=None):
-    fig, ax = plt.subplots(figsize=(16,10))
+    fig, ax = plt.subplots(figsize=(16, 10))
     im = ax.imshow(alignment.T, aspect='auto', origin='lower',
                    interpolation='none')
     fig.colorbar(im, ax=ax)