best model ever changes

Eren Golge 2018-03-07 06:58:51 -08:00
parent 405fbc434e
commit b4032e8dff
6 changed files with 80 additions and 79 deletions

View File

@@ -14,7 +14,7 @@
     "epochs": 2000,
     "lr": 0.0006,
     "warmup_steps": 4000,
-    "batch_size": 180,
+    "batch_size": 32,
     "r": 5,
     "griffin_lim_iters": 60,

View File

@@ -26,6 +26,7 @@ class LJSpeechDataset(Dataset):
                          frame_length_ms, preemphasis, ref_level_db, num_freq, power)
         print(" > Reading LJSpeech from - {}".format(root_dir))
         print(" | > Number of instances : {}".format(len(self.frames)))
+        self._sort_frames()
 
     def load_wav(self, filename):
         try:
@@ -34,6 +35,20 @@ class LJSpeechDataset(Dataset):
         except RuntimeError as e:
             print(" !! Cannot read file : {}".format(filename))
 
+    def _sort_frames(self):
+        r"""Sort sequences in ascending order"""
+        lengths = np.array([len(ins[1]) for ins in self.frames])
+        print(" | > Max length sequence {}".format(np.max(lengths)))
+        print(" | > Min length sequence {}".format(np.min(lengths)))
+        print(" | > Avg length sequence {}".format(np.mean(lengths)))
+        idxs = np.argsort(lengths)
+        new_frames = [None] * len(lengths)
+        for i, idx in enumerate(idxs):
+            new_frames[i] = self.frames[idx]
+        self.frames = new_frames
+
     def __len__(self):
         return len(self.frames)
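
Note: _sort_frames orders the metadata by text length so that, with an unshuffled loader, each batch groups similarly sized sequences and pads far less. A toy sketch of the effect (names are illustrative, not from this commit):

    import numpy as np

    texts = ["hi", "hello there", "hey", "good morning all"]
    lengths = np.array([len(t) for t in texts])
    order = np.argsort(lengths)              # ascending, as in _sort_frames
    texts = [texts[i] for i in order]        # ["hi", "hey", "hello there", "good morning all"]
    # With batch_size=2 and shuffle=False, each batch pads only to its own
    # longest member instead of the global maximum length.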
@@ -47,9 +62,17 @@ class LJSpeechDataset(Dataset):
         return sample
 
     def get_dummy_data(self):
         r"""Get a dummy input for testing"""
         return torch.autograd.Variable(torch.ones(16, 143)).type(torch.LongTensor)
 
     def collate_fn(self, batch):
+        r"""
+        Perform preprocessing and create a final data batch:
+        1. PAD sequences to the longest sequence in the batch.
+        2. Convert audio signals to spectrograms.
+        3. PAD sequences so that they can be divided by r.
+        4. Convert numpy arrays to Torch tensors.
+        """
         # Puts each data field into a tensor with outer dimension batch size
         if isinstance(batch[0], collections.Mapping):
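
Note: step 3 of the new docstring pads the spectrogram time axis up to a multiple of the reduction factor r, because the decoder emits r frames per step. A minimal sketch of that padding (assumed shapes; not the repo's actual helper):

    import numpy as np

    def pad_to_multiple_of_r(spec, r=5):
        # spec: (T, num_freq); pad the time axis so that T % r == 0
        pad = (r - spec.shape[0] % r) % r
        return np.pad(spec, [(0, pad), (0, 0)], mode="constant")

    assert pad_to_multiple_of_r(np.zeros((13, 1025))).shape[0] == 15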

View File

@@ -5,26 +5,27 @@ from torch.nn import functional as F
 
 class BahdanauAttention(nn.Module):
-    def __init__(self, dim):
+    def __init__(self, annot_dim, query_dim, hidden_dim):
         super(BahdanauAttention, self).__init__()
-        self.query_layer = nn.Linear(dim, dim, bias=False)
-        self.tanh = nn.Tanh()
-        self.v = nn.Linear(dim, 1, bias=False)
+        self.query_layer = nn.Linear(query_dim, hidden_dim, bias=True)
+        self.annot_layer = nn.Linear(annot_dim, hidden_dim, bias=True)
+        self.v = nn.Linear(hidden_dim, 1, bias=False)
 
-    def forward(self, query, processed_inputs):
+    def forward(self, annots, query):
         """
-        Args:
-            query: (batch, 1, dim) or (batch, dim)
-            processed_inputs: (batch, max_time, dim)
+        Shapes:
+            - query: (batch, 1, dim) or (batch, dim)
+            - annots: (batch, max_time, dim)
         """
         if query.dim() == 2:
             # insert time-axis for broadcasting
             query = query.unsqueeze(1)
         # (batch, 1, dim)
         processed_query = self.query_layer(query)
+        processed_annots = self.annot_layer(annots)
         # (batch, max_time, 1)
-        alignment = self.v(self.tanh(processed_query + processed_inputs))
+        alignment = self.v(nn.functional.tanh(processed_query + processed_annots))
         # (batch, max_time)
         return alignment.squeeze(-1)
@@ -34,7 +35,7 @@ def get_mask_from_lengths(inputs, inputs_lengths):
     """Get mask tensor from list of lengths
 
     Args:
-        inputs: (batch, max_time, dim)
+        inputs: Tensor in size (batch, max_time, dim)
        inputs_lengths: array like
     """
     mask = inputs.data.new(inputs.size(0), inputs.size(1)).byte().zero_()
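
Note: the mask returned by this helper is True at padded time steps. A modern equivalent of what it builds (illustrative only, not from the commit):

    import torch

    lengths = torch.tensor([3, 5])
    ids = torch.arange(5).unsqueeze(0)        # (1, max_time)
    mask = ids >= lengths.unsqueeze(1)        # (batch, max_time), True = padding
    # tensor([[False, False, False,  True,  True],
    #         [False, False, False, False, False]])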
@@ -43,52 +44,48 @@ def get_mask_from_lengths(inputs, inputs_lengths):
     return ~mask
 
-class AttentionWrapper(nn.Module):
-    def __init__(self, rnn_cell, alignment_model,
+class AttentionRNN(nn.Module):
+    def __init__(self, out_dim, annot_dim, memory_dim,
                  score_mask_value=-float("inf")):
-        super(AttentionWrapper, self).__init__()
-        self.rnn_cell = rnn_cell
-        self.alignment_model = alignment_model
+        super(AttentionRNN, self).__init__()
+        self.rnn_cell = nn.GRUCell(annot_dim + memory_dim, out_dim)
+        self.alignment_model = BahdanauAttention(annot_dim, out_dim, out_dim)
         self.score_mask_value = score_mask_value
 
-    def forward(self, query, context_vec, cell_state, inputs,
-                processed_inputs=None, mask=None, inputs_lengths=None):
-        if processed_inputs is None:
-            processed_inputs = inputs
-        if inputs_lengths is not None and mask is None:
-            mask = get_mask_from_lengths(inputs, inputs_lengths)
+    def forward(self, memory, context, rnn_state, annotations,
+                mask=None, annotations_lengths=None):
+        if annotations_lengths is not None and mask is None:
+            mask = get_mask_from_lengths(annotations, annotations_lengths)
 
         # Alignment
         # (batch, max_time)
         # e_{ij} = a(s_{i-1}, h_j)
-        # import ipdb
-        # ipdb.set_trace()
-        alignment = self.alignment_model(cell_state, processed_inputs)
+        alignment = self.alignment_model(annotations, rnn_state)
 
+        # TODO: needs recheck.
         if mask is not None:
             mask = mask.view(query.size(0), -1)
             alignment.data.masked_fill_(mask, self.score_mask_value)
 
-        # Normalize context_vec weight
+        # Normalize context weight
         alignment = F.softmax(alignment, dim=-1)
 
         # Attention context vector
         # (batch, 1, dim)
         # c_i = \sum_{j=1}^{T_x} \alpha_{ij} h_j
-        context_vec = torch.bmm(alignment.unsqueeze(1), inputs)
-        context_vec = context_vec.squeeze(1)
+        context = torch.bmm(alignment.unsqueeze(1), annotations)
+        context = context.squeeze(1)
 
-        # Concat input query and previous context_vec context
-        cell_input = torch.cat((query, context_vec), -1)
-        # cell_input = cell_input.unsqueeze(1)
+        # Concat input memory and previous context
+        rnn_input = torch.cat((memory, context), -1)
+        # rnn_input = rnn_input.unsqueeze(1)
 
         # Feed it to RNN
         # s_i = f(y_{i-1}, c_{i}, s_{i-1})
-        cell_output = self.rnn_cell(cell_input, cell_state)
+        rnn_output = self.rnn_cell(rnn_input, rnn_state)
 
-        context_vec = context_vec.squeeze(1)
-        return cell_output, context_vec, alignment
+        context = context.squeeze(1)
+        return rnn_output, context, alignment
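
Note: the refactor fuses the GRU cell and the Bahdanau score into one AttentionRNN, computing e_{ij} = v^T tanh(W_q s_{i-1} + W_a h_j) and c_i = \sum_j \alpha_{ij} h_j as the in-code comments state. A quick shape check under the new signature (import path assumed from the repo layout; the 2018-era code would wrap these tensors in Variable):

    import torch
    from TTS.layers.attention import AttentionRNN  # assumed module path

    B, T = 2, 7
    attn_rnn = AttentionRNN(out_dim=256, annot_dim=256, memory_dim=128)
    memory = torch.zeros(B, 128)      # prenet output for the current step
    rnn_state = torch.zeros(B, 256)   # previous attention-RNN state
    context = torch.zeros(B, 256)     # accepted by the API but recomputed inside
    annots = torch.zeros(B, T, 256)   # encoder outputs (annotations)
    out, ctx, align = attn_rnn(memory, context, rnn_state, annots)
    # out: (B, 256), ctx: (B, 256), align: (B, T) attention weights over time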

View File

@@ -3,7 +3,7 @@ import torch
 from torch.autograd import Variable
 from torch import nn
-from .attention import BahdanauAttention, AttentionWrapper
+from .attention import AttentionRNN
 from .attention import get_mask_from_lengths
 
 class Prenet(nn.Module):
@@ -219,15 +219,10 @@ class Decoder(nn.Module):
         self.memory_dim = memory_dim
         self.eps = eps
         self.r = r
-        # input -> |Linear| -> processed_inputs
-        self.input_layer = nn.Linear(in_features, 256, bias=False)
         # memory -> |Prenet| -> processed_memory
         self.prenet = Prenet(memory_dim * r, out_features=[256, 128])
         # processed_inputs, processed_memory -> |Attention| -> Attention, Alignment, RNN_State
-        self.attention_rnn = AttentionWrapper(
-            nn.GRUCell(in_features + 128, 256),
-            BahdanauAttention(256)
-        )
+        self.attention_rnn = AttentionRNN(256, in_features, 128)
         # (processed_memory | attention context) -> |Linear| -> decoder_RNN_input
         self.project_to_decoder_in = nn.Linear(256+in_features, 256)
         # decoder_RNN_input -> |RNN| -> RNN_state
@@ -245,9 +240,9 @@ class Decoder(nn.Module):
         Args:
             inputs: Encoder outputs.
-            memory: Decoder memory (autoregression). If None (at eval-time),
+            memory (None): Decoder memory (autoregression). If None (at eval-time),
                 decoder outputs are used as decoder inputs.
-            input_lengths: Encoder output (memory) lengths. If not None, used for
+            input_lengths (None): input lengths, used for
                 attention masking.
 
         Shapes:
@@ -256,12 +251,11 @@ class Decoder(nn.Module):
         """
         B = inputs.size(0)
 
         # TODO: take this segment into Attention module.
-        processed_inputs = self.input_layer(inputs)
-        if input_lengths is not None:
-            mask = get_mask_from_lengths(processed_inputs, input_lengths)
-        else:
-            mask = None
+        # if input_lengths is not None:
+        #     mask = get_mask_from_lengths(processed_inputs, input_lengths)
+        # else:
+        #     mask = None
 
         # Run greedy decoding if memory is None
         greedy = memory is None
@@ -300,20 +294,7 @@ class Decoder(nn.Module):
         memory_input = initial_memory
         while True:
             if t > 0:
-                # using harmonized teacher-forcing.
-                # from https://arxiv.org/abs/1707.06588
-                if greedy:
-                    memory_input = outputs[-1]
-                else:
-                    # combine prev. model output and prev. real target
-                    memory_input = torch.div(outputs[-1] + memory[t-1], 2.0)
-                    memory_input = torch.nn.functional.dropout(memory_input,
-                                                               0.1,
-                                                               training=True)
-                    # add a random noise
-                    noise = torch.autograd.Variable(
-                        memory_input.data.new(memory_input.size()).normal_(0.0, 1.0))
-                    memory_input = memory_input + noise
+                memory_input = outputs[-1] if greedy else memory[t - 1]
 
             # Prenet
             processed_memory = self.prenet(memory_input)
@@ -321,7 +302,7 @@ class Decoder(nn.Module):
 
             # Attention RNN
             attention_rnn_hidden, current_context_vec, alignment = self.attention_rnn(
                 processed_memory, current_context_vec, attention_rnn_hidden,
-                inputs, processed_inputs=processed_inputs, mask=mask)
+                inputs)
 
             # Concat RNN output and attention context vector
             decoder_input = self.project_to_decoder_in(
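
Note: the decoder change above drops the harmonized teacher forcing of https://arxiv.org/abs/1707.06588 (average the previous prediction with the previous target, then apply dropout and Gaussian noise) in favor of plain teacher forcing. A side-by-side sketch of the two input rules (illustrative only):

    import torch

    def plain_input(outputs, memory, t, greedy):
        # this commit: previous target, or the model's own output when greedy
        return outputs[-1] if greedy else memory[t - 1]

    def harmonized_input(outputs, memory, t, greedy):
        # the removed scheme (arXiv:1707.06588)
        if greedy:
            return outputs[-1]
        blended = (outputs[-1] + memory[t - 1]) / 2.0
        blended = torch.nn.functional.dropout(blended, 0.1, training=True)
        return blended + torch.randn_like(blended)  # modern stand-in for the Variable-era noise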

View File

@@ -9,11 +9,11 @@ from TTS.layers.tacotron import Prenet, Encoder, Decoder, CBHG
 
 class Tacotron(nn.Module):
     def __init__(self, embedding_dim=256, linear_dim=1025, mel_dim=80,
                  freq_dim=1025, r=5, padding_idx=None,
-                 use_memory_mask=False):
+                 use_atten_mask=False):
         super(Tacotron, self).__init__()
         self.mel_dim = mel_dim
         self.linear_dim = linear_dim
-        self.use_memory_mask = use_memory_mask
+        self.use_atten_mask = use_atten_mask
         self.embedding = nn.Embedding(len(symbols), embedding_dim,
                                       padding_idx=padding_idx)
         print(" | > Embedding dim : {}".format(len(symbols)))
@@ -33,9 +33,7 @@ class Tacotron(nn.Module):
         # (B, T', in_dim)
         encoder_outputs = self.encoder(inputs)
 
-        if self.use_memory_mask:
-            input_lengths = input_lengths
-        else:
+        if not self.use_atten_mask:
             input_lengths = None
 
         # (B, T', mel_dim*r)

View File

@@ -199,7 +199,7 @@ def evaluate(model, criterion, data_loader, current_step):
     model = model.train()
     epoch_time = 0
 
-    print("\n | > Validation")
+    print(" | > Validation")
     n_priority_freq = int(3000 / (c.sample_rate * 0.5) * c.num_freq)
     progbar = Progbar(len(data_loader.dataset) / c.batch_size)
@@ -246,10 +246,10 @@ def evaluate(model, criterion, data_loader, current_step):
                                 ('mel_loss', mel_loss.data[0])])
 
         avg_linear_loss += linear_loss.data[0]
-        avg_mel_loss += avg_mel_loss.data[0]
+        avg_mel_loss += mel_loss.data[0]
 
     # Diagnostic visualizations
-    idx = np.random.randint(c.batch_size)
+    idx = np.random.randint(mel_input.shape[0])
     const_spec = linear_output[idx].data.cpu().numpy()
     gt_spec = linear_spec_var[idx].data.cpu().numpy()
     align_img = alignments[idx].data.cpu().numpy()
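
Note: the idx fix matters because drop_last=False (set below) allows a final batch smaller than c.batch_size; sampling an index from the configured size could then run past the batch. Toy illustration:

    import numpy as np

    batch = np.zeros((13, 80))                # tail batch: 13 < c.batch_size == 32
    idx = np.random.randint(batch.shape[0])   # always a valid row index
    # np.random.randint(c.batch_size) could pick idx >= 13 and raise IndexError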
@@ -270,7 +270,7 @@ def evaluate(model, criterion, data_loader, current_step):
             tb.add_audio('ValSampleAudio', audio_signal, current_step,
                          sample_rate=c.sample_rate)
         except:
-            print("\n > Error at audio signal on TB!!")
+            print(" | > Error at audio signal on TB!!")
             print(audio_signal.max())
             print(audio_signal.min())
@@ -305,8 +305,8 @@ def main(args):
     )
 
     train_loader = DataLoader(train_dataset, batch_size=c.batch_size,
-                              shuffle=True, collate_fn=train_dataset.collate_fn,
-                              drop_last=True, num_workers=c.num_loader_workers,
+                              shuffle=False, collate_fn=train_dataset.collate_fn,
+                              drop_last=False, num_workers=c.num_loader_workers,
                               pin_memory=True)
 
     val_dataset = LJSpeechDataset(os.path.join(c.data_path, 'metadata_val.csv'),
val_dataset = LJSpeechDataset(os.path.join(c.data_path, 'metadata_val.csv'),
@@ -325,15 +325,16 @@ def main(args):
     )
 
     val_loader = DataLoader(val_dataset, batch_size=c.batch_size,
-                            shuffle=True, collate_fn=val_dataset.collate_fn,
-                            drop_last=True, num_workers=4,
+                            shuffle=False, collate_fn=val_dataset.collate_fn,
+                            drop_last=False, num_workers=4,
                             pin_memory=True)
 
     model = Tacotron(c.embedding_size,
                      c.hidden_size,
                      c.num_mels,
                      c.num_freq,
-                     c.r)
+                     c.r,
+                     use_atten_mask=True)
 
     optimizer = optim.Adam(model.parameters(), lr=c.lr)
@@ -352,6 +353,7 @@ def main(args):
         start_epoch = 0
         args.restore_step = checkpoint['step']
     else:
+        args.restore_step = 0
         print("\n > Starting a new training")
 
     if use_cuda: