best model ever changes

Eren Golge 2018-03-07 06:58:51 -08:00
parent 52653124e3
commit 664d299fc4
6 changed files with 80 additions and 79 deletions

View File

@@ -14,7 +14,7 @@
     "epochs": 2000,
     "lr": 0.0006,
     "warmup_steps": 4000,
-    "batch_size": 180,
+    "batch_size": 32,
     "r": 5,
     "griffin_lim_iters": 60,

View File

@@ -26,6 +26,7 @@ class LJSpeechDataset(Dataset):
                               frame_length_ms, preemphasis, ref_level_db, num_freq, power)
         print(" > Reading LJSpeech from - {}".format(root_dir))
         print(" | > Number of instances : {}".format(len(self.frames)))
+        self._sort_frames()
 
     def load_wav(self, filename):
         try:
@@ -34,6 +35,20 @@ class LJSpeechDataset(Dataset):
         except RuntimeError as e:
             print(" !! Cannot read file : {}".format(filename))
 
+    def _sort_frames(self):
+        r"""Sort sequences in ascending order"""
+        lengths = np.array([len(ins[1]) for ins in self.frames])
+        print(" | > Max length sequence {}".format(np.max(lengths)))
+        print(" | > Min length sequence {}".format(np.min(lengths)))
+        print(" | > Avg length sequence {}".format(np.mean(lengths)))
+        idxs = np.argsort(lengths)
+        new_frames = [None] * len(lengths)
+        for i, idx in enumerate(idxs):
+            new_frames[i] = self.frames[idx]
+        self.frames = new_frames
+
     def __len__(self):
         return len(self.frames)
@@ -47,9 +62,17 @@ class LJSpeechDataset(Dataset):
         return sample
 
     def get_dummy_data(self):
+        r"""Get a dummy input for testing"""
         return torch.autograd.Variable(torch.ones(16, 143)).type(torch.LongTensor)
 
     def collate_fn(self, batch):
+        r"""
+        Perform preprocessing and create a final data batch:
+        1. PAD sequences with the longest sequence in the batch
+        2. Convert Audio signal to Spectrograms.
+        3. PAD sequences that can be divided by r.
+        4. Convert Numpy to Torch tensors.
+        """
         # Puts each data field into a tensor with outer dimension batch size
         if isinstance(batch[0], collections.Mapping):
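The new _sort_frames() pass orders the metadata by transcript length; paired with the shuffle=False loaders in the train.py hunks below, consecutive batches then contain sequences of similar length, so padding to the longest item in a batch wastes less compute. A small standalone sketch of the idea (toy data, not the project's API):

import numpy as np

# Toy "frames": (wav_id, transcript) pairs, as in the LJSpeech metadata.
frames = [("a", "hello there"), ("b", "hi"), ("c", "a much longer example sentence"), ("d", "ok")]

# Sort ascending by transcript length, mirroring _sort_frames() above.
lengths = np.array([len(text) for _, text in frames])
frames = [frames[i] for i in np.argsort(lengths)]

# Without shuffling, sequential batches now group similar lengths.
batch_size = 2
batches = [frames[i:i + batch_size] for i in range(0, len(frames), batch_size)]
print([[text for _, text in batch] for batch in batches])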

View File

@@ -5,26 +5,27 @@ from torch.nn import functional as F
 
 class BahdanauAttention(nn.Module):
-    def __init__(self, dim):
+    def __init__(self, annot_dim, query_dim, hidden_dim):
         super(BahdanauAttention, self).__init__()
-        self.query_layer = nn.Linear(dim, dim, bias=False)
-        self.tanh = nn.Tanh()
-        self.v = nn.Linear(dim, 1, bias=False)
+        self.query_layer = nn.Linear(query_dim, hidden_dim, bias=True)
+        self.annot_layer = nn.Linear(annot_dim, hidden_dim, bias=True)
+        self.v = nn.Linear(hidden_dim, 1, bias=False)
 
-    def forward(self, query, processed_inputs):
+    def forward(self, annots, query):
         """
-        Args:
-            query: (batch, 1, dim) or (batch, dim)
-            processed_inputs: (batch, max_time, dim)
+        Shapes:
+            - query: (batch, 1, dim) or (batch, dim)
+            - annots: (batch, max_time, dim)
         """
         if query.dim() == 2:
             # insert time-axis for broadcasting
             query = query.unsqueeze(1)
         # (batch, 1, dim)
         processed_query = self.query_layer(query)
+        processed_annots = self.annot_layer(annots)
         # (batch, max_time, 1)
-        alignment = self.v(self.tanh(processed_query + processed_inputs))
+        alignment = self.v(nn.functional.tanh(processed_query + processed_annots))
         # (batch, max_time)
         return alignment.squeeze(-1)
@@ -34,7 +35,7 @@ def get_mask_from_lengths(inputs, inputs_lengths):
     """Get mask tensor from list of length
     Args:
-        inputs: (batch, max_time, dim)
+        inputs: Tensor in size (batch, max_time, dim)
         inputs_lengths: array like
     """
     mask = inputs.data.new(inputs.size(0), inputs.size(1)).byte().zero_()
@@ -43,52 +44,48 @@ def get_mask_from_lengths(inputs, inputs_lengths):
     return ~mask
 
-class AttentionWrapper(nn.Module):
-    def __init__(self, rnn_cell, alignment_model,
+class AttentionRNN(nn.Module):
+    def __init__(self, out_dim, annot_dim, memory_dim,
                  score_mask_value=-float("inf")):
-        super(AttentionWrapper, self).__init__()
-        self.rnn_cell = rnn_cell
-        self.alignment_model = alignment_model
+        super(AttentionRNN, self).__init__()
+        self.rnn_cell = nn.GRUCell(annot_dim + memory_dim, out_dim)
+        self.alignment_model = BahdanauAttention(annot_dim, out_dim, out_dim)
         self.score_mask_value = score_mask_value
 
-    def forward(self, query, context_vec, cell_state, inputs,
-                processed_inputs=None, mask=None, inputs_lengths=None):
-        if processed_inputs is None:
-            processed_inputs = inputs
-        if inputs_lengths is not None and mask is None:
-            mask = get_mask_from_lengths(inputs, inputs_lengths)
+    def forward(self, memory, context, rnn_state, annotations,
+                mask=None, annotations_lengths=None):
+        if annotations_lengths is not None and mask is None:
+            mask = get_mask_from_lengths(annotations, annotations_lengths)
 
         # Alignment
         # (batch, max_time)
         # e_{ij} = a(s_{i-1}, h_j)
-        # import ipdb
-        # ipdb.set_trace()
-        alignment = self.alignment_model(cell_state, processed_inputs)
-        # TODO: needs recheck.
+        alignment = self.alignment_model(annotations, rnn_state)
         if mask is not None:
             mask = mask.view(query.size(0), -1)
             alignment.data.masked_fill_(mask, self.score_mask_value)
 
-        # Normalize context_vec weight
+        # Normalize context weight
         alignment = F.softmax(alignment, dim=-1)
 
         # Attention context vector
         # (batch, 1, dim)
         # c_i = \sum_{j=1}^{T_x} \alpha_{ij} h_j
-        context_vec = torch.bmm(alignment.unsqueeze(1), inputs)
-        context_vec = context_vec.squeeze(1)
+        context = torch.bmm(alignment.unsqueeze(1), annotations)
+        context = context.squeeze(1)
 
-        # Concat input query and previous context_vec context
-        cell_input = torch.cat((query, context_vec), -1)
-        #cell_input = cell_input.unsqueeze(1)
+        # Concat input query and previous context context
+        rnn_input = torch.cat((memory, context), -1)
+        #rnn_input = rnn_input.unsqueeze(1)
 
         # Feed it to RNN
         # s_i = f(y_{i-1}, c_{i}, s_{i-1})
-        cell_output = self.rnn_cell(cell_input, cell_state)
+        rnn_output = self.rnn_cell(rnn_input, rnn_state)
 
-        context_vec = context_vec.squeeze(1)
-        return cell_output, context_vec, alignment
+        context = context.squeeze(1)
+        return rnn_output, context, alignment
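The rewritten module is standard additive (Bahdanau) attention, e_ij = v^T tanh(W_q s_{i-1} + W_a h_j), now with separate projections for the decoder state (query) and the encoder outputs (annotations), and with the GRU cell built inside AttentionRNN instead of being passed in. A self-contained shape check of the same computation, with all dimensions assumed to be 256 for simplicity:

import torch
import torch.nn as nn
import torch.nn.functional as F

batch, max_time = 2, 10
annot_dim, query_dim, hidden_dim = 256, 256, 256

query_layer = nn.Linear(query_dim, hidden_dim, bias=True)   # W_q
annot_layer = nn.Linear(annot_dim, hidden_dim, bias=True)   # W_a
v = nn.Linear(hidden_dim, 1, bias=False)                     # v

annots = torch.randn(batch, max_time, annot_dim)   # encoder outputs h_j
query = torch.randn(batch, query_dim)              # decoder state s_{i-1}

# e_ij = v^T tanh(W_q s_{i-1} + W_a h_j), broadcast over the time axis
scores = v(torch.tanh(query_layer(query).unsqueeze(1) + annot_layer(annots))).squeeze(-1)
alpha = F.softmax(scores, dim=-1)                            # (batch, max_time)
context = torch.bmm(alpha.unsqueeze(1), annots).squeeze(1)   # c_i, (batch, annot_dim)
print(alpha.shape, context.shape)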

View File

@@ -3,7 +3,7 @@ import torch
 from torch.autograd import Variable
 from torch import nn
 
-from .attention import BahdanauAttention, AttentionWrapper
+from .attention import AttentionRNN
 from .attention import get_mask_from_lengths
 
 class Prenet(nn.Module):
@@ -219,15 +219,10 @@ class Decoder(nn.Module):
         self.memory_dim = memory_dim
         self.eps = eps
         self.r = r
-        # input -> |Linear| -> processed_inputs
-        self.input_layer = nn.Linear(in_features, 256, bias=False)
         # memory -> |Prenet| -> processed_memory
         self.prenet = Prenet(memory_dim * r, out_features=[256, 128])
         # processed_inputs, processed_memory -> |Attention| -> Attention, Alignment, RNN_State
-        self.attention_rnn = AttentionWrapper(
-            nn.GRUCell(in_features + 128, 256),
-            BahdanauAttention(256)
-        )
+        self.attention_rnn = AttentionRNN(256, in_features, 128)
         # (processed_memory | attention context) -> |Linear| -> decoder_RNN_input
         self.project_to_decoder_in = nn.Linear(256+in_features, 256)
         # decoder_RNN_input -> |RNN| -> RNN_state
@@ -245,9 +240,9 @@ class Decoder(nn.Module):
         Args:
             inputs: Encoder outputs.
-            memory: Decoder memory (autoregression. If None (at eval-time),
+            memory (None): Decoder memory (autoregression. If None (at eval-time),
                 decoder outputs are used as decoder inputs.
-            input_lengths: Encoder output (memory) lengths. If not None, used for
+            input_lengths (None): input lengths, used for
                 attention masking.
 
         Shapes:
@@ -256,12 +251,11 @@ class Decoder(nn.Module):
         """
         B = inputs.size(0)
 
-        # TODO: take this segment into Attention module.
-        processed_inputs = self.input_layer(inputs)
-        if input_lengths is not None:
-            mask = get_mask_from_lengths(processed_inputs, input_lengths)
-        else:
-            mask = None
+        # if input_lengths is not None:
+        #     mask = get_mask_from_lengths(processed_inputs, input_lengths)
+        # else:
+        #     mask = None
 
         # Run greedy decoding if memory is None
         greedy = memory is None
@@ -300,20 +294,7 @@ class Decoder(nn.Module):
             memory_input = initial_memory
         while True:
             if t > 0:
-                # using harmonized teacher-forcing.
-                # from https://arxiv.org/abs/1707.06588
-                if greedy:
-                    memory_input = outputs[-1]
-                else:
-                    # combine prev. model output and prev. real target
-                    memory_input = torch.div(outputs[-1] + memory[t-1], 2.0)
-                    memory_input = torch.nn.functional.dropout(memory_input,
-                                                               0.1,
-                                                               training=True)
-                    # add a random noise
-                    noise = torch.autograd.Variable(
-                        memory_input.data.new(memory_input.size()).normal_(0.0, 1.0))
-                    memory_input = memory_input + noise
+                memory_input = outputs[-1] if greedy else memory[t - 1]
 
             # Prenet
             processed_memory = self.prenet(memory_input)
@@ -321,7 +302,7 @@ class Decoder(nn.Module):
             # Attention RNN
             attention_rnn_hidden, current_context_vec, alignment = self.attention_rnn(
                 processed_memory, current_context_vec, attention_rnn_hidden,
-                inputs, processed_inputs=processed_inputs, mask=mask)
+                inputs)
 
             # Concat RNN output and attention context vector
             decoder_input = self.project_to_decoder_in(
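The decoder loop also drops the "harmonized teacher forcing" of arXiv:1707.06588 (average of the previous prediction and the previous ground-truth frame, plus dropout and Gaussian noise) in favor of plain teacher forcing: feed the ground-truth frame when memory is given, otherwise the model's own last output. A side-by-side sketch of the two per-step input rules, with frame sizes assumed:

import torch

def harmonized_input(prev_output, prev_target):
    # Scheme removed by this commit: mix prediction and target, then perturb.
    mixed = torch.div(prev_output + prev_target, 2.0)
    mixed = torch.nn.functional.dropout(mixed, 0.1, training=True)
    return mixed + torch.randn_like(mixed)

def teacher_forced_input(prev_output, prev_target, greedy):
    # Scheme after this commit: plain teacher forcing.
    return prev_output if greedy else prev_target

prev_output = torch.zeros(4, 80 * 5)   # (batch, mel_dim * r), sizes assumed
prev_target = torch.ones(4, 80 * 5)
print(harmonized_input(prev_output, prev_target).shape,
      teacher_forced_input(prev_output, prev_target, greedy=False).shape)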

View File

@@ -9,11 +9,11 @@ from TTS.layers.tacotron import Prenet, Encoder, Decoder, CBHG
 class Tacotron(nn.Module):
     def __init__(self, embedding_dim=256, linear_dim=1025, mel_dim=80,
                  freq_dim=1025, r=5, padding_idx=None,
-                 use_memory_mask=False):
+                 use_atten_mask=False):
         super(Tacotron, self).__init__()
         self.mel_dim = mel_dim
         self.linear_dim = linear_dim
-        self.use_memory_mask = use_memory_mask
+        self.use_atten_mask = use_atten_mask
         self.embedding = nn.Embedding(len(symbols), embedding_dim,
                                       padding_idx=padding_idx)
         print(" | > Embedding dim : {}".format(len(symbols)))
@@ -33,9 +33,7 @@ class Tacotron(nn.Module):
         # (B, T', in_dim)
         encoder_outputs = self.encoder(inputs)
 
-        if self.use_memory_mask:
-            input_lengths = input_lengths
-        else:
+        if not self.use_atten_mask:
             input_lengths = None
 
         # (B, T', mel_dim*r)
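The renamed flag now gates attention masking directly: with use_atten_mask=False the forward pass drops input_lengths before they reach the decoder, so no mask is built. A one-line illustrative helper (not part of the model):

def effective_input_lengths(input_lengths, use_atten_mask):
    # Mirrors the updated Tacotron.forward(): lengths survive only when masking is enabled.
    return input_lengths if use_atten_mask else None

print(effective_input_lengths([7, 5, 9], use_atten_mask=False))  # None -> no attention mask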

View File

@@ -199,7 +199,7 @@ def evaluate(model, criterion, data_loader, current_step):
     model = model.train()
     epoch_time = 0
 
-    print("\n | > Validation")
+    print(" | > Validation")
     n_priority_freq = int(3000 / (c.sample_rate * 0.5) * c.num_freq)
     progbar = Progbar(len(data_loader.dataset) / c.batch_size)
@@ -246,10 +246,10 @@ def evaluate(model, criterion, data_loader, current_step):
                                 ('mel_loss', mel_loss.data[0])])
 
         avg_linear_loss += linear_loss.data[0]
-        avg_mel_loss += avg_mel_loss.data[0]
+        avg_mel_loss += mel_loss.data[0]
 
     # Diagnostic visualizations
-    idx = np.random.randint(c.batch_size)
+    idx = np.random.randint(mel_input.shape[0])
     const_spec = linear_output[idx].data.cpu().numpy()
     gt_spec = linear_spec_var[idx].data.cpu().numpy()
     align_img = alignments[idx].data.cpu().numpy()
@@ -270,7 +270,7 @@ def evaluate(model, criterion, data_loader, current_step):
         tb.add_audio('ValSampleAudio', audio_signal, current_step,
                      sample_rate=c.sample_rate)
     except:
-        print("\n > Error at audio signal on TB!!")
+        print(" | > Error at audio signal on TB!!")
         print(audio_signal.max())
         print(audio_signal.min())
@@ -305,8 +305,8 @@ def main(args):
                                    )
 
     train_loader = DataLoader(train_dataset, batch_size=c.batch_size,
-                              shuffle=True, collate_fn=train_dataset.collate_fn,
-                              drop_last=True, num_workers=c.num_loader_workers,
+                              shuffle=False, collate_fn=train_dataset.collate_fn,
+                              drop_last=False, num_workers=c.num_loader_workers,
                               pin_memory=True)
 
     val_dataset = LJSpeechDataset(os.path.join(c.data_path, 'metadata_val.csv'),
@@ -325,15 +325,16 @@ def main(args):
                                  )
 
     val_loader = DataLoader(val_dataset, batch_size=c.batch_size,
-                            shuffle=True, collate_fn=val_dataset.collate_fn,
-                            drop_last=True, num_workers= 4,
+                            shuffle=False, collate_fn=val_dataset.collate_fn,
+                            drop_last=False, num_workers= 4,
                             pin_memory=True)
 
     model = Tacotron(c.embedding_size,
                      c.hidden_size,
                      c.num_mels,
                      c.num_freq,
-                     c.r)
+                     c.r,
+                     use_atten_mask=True)
 
     optimizer = optim.Adam(model.parameters(), lr=c.lr)
@@ -352,6 +353,7 @@ def main(args):
         start_epoch = 0
         args.restore_step = checkpoint['step']
     else:
+        args.restore_step = 0
         print("\n > Starting a new training")
 
     if use_cuda:
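Two details follow from the loader changes: with drop_last=False the final batch can be smaller than c.batch_size, so the evaluation code now draws the visualization index from the batch actually in hand (mel_input.shape[0]) rather than the configured batch size, and mel loss accumulation now adds mel_loss instead of re-adding avg_mel_loss. A tiny sketch of why the index bound matters (shapes assumed):

import numpy as np

last_batch = np.zeros((20, 80))  # e.g. a short final batch when drop_last=False

# Old: np.random.randint(c.batch_size) with batch_size 32 could index past the
# 20 items actually present and raise IndexError.
# New: bound the draw by the batch in hand, as the diff does with mel_input.shape[0].
idx = np.random.randint(last_batch.shape[0])
print(idx, last_batch[idx].shape)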