mirror of https://github.com/coqui-ai/TTS.git
best model ever changes
This commit is contained in:
parent 405fbc434e
commit b4032e8dff
@@ -14,7 +14,7 @@
     "epochs": 2000,
     "lr": 0.0006,
     "warmup_steps": 4000,
-    "batch_size": 180,
+    "batch_size": 32,
     "r": 5,
 
     "griffin_lim_iters": 60,
@@ -26,6 +26,7 @@ class LJSpeechDataset(Dataset):
                          frame_length_ms, preemphasis, ref_level_db, num_freq, power)
         print(" > Reading LJSpeech from - {}".format(root_dir))
         print(" | > Number of instances : {}".format(len(self.frames)))
+        self._sort_frames()
 
     def load_wav(self, filename):
         try:
@@ -34,6 +35,20 @@ class LJSpeechDataset(Dataset):
         except RuntimeError as e:
             print(" !! Cannot read file : {}".format(filename))
 
+    def _sort_frames(self):
+        r"""Sort sequences in ascending order"""
+        lengths = np.array([len(ins[1]) for ins in self.frames])
+
+        print(" | > Max length sequence {}".format(np.max(lengths)))
+        print(" | > Min length sequence {}".format(np.min(lengths)))
+        print(" | > Avg length sequence {}".format(np.mean(lengths)))
+
+        idxs = np.argsort(lengths)
+        new_frames = [None] * len(lengths)
+        for i, idx in enumerate(idxs):
+            new_frames[i] = self.frames[idx]
+        self.frames = new_frames
+
     def __len__(self):
         return len(self.frames)
 
@@ -47,9 +62,17 @@ class LJSpeechDataset(Dataset):
         return sample
 
+    def get_dummy_data(self):
+        r"""Get a dummy input for testing"""
+        return torch.autograd.Variable(torch.ones(16, 143)).type(torch.LongTensor)
+
     def collate_fn(self, batch):
         r"""
             Perform preprocessing and create a final data batch:
+            1. PAD sequences with the longest sequence in the batch
+            2. Convert Audio signal to Spectrograms.
+            3. PAD sequences that can be divided by r.
+            4. Convert Numpy to Torch tensors.
         """
 
         # Puts each data field into a tensor with outer dimension batch size
         if isinstance(batch[0], collections.Mapping):
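The collate_fn docstring above lists the batching steps; the sketch below illustrates steps 1 and 3 (pad to the longest sequence in the batch, then round the padded length up to a multiple of r). It is a standalone example with made-up helper and variable names, not the repo's actual collate code.

import numpy as np

def pad_to_multiple_of_r(seqs, r=5, pad_value=0.0):
    # Pad 1-D sequences to the longest one in the batch, then round that length
    # up to the next multiple of r so the decoder can emit r frames per step.
    max_len = max(len(s) for s in seqs)
    if max_len % r != 0:
        max_len += r - (max_len % r)
    out = np.full((len(seqs), max_len), pad_value, dtype=np.float32)
    for i, s in enumerate(seqs):
        out[i, :len(s)] = s
    return out

batch = [np.ones(7), np.ones(12), np.ones(9)]
print(pad_to_multiple_of_r(batch, r=5).shape)  # (3, 15): 12 rounded up to 15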
@@ -5,26 +5,27 @@ from torch.nn import functional as F
 
 class BahdanauAttention(nn.Module):
-    def __init__(self, dim):
+    def __init__(self, annot_dim, query_dim, hidden_dim):
         super(BahdanauAttention, self).__init__()
-        self.query_layer = nn.Linear(dim, dim, bias=False)
-        self.tanh = nn.Tanh()
-        self.v = nn.Linear(dim, 1, bias=False)
+        self.query_layer = nn.Linear(query_dim, hidden_dim, bias=True)
+        self.annot_layer = nn.Linear(annot_dim, hidden_dim, bias=True)
+        self.v = nn.Linear(hidden_dim, 1, bias=False)
 
-    def forward(self, query, processed_inputs):
+    def forward(self, annots, query):
         """
-        Args:
-            query: (batch, 1, dim) or (batch, dim)
-            processed_inputs: (batch, max_time, dim)
+        Shapes:
+            - query: (batch, 1, dim) or (batch, dim)
+            - annots: (batch, max_time, dim)
         """
         if query.dim() == 2:
             # insert time-axis for broadcasting
             query = query.unsqueeze(1)
         # (batch, 1, dim)
         processed_query = self.query_layer(query)
+        processed_annots = self.annot_layer(annots)
 
         # (batch, max_time, 1)
-        alignment = self.v(self.tanh(processed_query + processed_inputs))
+        alignment = self.v(nn.functional.tanh(processed_query + processed_annots))
 
         # (batch, max_time)
         return alignment.squeeze(-1)
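For orientation, the rewritten BahdanauAttention scores each encoder frame with e_ij = v^T tanh(W_query s_{i-1} + W_annot h_j). A minimal standalone sketch of that computation follows; the dimensions are illustrative, and torch.tanh is used in place of the deprecated nn.functional.tanh called in the diff.

import torch
import torch.nn as nn

annot_dim, query_dim, hidden_dim = 256, 256, 256
query_layer = nn.Linear(query_dim, hidden_dim, bias=True)
annot_layer = nn.Linear(annot_dim, hidden_dim, bias=True)
v = nn.Linear(hidden_dim, 1, bias=False)

annots = torch.randn(4, 37, annot_dim)  # (batch, max_time, dim) encoder outputs
query = torch.randn(4, query_dim)       # (batch, dim) previous decoder state

processed_query = query_layer(query).unsqueeze(1)   # (batch, 1, hidden_dim)
processed_annots = annot_layer(annots)               # (batch, max_time, hidden_dim)
alignment = v(torch.tanh(processed_query + processed_annots)).squeeze(-1)
print(alignment.shape)  # torch.Size([4, 37]) -> one score per encoder frame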
@@ -34,7 +35,7 @@ def get_mask_from_lengths(inputs, inputs_lengths):
     """Get mask tensor from list of length
 
     Args:
-        inputs: (batch, max_time, dim)
+        inputs: Tensor in size (batch, max_time, dim)
         inputs_lengths: array like
     """
     mask = inputs.data.new(inputs.size(0), inputs.size(1)).byte().zero_()
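get_mask_from_lengths builds a padding mask from per-utterance lengths so attention scores over padded frames can be filled with the mask value. The loop that marks valid positions sits outside this hunk, so the snippet below is only an equivalent-idea sketch using boolean tensors rather than the byte tensors used here.

import torch

def length_mask(lengths, max_time):
    # True where a time step is padding, mirroring the "return ~mask" convention above.
    steps = torch.arange(max_time).unsqueeze(0)          # (1, max_time)
    valid = steps < torch.tensor(lengths).unsqueeze(1)   # (batch, max_time)
    return ~valid

print(length_mask([3, 5], max_time=6))
# tensor([[False, False, False,  True,  True,  True],
#         [False, False, False, False, False,  True]])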
@@ -43,52 +44,48 @@ def get_mask_from_lengths(inputs, inputs_lengths):
     return ~mask
 
 
-class AttentionWrapper(nn.Module):
-    def __init__(self, rnn_cell, alignment_model,
+class AttentionRNN(nn.Module):
+    def __init__(self, out_dim, annot_dim, memory_dim,
                  score_mask_value=-float("inf")):
-        super(AttentionWrapper, self).__init__()
-        self.rnn_cell = rnn_cell
-        self.alignment_model = alignment_model
+        super(AttentionRNN, self).__init__()
+        self.rnn_cell = nn.GRUCell(annot_dim + memory_dim, out_dim)
+        self.alignment_model = BahdanauAttention(annot_dim, out_dim, out_dim)
         self.score_mask_value = score_mask_value
 
-    def forward(self, query, context_vec, cell_state, inputs,
-                processed_inputs=None, mask=None, inputs_lengths=None):
+    def forward(self, memory, context, rnn_state, annotations,
+                mask=None, annotations_lengths=None):
 
-        if processed_inputs is None:
-            processed_inputs = inputs
-
-        if inputs_lengths is not None and mask is None:
-            mask = get_mask_from_lengths(inputs, inputs_lengths)
+        if annotations_lengths is not None and mask is None:
+            mask = get_mask_from_lengths(annotations, annotations_lengths)
 
         # Alignment
         # (batch, max_time)
         # e_{ij} = a(s_{i-1}, h_j)
-        # import ipdb
-        # ipdb.set_trace()
-        alignment = self.alignment_model(cell_state, processed_inputs)
+        alignment = self.alignment_model(annotations, rnn_state)
 
+        # TODO: needs recheck.
         if mask is not None:
             mask = mask.view(query.size(0), -1)
             alignment.data.masked_fill_(mask, self.score_mask_value)
 
-        # Normalize context_vec weight
+        # Normalize context weight
         alignment = F.softmax(alignment, dim=-1)
 
         # Attention context vector
         # (batch, 1, dim)
         # c_i = \sum_{j=1}^{T_x} \alpha_{ij} h_j
-        context_vec = torch.bmm(alignment.unsqueeze(1), inputs)
-        context_vec = context_vec.squeeze(1)
+        context = torch.bmm(alignment.unsqueeze(1), annotations)
+        context = context.squeeze(1)
 
-        # Concat input query and previous context_vec context
-        cell_input = torch.cat((query, context_vec), -1)
-        #cell_input = cell_input.unsqueeze(1)
+        # Concat input query and previous context context
+        rnn_input = torch.cat((memory, context), -1)
+        #rnn_input = rnn_input.unsqueeze(1)
 
         # Feed it to RNN
         # s_i = f(y_{i-1}, c_{i}, s_{i-1})
-        cell_output = self.rnn_cell(cell_input, cell_state)
+        rnn_output = self.rnn_cell(rnn_input, rnn_state)
 
-        context_vec = context_vec.squeeze(1)
-        return cell_output, context_vec, alignment
+        context = context.squeeze(1)
+        return rnn_output, context, alignment
 
 
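Assuming this commit's module layout (TTS/layers/attention.py on the import path), a single decoding step through the renamed AttentionRNN could look like the following; the batch size, time length, and dimensions are made up, and the argument order follows the forward signature defined in this hunk.

import torch
from TTS.layers.attention import AttentionRNN  # path assumed from this commit

out_dim, annot_dim, memory_dim = 256, 256, 128
attn_rnn = AttentionRNN(out_dim, annot_dim, memory_dim)

B, T = 4, 37
annotations = torch.randn(B, T, annot_dim)  # encoder outputs
memory = torch.randn(B, memory_dim)         # prenet output for this step
context = torch.zeros(B, annot_dim)         # previous attention context
rnn_state = torch.zeros(B, out_dim)         # previous attention-RNN state

rnn_state, context, alignment = attn_rnn(memory, context, rnn_state, annotations)
print(rnn_state.shape, context.shape, alignment.shape)
# torch.Size([4, 256]) torch.Size([4, 256]) torch.Size([4, 37])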
@@ -3,7 +3,7 @@ import torch
 from torch.autograd import Variable
 from torch import nn
 
-from .attention import BahdanauAttention, AttentionWrapper
+from .attention import AttentionRNN
 from .attention import get_mask_from_lengths
 
 class Prenet(nn.Module):
@@ -219,15 +219,10 @@ class Decoder(nn.Module):
         self.memory_dim = memory_dim
         self.eps = eps
         self.r = r
-        # input -> |Linear| -> processed_inputs
-        self.input_layer = nn.Linear(in_features, 256, bias=False)
         # memory -> |Prenet| -> processed_memory
         self.prenet = Prenet(memory_dim * r, out_features=[256, 128])
         # processed_inputs, processed_memory -> |Attention| -> Attention, Alignment, RNN_State
-        self.attention_rnn = AttentionWrapper(
-            nn.GRUCell(in_features + 128, 256),
-            BahdanauAttention(256)
-        )
+        self.attention_rnn = AttentionRNN(256, in_features, 128)
         # (processed_memory | attention context) -> |Linear| -> decoder_RNN_input
         self.project_to_decoder_in = nn.Linear(256+in_features, 256)
         # decoder_RNN_input -> |RNN| -> RNN_state
@@ -245,9 +240,9 @@ class Decoder(nn.Module):
 
         Args:
             inputs: Encoder outputs.
-            memory: Decoder memory (autoregression. If None (at eval-time),
+            memory (None): Decoder memory (autoregression. If None (at eval-time),
               decoder outputs are used as decoder inputs.
-            input_lengths: Encoder output (memory) lengths. If not None, used for
+            input_lengths (None): input lengths, used for
               attention masking.
 
         Shapes:
@@ -256,12 +251,11 @@ class Decoder(nn.Module):
         """
         B = inputs.size(0)
 
         # TODO: take this segment into Attention module.
-        processed_inputs = self.input_layer(inputs)
-        if input_lengths is not None:
-            mask = get_mask_from_lengths(processed_inputs, input_lengths)
-        else:
-            mask = None
+        # if input_lengths is not None:
+        #     mask = get_mask_from_lengths(processed_inputs, input_lengths)
+        # else:
+        #     mask = None
 
         # Run greedy decoding if memory is None
         greedy = memory is None
@@ -300,20 +294,7 @@ class Decoder(nn.Module):
             memory_input = initial_memory
         while True:
             if t > 0:
-                # using harmonized teacher-forcing.
-                # from https://arxiv.org/abs/1707.06588
-                if greedy:
-                    memory_input = outputs[-1]
-                else:
-                    # combine prev. model output and prev. real target
-                    memory_input = torch.div(outputs[-1] + memory[t-1], 2.0)
-                    memory_input = torch.nn.functional.dropout(memory_input,
-                                                                0.1,
-                                                                training=True)
-                    # add a random noise
-                    noise = torch.autograd.Variable(
-                        memory_input.data.new(memory_input.size()).normal_(0.0, 1.0))
-                    memory_input = memory_input + noise
+                memory_input = outputs[-1] if greedy else memory[t - 1]
 
             # Prenet
             processed_memory = self.prenet(memory_input)
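This hunk drops the "harmonized" teacher-forcing variant (averaging the previous prediction with the previous ground-truth frame, then applying dropout and additive noise, per https://arxiv.org/abs/1707.06588) in favour of plain teacher forcing. A compact contrast of the two behaviours, with dummy tensors standing in for decoder outputs and targets:

import torch

outputs = [torch.randn(4, 400)]   # last decoder prediction, (B, mel_dim * r)
memory = torch.randn(10, 4, 400)  # ground-truth frames indexed by decoder step
t, greedy = 3, False

# Behaviour after this commit: last prediction at eval time, previous target otherwise.
memory_input = outputs[-1] if greedy else memory[t - 1]

# Removed "harmonized" variant: average prediction and target, then perturb.
harmonized = torch.div(outputs[-1] + memory[t - 1], 2.0)
harmonized = torch.nn.functional.dropout(harmonized, 0.1, training=True)
harmonized = harmonized + torch.randn_like(harmonized)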
@@ -321,7 +302,7 @@ class Decoder(nn.Module):
             # Attention RNN
             attention_rnn_hidden, current_context_vec, alignment = self.attention_rnn(
                 processed_memory, current_context_vec, attention_rnn_hidden,
-                inputs, processed_inputs=processed_inputs, mask=mask)
+                inputs)
 
             # Concat RNN output and attention context vector
             decoder_input = self.project_to_decoder_in(
@@ -9,11 +9,11 @@ from TTS.layers.tacotron import Prenet, Encoder, Decoder, CBHG
 class Tacotron(nn.Module):
     def __init__(self, embedding_dim=256, linear_dim=1025, mel_dim=80,
                  freq_dim=1025, r=5, padding_idx=None,
-                 use_memory_mask=False):
+                 use_atten_mask=False):
         super(Tacotron, self).__init__()
         self.mel_dim = mel_dim
         self.linear_dim = linear_dim
-        self.use_memory_mask = use_memory_mask
+        self.use_atten_mask = use_atten_mask
         self.embedding = nn.Embedding(len(symbols), embedding_dim,
                                       padding_idx=padding_idx)
         print(" | > Embedding dim : {}".format(len(symbols)))
@@ -33,9 +33,7 @@ class Tacotron(nn.Module):
         # (B, T', in_dim)
         encoder_outputs = self.encoder(inputs)
 
-        if self.use_memory_mask:
-            input_lengths = input_lengths
-        else:
+        if not self.use_atten_mask:
             input_lengths = None
 
         # (B, T', mel_dim*r)
train.py
@@ -199,7 +199,7 @@ def evaluate(model, criterion, data_loader, current_step):
     model = model.train()
     epoch_time = 0
 
-    print("\n | > Validation")
+    print(" | > Validation")
     n_priority_freq = int(3000 / (c.sample_rate * 0.5) * c.num_freq)
     progbar = Progbar(len(data_loader.dataset) / c.batch_size)
 
@@ -246,10 +246,10 @@ def evaluate(model, criterion, data_loader, current_step):
                                 ('mel_loss', mel_loss.data[0])])
 
         avg_linear_loss += linear_loss.data[0]
-        avg_mel_loss += avg_mel_loss.data[0]
+        avg_mel_loss += mel_loss.data[0]
 
     # Diagnostic visualizations
-    idx = np.random.randint(c.batch_size)
+    idx = np.random.randint(mel_input.shape[0])
     const_spec = linear_output[idx].data.cpu().numpy()
     gt_spec = linear_spec_var[idx].data.cpu().numpy()
     align_img = alignments[idx].data.cpu().numpy()
@@ -270,7 +270,7 @@ def evaluate(model, criterion, data_loader, current_step):
         tb.add_audio('ValSampleAudio', audio_signal, current_step,
                      sample_rate=c.sample_rate)
     except:
-        print("\n > Error at audio signal on TB!!")
+        print(" | > Error at audio signal on TB!!")
         print(audio_signal.max())
         print(audio_signal.min())
 
@@ -305,8 +305,8 @@ def main(args):
                                   )
 
     train_loader = DataLoader(train_dataset, batch_size=c.batch_size,
-                              shuffle=True, collate_fn=train_dataset.collate_fn,
-                              drop_last=True, num_workers=c.num_loader_workers,
+                              shuffle=False, collate_fn=train_dataset.collate_fn,
+                              drop_last=False, num_workers=c.num_loader_workers,
                               pin_memory=True)
 
     val_dataset = LJSpeechDataset(os.path.join(c.data_path, 'metadata_val.csv'),
@@ -325,15 +325,16 @@ def main(args):
                                 )
 
     val_loader = DataLoader(val_dataset, batch_size=c.batch_size,
-                            shuffle=True, collate_fn=val_dataset.collate_fn,
-                            drop_last=True, num_workers= 4,
+                            shuffle=False, collate_fn=val_dataset.collate_fn,
+                            drop_last=False, num_workers= 4,
                             pin_memory=True)
 
     model = Tacotron(c.embedding_size,
                      c.hidden_size,
                      c.num_mels,
                      c.num_freq,
-                     c.r)
+                     c.r,
+                     use_atten_mask=True)
 
     optimizer = optim.Adam(model.parameters(), lr=c.lr)
 
@@ -352,6 +353,7 @@ def main(args):
         start_epoch = 0
         args.restore_step = checkpoint['step']
     else:
+        args.restore_step = 0
         print("\n > Starting a new training")
 
     if use_cuda: