mirror of https://github.com/coqui-ai/TTS.git
best model ever changes
This commit is contained in:
parent
52653124e3
commit
664d299fc4
|
@ -14,7 +14,7 @@
|
||||||
"epochs": 2000,
|
"epochs": 2000,
|
||||||
"lr": 0.0006,
|
"lr": 0.0006,
|
||||||
"warmup_steps": 4000,
|
"warmup_steps": 4000,
|
||||||
"batch_size": 180,
|
"batch_size": 32,
|
||||||
"r": 5,
|
"r": 5,
|
||||||
|
|
||||||
"griffin_lim_iters": 60,
|
"griffin_lim_iters": 60,
|
||||||
|
|
|
@ -26,6 +26,7 @@ class LJSpeechDataset(Dataset):
|
||||||
frame_length_ms, preemphasis, ref_level_db, num_freq, power)
|
frame_length_ms, preemphasis, ref_level_db, num_freq, power)
|
||||||
print(" > Reading LJSpeech from - {}".format(root_dir))
|
print(" > Reading LJSpeech from - {}".format(root_dir))
|
||||||
print(" | > Number of instances : {}".format(len(self.frames)))
|
print(" | > Number of instances : {}".format(len(self.frames)))
|
||||||
|
self._sort_frames()
|
||||||
|
|
||||||
def load_wav(self, filename):
|
def load_wav(self, filename):
|
||||||
try:
|
try:
|
||||||
|
@ -34,6 +35,20 @@ class LJSpeechDataset(Dataset):
|
||||||
except RuntimeError as e:
|
except RuntimeError as e:
|
||||||
print(" !! Cannot read file : {}".format(filename))
|
print(" !! Cannot read file : {}".format(filename))
|
||||||
|
|
||||||
|
def _sort_frames(self):
|
||||||
|
r"""Sort sequences in ascending order"""
|
||||||
|
lengths = np.array([len(ins[1]) for ins in self.frames])
|
||||||
|
|
||||||
|
print(" | > Max length sequence {}".format(np.max(lengths)))
|
||||||
|
print(" | > Min length sequence {}".format(np.min(lengths)))
|
||||||
|
print(" | > Avg length sequence {}".format(np.mean(lengths)))
|
||||||
|
|
||||||
|
idxs = np.argsort(lengths)
|
||||||
|
new_frames = [None] * len(lengths)
|
||||||
|
for i, idx in enumerate(idxs):
|
||||||
|
new_frames[i] = self.frames[idx]
|
||||||
|
self.frames = new_frames
|
||||||
|
|
||||||
def __len__(self):
|
def __len__(self):
|
||||||
return len(self.frames)
|
return len(self.frames)
|
||||||
|
|
||||||
|
@ -47,9 +62,17 @@ class LJSpeechDataset(Dataset):
|
||||||
return sample
|
return sample
|
||||||
|
|
||||||
def get_dummy_data(self):
|
def get_dummy_data(self):
|
||||||
|
r"""Get a dummy input for testing"""
|
||||||
return torch.autograd.Variable(torch.ones(16, 143)).type(torch.LongTensor)
|
return torch.autograd.Variable(torch.ones(16, 143)).type(torch.LongTensor)
|
||||||
|
|
||||||
def collate_fn(self, batch):
|
def collate_fn(self, batch):
|
||||||
|
r"""
|
||||||
|
Perform preprocessing and create a final data batch:
|
||||||
|
1. PAD sequences with the longest sequence in the batch
|
||||||
|
2. Convert Audio signal to Spectrograms.
|
||||||
|
3. PAD sequences that can be divided by r.
|
||||||
|
4. Convert Numpy to Torch tensors.
|
||||||
|
"""
|
||||||
|
|
||||||
# Puts each data field into a tensor with outer dimension batch size
|
# Puts each data field into a tensor with outer dimension batch size
|
||||||
if isinstance(batch[0], collections.Mapping):
|
if isinstance(batch[0], collections.Mapping):
|
||||||
|
|
|
@ -5,26 +5,27 @@ from torch.nn import functional as F
|
||||||
|
|
||||||
|
|
||||||
class BahdanauAttention(nn.Module):
|
class BahdanauAttention(nn.Module):
|
||||||
def __init__(self, dim):
|
def __init__(self, annot_dim, query_dim, hidden_dim):
|
||||||
super(BahdanauAttention, self).__init__()
|
super(BahdanauAttention, self).__init__()
|
||||||
self.query_layer = nn.Linear(dim, dim, bias=False)
|
self.query_layer = nn.Linear(query_dim, hidden_dim, bias=True)
|
||||||
self.tanh = nn.Tanh()
|
self.annot_layer = nn.Linear(annot_dim, hidden_dim, bias=True)
|
||||||
self.v = nn.Linear(dim, 1, bias=False)
|
self.v = nn.Linear(hidden_dim, 1, bias=False)
|
||||||
|
|
||||||
def forward(self, query, processed_inputs):
|
def forward(self, annots, query):
|
||||||
"""
|
"""
|
||||||
Args:
|
Shapes:
|
||||||
query: (batch, 1, dim) or (batch, dim)
|
- query: (batch, 1, dim) or (batch, dim)
|
||||||
processed_inputs: (batch, max_time, dim)
|
- annots: (batch, max_time, dim)
|
||||||
"""
|
"""
|
||||||
if query.dim() == 2:
|
if query.dim() == 2:
|
||||||
# insert time-axis for broadcasting
|
# insert time-axis for broadcasting
|
||||||
query = query.unsqueeze(1)
|
query = query.unsqueeze(1)
|
||||||
# (batch, 1, dim)
|
# (batch, 1, dim)
|
||||||
processed_query = self.query_layer(query)
|
processed_query = self.query_layer(query)
|
||||||
|
processed_annots = self.annot_layer(annots)
|
||||||
|
|
||||||
# (batch, max_time, 1)
|
# (batch, max_time, 1)
|
||||||
alignment = self.v(self.tanh(processed_query + processed_inputs))
|
alignment = self.v(nn.functional.tanh(processed_query + processed_annots))
|
||||||
|
|
||||||
# (batch, max_time)
|
# (batch, max_time)
|
||||||
return alignment.squeeze(-1)
|
return alignment.squeeze(-1)
|
||||||
|
@ -34,7 +35,7 @@ def get_mask_from_lengths(inputs, inputs_lengths):
|
||||||
"""Get mask tensor from list of length
|
"""Get mask tensor from list of length
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
inputs: (batch, max_time, dim)
|
inputs: Tensor in size (batch, max_time, dim)
|
||||||
inputs_lengths: array like
|
inputs_lengths: array like
|
||||||
"""
|
"""
|
||||||
mask = inputs.data.new(inputs.size(0), inputs.size(1)).byte().zero_()
|
mask = inputs.data.new(inputs.size(0), inputs.size(1)).byte().zero_()
|
||||||
|
@ -43,52 +44,48 @@ def get_mask_from_lengths(inputs, inputs_lengths):
|
||||||
return ~mask
|
return ~mask
|
||||||
|
|
||||||
|
|
||||||
class AttentionWrapper(nn.Module):
|
class AttentionRNN(nn.Module):
|
||||||
def __init__(self, rnn_cell, alignment_model,
|
def __init__(self, out_dim, annot_dim, memory_dim,
|
||||||
score_mask_value=-float("inf")):
|
score_mask_value=-float("inf")):
|
||||||
super(AttentionWrapper, self).__init__()
|
super(AttentionRNN, self).__init__()
|
||||||
self.rnn_cell = rnn_cell
|
self.rnn_cell = nn.GRUCell(annot_dim + memory_dim, out_dim)
|
||||||
self.alignment_model = alignment_model
|
self.alignment_model = BahdanauAttention(annot_dim, out_dim, out_dim)
|
||||||
self.score_mask_value = score_mask_value
|
self.score_mask_value = score_mask_value
|
||||||
|
|
||||||
def forward(self, query, context_vec, cell_state, inputs,
|
def forward(self, memory, context, rnn_state, annotations,
|
||||||
processed_inputs=None, mask=None, inputs_lengths=None):
|
mask=None, annotations_lengths=None):
|
||||||
|
|
||||||
if processed_inputs is None:
|
if annotations_lengths is not None and mask is None:
|
||||||
processed_inputs = inputs
|
mask = get_mask_from_lengths(annotations, annotations_lengths)
|
||||||
|
|
||||||
if inputs_lengths is not None and mask is None:
|
|
||||||
mask = get_mask_from_lengths(inputs, inputs_lengths)
|
|
||||||
|
|
||||||
# Alignment
|
# Alignment
|
||||||
# (batch, max_time)
|
# (batch, max_time)
|
||||||
# e_{ij} = a(s_{i-1}, h_j)
|
# e_{ij} = a(s_{i-1}, h_j)
|
||||||
# import ipdb
|
alignment = self.alignment_model(annotations, rnn_state)
|
||||||
# ipdb.set_trace()
|
|
||||||
alignment = self.alignment_model(cell_state, processed_inputs)
|
|
||||||
|
|
||||||
|
# TODO: needs recheck.
|
||||||
if mask is not None:
|
if mask is not None:
|
||||||
mask = mask.view(query.size(0), -1)
|
mask = mask.view(query.size(0), -1)
|
||||||
alignment.data.masked_fill_(mask, self.score_mask_value)
|
alignment.data.masked_fill_(mask, self.score_mask_value)
|
||||||
|
|
||||||
# Normalize context_vec weight
|
# Normalize context weight
|
||||||
alignment = F.softmax(alignment, dim=-1)
|
alignment = F.softmax(alignment, dim=-1)
|
||||||
|
|
||||||
# Attention context vector
|
# Attention context vector
|
||||||
# (batch, 1, dim)
|
# (batch, 1, dim)
|
||||||
# c_i = \sum_{j=1}^{T_x} \alpha_{ij} h_j
|
# c_i = \sum_{j=1}^{T_x} \alpha_{ij} h_j
|
||||||
context_vec = torch.bmm(alignment.unsqueeze(1), inputs)
|
context = torch.bmm(alignment.unsqueeze(1), annotations)
|
||||||
context_vec = context_vec.squeeze(1)
|
context = context.squeeze(1)
|
||||||
|
|
||||||
# Concat input query and previous context_vec context
|
# Concat input query and previous context context
|
||||||
cell_input = torch.cat((query, context_vec), -1)
|
rnn_input = torch.cat((memory, context), -1)
|
||||||
#cell_input = cell_input.unsqueeze(1)
|
#rnn_input = rnn_input.unsqueeze(1)
|
||||||
|
|
||||||
# Feed it to RNN
|
# Feed it to RNN
|
||||||
# s_i = f(y_{i-1}, c_{i}, s_{i-1})
|
# s_i = f(y_{i-1}, c_{i}, s_{i-1})
|
||||||
cell_output = self.rnn_cell(cell_input, cell_state)
|
rnn_output = self.rnn_cell(rnn_input, rnn_state)
|
||||||
|
|
||||||
context_vec = context_vec.squeeze(1)
|
context = context.squeeze(1)
|
||||||
return cell_output, context_vec, alignment
|
return rnn_output, context, alignment
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -3,7 +3,7 @@ import torch
|
||||||
from torch.autograd import Variable
|
from torch.autograd import Variable
|
||||||
from torch import nn
|
from torch import nn
|
||||||
|
|
||||||
from .attention import BahdanauAttention, AttentionWrapper
|
from .attention import AttentionRNN
|
||||||
from .attention import get_mask_from_lengths
|
from .attention import get_mask_from_lengths
|
||||||
|
|
||||||
class Prenet(nn.Module):
|
class Prenet(nn.Module):
|
||||||
|
@ -219,15 +219,10 @@ class Decoder(nn.Module):
|
||||||
self.memory_dim = memory_dim
|
self.memory_dim = memory_dim
|
||||||
self.eps = eps
|
self.eps = eps
|
||||||
self.r = r
|
self.r = r
|
||||||
# input -> |Linear| -> processed_inputs
|
|
||||||
self.input_layer = nn.Linear(in_features, 256, bias=False)
|
|
||||||
# memory -> |Prenet| -> processed_memory
|
# memory -> |Prenet| -> processed_memory
|
||||||
self.prenet = Prenet(memory_dim * r, out_features=[256, 128])
|
self.prenet = Prenet(memory_dim * r, out_features=[256, 128])
|
||||||
# processed_inputs, processed_memory -> |Attention| -> Attention, Alignment, RNN_State
|
# processed_inputs, processed_memory -> |Attention| -> Attention, Alignment, RNN_State
|
||||||
self.attention_rnn = AttentionWrapper(
|
self.attention_rnn = AttentionRNN(256, in_features, 128)
|
||||||
nn.GRUCell(in_features + 128, 256),
|
|
||||||
BahdanauAttention(256)
|
|
||||||
)
|
|
||||||
# (processed_memory | attention context) -> |Linear| -> decoder_RNN_input
|
# (processed_memory | attention context) -> |Linear| -> decoder_RNN_input
|
||||||
self.project_to_decoder_in = nn.Linear(256+in_features, 256)
|
self.project_to_decoder_in = nn.Linear(256+in_features, 256)
|
||||||
# decoder_RNN_input -> |RNN| -> RNN_state
|
# decoder_RNN_input -> |RNN| -> RNN_state
|
||||||
|
@ -245,9 +240,9 @@ class Decoder(nn.Module):
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
inputs: Encoder outputs.
|
inputs: Encoder outputs.
|
||||||
memory: Decoder memory (autoregression. If None (at eval-time),
|
memory (None): Decoder memory (autoregression. If None (at eval-time),
|
||||||
decoder outputs are used as decoder inputs.
|
decoder outputs are used as decoder inputs.
|
||||||
input_lengths: Encoder output (memory) lengths. If not None, used for
|
input_lengths (None): input lengths, used for
|
||||||
attention masking.
|
attention masking.
|
||||||
|
|
||||||
Shapes:
|
Shapes:
|
||||||
|
@ -256,12 +251,11 @@ class Decoder(nn.Module):
|
||||||
"""
|
"""
|
||||||
B = inputs.size(0)
|
B = inputs.size(0)
|
||||||
|
|
||||||
# TODO: take this segment into Attention module.
|
|
||||||
processed_inputs = self.input_layer(inputs)
|
# if input_lengths is not None:
|
||||||
if input_lengths is not None:
|
# mask = get_mask_from_lengths(processed_inputs, input_lengths)
|
||||||
mask = get_mask_from_lengths(processed_inputs, input_lengths)
|
# else:
|
||||||
else:
|
# mask = None
|
||||||
mask = None
|
|
||||||
|
|
||||||
# Run greedy decoding if memory is None
|
# Run greedy decoding if memory is None
|
||||||
greedy = memory is None
|
greedy = memory is None
|
||||||
|
@ -300,20 +294,7 @@ class Decoder(nn.Module):
|
||||||
memory_input = initial_memory
|
memory_input = initial_memory
|
||||||
while True:
|
while True:
|
||||||
if t > 0:
|
if t > 0:
|
||||||
# using harmonized teacher-forcing.
|
memory_input = outputs[-1] if greedy else memory[t - 1]
|
||||||
# from https://arxiv.org/abs/1707.06588
|
|
||||||
if greedy:
|
|
||||||
memory_input = outputs[-1]
|
|
||||||
else:
|
|
||||||
# combine prev. model output and prev. real target
|
|
||||||
memory_input = torch.div(outputs[-1] + memory[t-1], 2.0)
|
|
||||||
memory_input = torch.nn.functional.dropout(memory_input,
|
|
||||||
0.1,
|
|
||||||
training=True)
|
|
||||||
# add a random noise
|
|
||||||
noise = torch.autograd.Variable(
|
|
||||||
memory_input.data.new(memory_input.size()).normal_(0.0, 1.0))
|
|
||||||
memory_input = memory_input + noise
|
|
||||||
|
|
||||||
# Prenet
|
# Prenet
|
||||||
processed_memory = self.prenet(memory_input)
|
processed_memory = self.prenet(memory_input)
|
||||||
|
@ -321,7 +302,7 @@ class Decoder(nn.Module):
|
||||||
# Attention RNN
|
# Attention RNN
|
||||||
attention_rnn_hidden, current_context_vec, alignment = self.attention_rnn(
|
attention_rnn_hidden, current_context_vec, alignment = self.attention_rnn(
|
||||||
processed_memory, current_context_vec, attention_rnn_hidden,
|
processed_memory, current_context_vec, attention_rnn_hidden,
|
||||||
inputs, processed_inputs=processed_inputs, mask=mask)
|
inputs)
|
||||||
|
|
||||||
# Concat RNN output and attention context vector
|
# Concat RNN output and attention context vector
|
||||||
decoder_input = self.project_to_decoder_in(
|
decoder_input = self.project_to_decoder_in(
|
||||||
|
|
|
@ -9,11 +9,11 @@ from TTS.layers.tacotron import Prenet, Encoder, Decoder, CBHG
|
||||||
class Tacotron(nn.Module):
|
class Tacotron(nn.Module):
|
||||||
def __init__(self, embedding_dim=256, linear_dim=1025, mel_dim=80,
|
def __init__(self, embedding_dim=256, linear_dim=1025, mel_dim=80,
|
||||||
freq_dim=1025, r=5, padding_idx=None,
|
freq_dim=1025, r=5, padding_idx=None,
|
||||||
use_memory_mask=False):
|
use_atten_mask=False):
|
||||||
super(Tacotron, self).__init__()
|
super(Tacotron, self).__init__()
|
||||||
self.mel_dim = mel_dim
|
self.mel_dim = mel_dim
|
||||||
self.linear_dim = linear_dim
|
self.linear_dim = linear_dim
|
||||||
self.use_memory_mask = use_memory_mask
|
self.use_atten_mask = use_atten_mask
|
||||||
self.embedding = nn.Embedding(len(symbols), embedding_dim,
|
self.embedding = nn.Embedding(len(symbols), embedding_dim,
|
||||||
padding_idx=padding_idx)
|
padding_idx=padding_idx)
|
||||||
print(" | > Embedding dim : {}".format(len(symbols)))
|
print(" | > Embedding dim : {}".format(len(symbols)))
|
||||||
|
@ -33,9 +33,7 @@ class Tacotron(nn.Module):
|
||||||
# (B, T', in_dim)
|
# (B, T', in_dim)
|
||||||
encoder_outputs = self.encoder(inputs)
|
encoder_outputs = self.encoder(inputs)
|
||||||
|
|
||||||
if self.use_memory_mask:
|
if not self.use_atten_mask:
|
||||||
input_lengths = input_lengths
|
|
||||||
else:
|
|
||||||
input_lengths = None
|
input_lengths = None
|
||||||
|
|
||||||
# (B, T', mel_dim*r)
|
# (B, T', mel_dim*r)
|
||||||
|
|
20
train.py
20
train.py
|
@ -199,7 +199,7 @@ def evaluate(model, criterion, data_loader, current_step):
|
||||||
model = model.train()
|
model = model.train()
|
||||||
epoch_time = 0
|
epoch_time = 0
|
||||||
|
|
||||||
print("\n | > Validation")
|
print(" | > Validation")
|
||||||
n_priority_freq = int(3000 / (c.sample_rate * 0.5) * c.num_freq)
|
n_priority_freq = int(3000 / (c.sample_rate * 0.5) * c.num_freq)
|
||||||
progbar = Progbar(len(data_loader.dataset) / c.batch_size)
|
progbar = Progbar(len(data_loader.dataset) / c.batch_size)
|
||||||
|
|
||||||
|
@ -246,10 +246,10 @@ def evaluate(model, criterion, data_loader, current_step):
|
||||||
('mel_loss', mel_loss.data[0])])
|
('mel_loss', mel_loss.data[0])])
|
||||||
|
|
||||||
avg_linear_loss += linear_loss.data[0]
|
avg_linear_loss += linear_loss.data[0]
|
||||||
avg_mel_loss += avg_mel_loss.data[0]
|
avg_mel_loss += mel_loss.data[0]
|
||||||
|
|
||||||
# Diagnostic visualizations
|
# Diagnostic visualizations
|
||||||
idx = np.random.randint(c.batch_size)
|
idx = np.random.randint(mel_input.shape[0])
|
||||||
const_spec = linear_output[idx].data.cpu().numpy()
|
const_spec = linear_output[idx].data.cpu().numpy()
|
||||||
gt_spec = linear_spec_var[idx].data.cpu().numpy()
|
gt_spec = linear_spec_var[idx].data.cpu().numpy()
|
||||||
align_img = alignments[idx].data.cpu().numpy()
|
align_img = alignments[idx].data.cpu().numpy()
|
||||||
|
@ -270,7 +270,7 @@ def evaluate(model, criterion, data_loader, current_step):
|
||||||
tb.add_audio('ValSampleAudio', audio_signal, current_step,
|
tb.add_audio('ValSampleAudio', audio_signal, current_step,
|
||||||
sample_rate=c.sample_rate)
|
sample_rate=c.sample_rate)
|
||||||
except:
|
except:
|
||||||
print("\n > Error at audio signal on TB!!")
|
print(" | > Error at audio signal on TB!!")
|
||||||
print(audio_signal.max())
|
print(audio_signal.max())
|
||||||
print(audio_signal.min())
|
print(audio_signal.min())
|
||||||
|
|
||||||
|
@ -305,8 +305,8 @@ def main(args):
|
||||||
)
|
)
|
||||||
|
|
||||||
train_loader = DataLoader(train_dataset, batch_size=c.batch_size,
|
train_loader = DataLoader(train_dataset, batch_size=c.batch_size,
|
||||||
shuffle=True, collate_fn=train_dataset.collate_fn,
|
shuffle=False, collate_fn=train_dataset.collate_fn,
|
||||||
drop_last=True, num_workers=c.num_loader_workers,
|
drop_last=False, num_workers=c.num_loader_workers,
|
||||||
pin_memory=True)
|
pin_memory=True)
|
||||||
|
|
||||||
val_dataset = LJSpeechDataset(os.path.join(c.data_path, 'metadata_val.csv'),
|
val_dataset = LJSpeechDataset(os.path.join(c.data_path, 'metadata_val.csv'),
|
||||||
|
@ -325,15 +325,16 @@ def main(args):
|
||||||
)
|
)
|
||||||
|
|
||||||
val_loader = DataLoader(val_dataset, batch_size=c.batch_size,
|
val_loader = DataLoader(val_dataset, batch_size=c.batch_size,
|
||||||
shuffle=True, collate_fn=val_dataset.collate_fn,
|
shuffle=False, collate_fn=val_dataset.collate_fn,
|
||||||
drop_last=True, num_workers= 4,
|
drop_last=False, num_workers= 4,
|
||||||
pin_memory=True)
|
pin_memory=True)
|
||||||
|
|
||||||
model = Tacotron(c.embedding_size,
|
model = Tacotron(c.embedding_size,
|
||||||
c.hidden_size,
|
c.hidden_size,
|
||||||
c.num_mels,
|
c.num_mels,
|
||||||
c.num_freq,
|
c.num_freq,
|
||||||
c.r)
|
c.r,
|
||||||
|
use_atten_mask=True)
|
||||||
|
|
||||||
optimizer = optim.Adam(model.parameters(), lr=c.lr)
|
optimizer = optim.Adam(model.parameters(), lr=c.lr)
|
||||||
|
|
||||||
|
@ -352,6 +353,7 @@ def main(args):
|
||||||
start_epoch = 0
|
start_epoch = 0
|
||||||
args.restore_step = checkpoint['step']
|
args.restore_step = checkpoint['step']
|
||||||
else:
|
else:
|
||||||
|
args.restore_step = 0
|
||||||
print("\n > Starting a new training")
|
print("\n > Starting a new training")
|
||||||
|
|
||||||
if use_cuda:
|
if use_cuda:
|
||||||
|
|
Loading…
Reference in New Issue