mirror of https://github.com/coqui-ai/TTS.git
Data loader bug fix and Attention bug fix
This commit is contained in:
parent 632c08a638
commit 3c084177c6
@@ -12,7 +12,7 @@
     "text_cleaner": "english_cleaners",
 
     "epochs": 2000,
-    "lr": 0.0003,
+    "lr": 0.001,
     "warmup_steps": 4000,
     "batch_size": 32,
     "eval_batch_size":32,
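For orientation, here is a minimal sketch of how a JSON config with these keys might be loaded into the attribute-style object (c.lr, c.num_loader_workers) that the test and training code below rely on. The AttrDict helper is a hypothetical stand-in, not the repo's actual config utility:

import json

class AttrDict(dict):
    # hypothetical helper: expose JSON keys as attributes (c.lr, c.epochs)
    def __getattr__(self, key):
        return self[key]

with open("config.json") as f:  # assumes a config file with the keys above
    c = AttrDict(json.load(f))

print(c.lr)            # 0.001 after this commit
print(c.warmup_steps)  # 4000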
@@ -7,7 +7,7 @@ from torch.utils.data import Dataset
 
 from TTS.utils.text import text_to_sequence
 from TTS.utils.audio import AudioProcessor
-from TTS.utils.data import (prepare_data, pad_data, pad_per_step,
+from TTS.utils.data import (prepare_data, pad_per_step,
                             prepare_tensor, prepare_stop_target)
@@ -96,10 +96,10 @@ class LJSpeechDataset(Dataset):
         linear = [self.ap.spectrogram(w).astype('float32') for w in wav]
         mel = [self.ap.melspectrogram(w).astype('float32') for w in wav]
-        mel_lengths = [m.shape[1] for m in mel]
+        mel_lengths = [m.shape[1] + 1 for m in mel]  # +1 for zero-frame
 
         # compute 'stop token' targets
-        stop_targets = [np.array([0.]*mel_len) for mel_len in mel_lengths]
+        stop_targets = [np.array([0.]*(mel_len-1)) for mel_len in mel_lengths]
 
         # PAD stop targets
         stop_targets = prepare_stop_target(stop_targets, self.outputs_per_step)
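To see what the changed lines produce, here is a self-contained sketch of the stop-target layout, with the padding helper reimplemented from the TTS.utils.data hunk later in this diff (the lengths are illustrative):

import numpy as np

def _pad_stop_target(x, length):
    # pad value is 1., i.e. 'stop', mirroring the helper in TTS/utils/data.py below
    return np.pad(x, (0, length - x.shape[0]), mode='constant', constant_values=1.)

outputs_per_step = 2
mel_lengths = [5, 7]  # real frames + 1 zero-frame, as computed above
stop_targets = [np.array([0.] * (l - 1)) for l in mel_lengths]

max_len = max(t.shape[0] for t in stop_targets) + 1  # zero-frame
remainder = max_len % outputs_per_step
batch = np.stack([_pad_stop_target(t, max_len + remainder) for t in stop_targets])
print(batch)
# [[0. 0. 0. 0. 1. 1. 1. 1.]
#  [0. 0. 0. 0. 0. 0. 1. 1.]]
# zeros over real frames, ones from the appended zero-frame onward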
@@ -108,25 +108,12 @@ class LJSpeechDataset(Dataset):
         text = prepare_data(text).astype(np.int32)
         wav = prepare_data(wav)
 
-        # PAD features with largest length of the batch
-        linear = prepare_tensor(linear)
-        mel = prepare_tensor(mel)
+        # PAD features with largest length + a zero frame
+        linear = prepare_tensor(linear, self.outputs_per_step)
+        mel = prepare_tensor(mel, self.outputs_per_step)
         assert mel.shape[2] == linear.shape[2]
         timesteps = mel.shape[2]
 
-        # PAD with zeros that can be divided by outputs per step
-        if (timesteps + 1) % self.outputs_per_step != 0:
-            pad_len = self.outputs_per_step - \
-                ((timesteps + 1) % self.outputs_per_step)
-            pad_len += 1
-        else:
-            pad_len = 1
-        linear = pad_per_step(linear, pad_len)
-        mel = pad_per_step(mel, pad_len)
-
-        # update mel lengths
-        mel_lengths = [l+pad_len for l in mel_lengths]
-
         # B x T x D
         linear = linear.transpose(0, 2, 1)
         mel = mel.transpose(0, 2, 1)
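A short sketch of the resulting batch shapes, with prepare_tensor reimplemented inline from the TTS.utils.data hunk at the end of this diff (the toy spectrogram sizes are illustrative):

import numpy as np

def _pad_tensor(x, length):
    return np.pad(x, [[0, 0], [0, length - x.shape[1]]], mode='constant', constant_values=0)

def prepare_tensor(inputs, out_steps):
    max_len = max(x.shape[1] for x in inputs) + 1  # zero-frame
    remainder = max_len % out_steps
    return np.stack([_pad_tensor(x, max_len + remainder) for x in inputs])

outputs_per_step = 2
mel = [np.random.rand(80, 13), np.random.rand(80, 9)]  # one (n_mels, T) array per item
mel = prepare_tensor(mel, outputs_per_step)
print(mel.shape)              # (2, 80, 14), stacked as B x D x T
mel = mel.transpose(0, 2, 1)  # B x T x D, as the collate code returns
print(mel.shape)              # (2, 14, 80)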
@@ -48,7 +48,7 @@ class AttentionRNN(nn.Module):
     def __init__(self, out_dim, annot_dim, memory_dim,
                  score_mask_value=-float("inf")):
         super(AttentionRNN, self).__init__()
-        self.rnn_cell = nn.GRUCell(annot_dim + memory_dim, out_dim)
+        self.rnn_cell = nn.GRUCell(out_dim + memory_dim, out_dim)
         self.alignment_model = BahdanauAttention(annot_dim, out_dim, out_dim)
         self.score_mask_value = score_mask_value
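The sizing matters because nn.GRUCell validates its input width at call time: the cell consumes the memory frame concatenated with the attention context vector, and after this fix the declared width matches a context vector sized out_dim. Below is a minimal standalone sketch of that constraint, not the repo's code, with illustrative dimensions chosen so that annot_dim != out_dim (when the two coincide, as they can in practice, the old sizing goes unnoticed):

import torch
import torch.nn as nn

out_dim, annot_dim, memory_dim = 256, 128, 80
rnn_input = torch.cat((torch.rand(4, memory_dim),  # current memory frame
                       torch.rand(4, out_dim)),    # context vector sized out_dim
                      dim=-1)

good_cell = nn.GRUCell(out_dim + memory_dim, out_dim)       # the fixed sizing
print(good_cell(rnn_input, torch.zeros(4, out_dim)).shape)  # torch.Size([4, 256])

bad_cell = nn.GRUCell(annot_dim + memory_dim, out_dim)      # the pre-fix sizing
try:
    bad_cell(rnn_input, torch.zeros(4, out_dim))
except RuntimeError as err:
    print("mismatch:", err)  # input has inconsistent input_size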
@@ -304,8 +304,7 @@ class Decoder(nn.Module):
 
             # Attention RNN
             attention_rnn_hidden, current_context_vec, alignment = self.attention_rnn(
-                processed_memory, current_context_vec, attention_rnn_hidden,
-                inputs)
+                processed_memory, current_context_vec, attention_rnn_hidden, inputs)
 
             # Concat RNN output and attention context vector
             decoder_input = self.project_to_decoder_in(
@@ -33,17 +33,15 @@ class CBHGTests(unittest.TestCase):
 class DecoderTests(unittest.TestCase):
 
     def test_in_out(self):
-        layer = Decoder(in_features=128, memory_dim=32, r=5)
-        dummy_input = T.autograd.Variable(T.rand(4, 8, 128))
-        dummy_memory = T.autograd.Variable(T.rand(4, 120, 32))
+        layer = Decoder(in_features=256, memory_dim=80, r=2)
+        dummy_input = T.autograd.Variable(T.rand(4, 8, 256))
+        dummy_memory = T.autograd.Variable(T.rand(4, 2, 80))
 
-        print(layer)
         output, alignment = layer(dummy_input, dummy_memory)
-        print(output.shape)
 
         assert output.shape[0] == 4
-        assert output.shape[1] == 120 / 5
-        assert output.shape[2] == 32 * 5
+        assert output.shape[1] == 1, "size not {}".format(output.shape[1])
+        assert output.shape[2] == 80 * 2, "size not {}".format(output.shape[2])
 
 
 class EncoderTests(unittest.TestCase):
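The updated assertions follow from the reduction factor r: the decoder emits r frames per step, so a memory of T frames yields T / r decoder steps, each of width memory_dim * r. A small arithmetic sketch (the helper is hypothetical, merely restating the test's expectations):

def expected_decoder_shape(batch, t_memory, memory_dim, r):
    # each decoder step emits r frames, flattened along the feature axis
    return (batch, t_memory // r, memory_dim * r)

print(expected_decoder_shape(4, 120, 32, 5))  # (4, 24, 160), the old test
print(expected_decoder_shape(4, 2, 80, 2))    # (4, 1, 160), the new test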
@@ -72,8 +72,9 @@ class TestDataset(unittest.TestCase):
             c.power
         )
 
+        # Test for batch size 1
         dataloader = DataLoader(dataset, batch_size=1,
-                                shuffle=True, collate_fn=dataset.collate_fn,
+                                shuffle=False, collate_fn=dataset.collate_fn,
                                 drop_last=True, num_workers=c.num_loader_workers)
 
         for i, data in enumerate(dataloader):
@@ -93,11 +94,53 @@ class TestDataset(unittest.TestCase):
             assert linear_input[0, -1].sum() == 0
+            assert linear_input[0, -2].sum() != 0
             assert stop_target[0, -1] == 1
+            assert stop_target[0, -2] == 0
             assert stop_target.sum() == 1
             assert len(mel_lengths.shape) == 1
-            print(mel_lengths)
-            print(mel_input)
             assert mel_lengths[0] == mel_input[0].shape[0]
+
+        # Test for batch size 2
+        dataloader = DataLoader(dataset, batch_size=2,
+                                shuffle=False, collate_fn=dataset.collate_fn,
+                                drop_last=False, num_workers=c.num_loader_workers)
+
+        for i, data in enumerate(dataloader):
+            if i == self.max_loader_iter:
+                break
+            text_input = data[0]
+            text_lengths = data[1]
+            linear_input = data[2]
+            mel_input = data[3]
+            mel_lengths = data[4]
+            stop_target = data[5]
+            item_idx = data[6]
+
+            if mel_lengths[0] > mel_lengths[1]:
+                idx = 0
+            else:
+                idx = 1
+
+            # check the first item in the batch
+            assert mel_input[idx, -1].sum() == 0
+            assert mel_input[idx, -2].sum() != 0, mel_input
+            assert linear_input[idx, -1].sum() == 0
+            assert linear_input[idx, -2].sum() != 0
+            assert stop_target[idx, -1] == 1
+            assert stop_target[idx, -2] == 0
+            assert stop_target[idx].sum() == 1
+            assert len(mel_lengths.shape) == 1
+            assert mel_lengths[idx] == mel_input[idx].shape[0]
+
+            # check the second item in the batch
+            assert mel_input[1-idx, -1].sum() == 0
+            assert linear_input[1-idx, -1].sum() == 0
+            assert stop_target[1-idx, -1] == 1
+            assert len(mel_lengths.shape) == 1
+
+            # check batch conditions
+            assert (mel_input * stop_target.unsqueeze(2)).sum() == 0
+            assert (linear_input * stop_target.unsqueeze(2)).sum() == 0
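The last two assertions encode the batch invariant behind the whole zero-frame scheme: a frame flagged with stop == 1 must be an all-zero padding frame, so masking the features with the stop target sums to zero. A self-contained illustration with toy tensors:

import torch

# 2 items x 4 frames x 3 features; every item ends with zero-frames,
# which is what the collate function guarantees via the appended zero-frame
mel_input = torch.tensor([[[1., 1., 1.], [2., 2., 2.], [0., 0., 0.], [0., 0., 0.]],
                          [[1., 1., 1.], [2., 2., 2.], [3., 3., 3.], [0., 0., 0.]]])
stop_target = torch.tensor([[0., 0., 1., 1.],
                            [0., 0., 0., 1.]])

# stop frames are exactly the zero frames, so the masked sum vanishes
assert (mel_input * stop_target.unsqueeze(2)).sum() == 0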
train.py (10 changed lines)
@@ -98,16 +98,6 @@ def train(model, criterion, data_loader, optimizer, epoch):
         mel_lengths_var = Variable(mel_lengths)
         linear_spec_var = Variable(linear_input, volatile=True)
 
-        # sort sequence by length for curriculum learning
-        # TODO: might be unnecessary
-        sorted_lengths, indices = torch.sort(
-            text_lengths.view(-1), dim=0, descending=True)
-        sorted_lengths = sorted_lengths.long().numpy()
-        text_input_var = text_input_var[indices]
-        mel_spec_var = mel_spec_var[indices]
-        mel_lengths_var = mel_lengths_var[indices]
-        linear_spec_var = linear_spec_var[indices]
-
         # dispatch data to GPU
         if use_cuda:
             text_input_var = text_input_var.cuda()
@@ -1,7 +1,7 @@
 import numpy as np
 
 
-def pad_data(x, length):
+def _pad_data(x, length):
     _pad = 0
     assert x.ndim == 1
     return np.pad(x, (0, length - x.shape[0]),
@@ -11,30 +11,31 @@ def pad_data(x, length):
 
 def prepare_data(inputs):
     max_len = max((len(x) for x in inputs))
-    return np.stack([pad_data(x, max_len) for x in inputs])
+    return np.stack([_pad_data(x, max_len) for x in inputs])
 
 
-def pad_tensor(x, length):
+def _pad_tensor(x, length):
     _pad = 0
     assert x.ndim == 2
-    return np.pad(x, [[0, 0], [0, length- x.shape[1]]], mode='constant', constant_values=_pad)
+    return np.pad(x, [[0, 0], [0, length - x.shape[1]]], mode='constant', constant_values=_pad)
 
 
-def prepare_tensor(inputs):
-    max_len = max((x.shape[1] for x in inputs))
-    return np.stack([pad_tensor(x, max_len) for x in inputs])
+def prepare_tensor(inputs, out_steps):
+    max_len = max((x.shape[1] for x in inputs)) + 1  # zero-frame
+    remainder = max_len % out_steps
+    return np.stack([_pad_tensor(x, max_len + remainder) for x in inputs])
 
 
-def pad_stop_target(x, length):
+def _pad_stop_target(x, length):
     _pad = 1.
     assert x.ndim == 1
     return np.pad(x, (0, length - x.shape[0]), mode='constant', constant_values=_pad)
 
 
 def prepare_stop_target(inputs, out_steps):
-    max_len = max((x.shape[0] for x in inputs))
+    max_len = max((x.shape[0] for x in inputs)) + 1  # zero-frame
     remainder = max_len % out_steps
-    return np.stack([pad_stop_target(x, max_len + out_steps - remainder) for x in inputs])
+    return np.stack([_pad_stop_target(x, max_len + remainder) for x in inputs])
 
 
 def pad_per_step(inputs, pad_len):
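A quick check of the shared length arithmetic introduced above: both prepare_tensor and prepare_stop_target reserve one slot for the zero-frame and then add max_len % out_steps extra pad frames. With out_steps = 2 the padded length always comes out even (values below are illustrative):

out_steps = 2
for longest in (9, 13, 6):
    max_len = longest + 1  # room for the zero-frame
    remainder = max_len % out_steps
    print(longest, '->', max_len + remainder)
# 9  -> 10
# 13 -> 14
# 6  -> 8   (7 is odd, so one extra pad frame)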