Conflicts:
	README.md
	best_model_config.json
	datasets/LJSpeech.py
	layers/tacotron.py
	notebooks/TacotronPlayGround.ipynb
	notebooks/utils.py
	tests/layers_tests.py
	tests/loader_tests.py
	tests/tacotron_tests.py
	train.py
	utils/generic_utils.py
Eren Golge 2018-05-25 05:14:04 -07:00
commit 24644b20d4
14 changed files with 282 additions and 100 deletions

best_model_config.json

@@ -1,5 +1,5 @@
 {
-    "model_name": "best_model",
+    "model_name": "best-model",
     "num_mels": 80,
     "num_freq": 1025,
     "sample_rate": 20000,
@@ -11,13 +11,13 @@
     "embedding_size": 256,
     "text_cleaner": "english_cleaners",
-    "epochs": 200,
+    "epochs": 1000,
     "lr": 0.002,
     "warmup_steps": 4000,
     "batch_size": 32,
     "eval_batch_size":32,
     "r": 5,
     "griffin_lim_iters": 60,
     "power": 1.5,

datasets/TWEB.py (new file, +133 lines)

@@ -0,0 +1,133 @@
import os
import numpy as np
import collections
import librosa
import torch
from torch.utils.data import Dataset
from TTS.utils.text import text_to_sequence
from TTS.utils.audio import AudioProcessor
from TTS.utils.data import (prepare_data, pad_per_step,
                            prepare_tensor, prepare_stop_target)


class TWEBDataset(Dataset):

    def __init__(self, csv_file, root_dir, outputs_per_step, sample_rate,
                 text_cleaner, num_mels, min_level_db, frame_shift_ms,
                 frame_length_ms, preemphasis, ref_level_db, num_freq, power,
                 min_seq_len=0):
        with open(csv_file, "r") as f:
            self.frames = [line.split('\t') for line in f]
        self.root_dir = root_dir
        self.outputs_per_step = outputs_per_step
        self.sample_rate = sample_rate
        self.cleaners = text_cleaner
        self.min_seq_len = min_seq_len
        self.ap = AudioProcessor(sample_rate, num_mels, min_level_db, frame_shift_ms,
                                 frame_length_ms, preemphasis, ref_level_db, num_freq, power)
        print(" > Reading TWEB from - {}".format(root_dir))
        print(" | > Number of instances : {}".format(len(self.frames)))
        self._sort_frames()

    def load_wav(self, filename):
        try:
            audio = librosa.core.load(filename, sr=self.sample_rate)
            return audio
        except RuntimeError as e:
            print(" !! Cannot read file : {}".format(filename))

    def _sort_frames(self):
        r"""Sort sequences in ascending order"""
        lengths = np.array([len(ins[1]) for ins in self.frames])

        print(" | > Max length sequence {}".format(np.max(lengths)))
        print(" | > Min length sequence {}".format(np.min(lengths)))
        print(" | > Avg length sequence {}".format(np.mean(lengths)))

        idxs = np.argsort(lengths)
        new_frames = []
        ignored = []
        for i, idx in enumerate(idxs):
            length = lengths[idx]
            if length < self.min_seq_len:
                ignored.append(idx)
            else:
                new_frames.append(self.frames[idx])
        print(" | > {} instances are ignored by min_seq_len ({})".format(
            len(ignored), self.min_seq_len))
        self.frames = new_frames

    def __len__(self):
        return len(self.frames)

    def __getitem__(self, idx):
        wav_name = os.path.join(self.root_dir,
                                self.frames[idx][0]) + '.wav'
        text = self.frames[idx][1]
        text = np.asarray(text_to_sequence(
            text, [self.cleaners]), dtype=np.int32)
        wav = np.asarray(self.load_wav(wav_name)[0], dtype=np.float32)
        sample = {'text': text, 'wav': wav, 'item_idx': self.frames[idx][0]}
        return sample

    def collate_fn(self, batch):
        r"""
        Perform preprocessing and create a final data batch:
        1. PAD sequences with the longest sequence in the batch
        2. Convert Audio signal to Spectrograms.
        3. PAD sequences that can be divided by r.
        4. Convert Numpy to Torch tensors.
        """

        # Puts each data field into a tensor with outer dimension batch size
        if isinstance(batch[0], collections.Mapping):
            keys = list()

            wav = [d['wav'] for d in batch]
            item_idxs = [d['item_idx'] for d in batch]
            text = [d['text'] for d in batch]

            text_lenghts = np.array([len(x) for x in text])
            max_text_len = np.max(text_lenghts)

            linear = [self.ap.spectrogram(w).astype('float32') for w in wav]
            mel = [self.ap.melspectrogram(w).astype('float32') for w in wav]
            mel_lengths = [m.shape[1] + 1 for m in mel]  # +1 for zero-frame

            # compute 'stop token' targets
            stop_targets = [np.array([0.] * (mel_len - 1))
                            for mel_len in mel_lengths]

            # PAD stop targets
            stop_targets = prepare_stop_target(
                stop_targets, self.outputs_per_step)

            # PAD sequences with largest length of the batch
            text = prepare_data(text).astype(np.int32)
            wav = prepare_data(wav)

            # PAD features with largest length + a zero frame
            linear = prepare_tensor(linear, self.outputs_per_step)
            mel = prepare_tensor(mel, self.outputs_per_step)
            assert mel.shape[2] == linear.shape[2]
            timesteps = mel.shape[2]

            # B x T x D
            linear = linear.transpose(0, 2, 1)
            mel = mel.transpose(0, 2, 1)

            # convert things to pytorch
            text_lenghts = torch.LongTensor(text_lenghts)
            text = torch.LongTensor(text)
            linear = torch.FloatTensor(linear)
            mel = torch.FloatTensor(mel)
            mel_lengths = torch.LongTensor(mel_lengths)
            stop_targets = torch.FloatTensor(stop_targets)

            return text, text_lenghts, linear, mel, mel_lengths, stop_targets, item_idxs[0]

        raise TypeError(("batch must contain tensors, numbers, dicts or lists;\
                         found {}"
                         .format(type(batch[0]))))


@@ -1,6 +1,5 @@
 # coding: utf-8
 import torch
-from torch.autograd import Variable
 from torch import nn

layers/losses.py

@@ -48,6 +48,7 @@ class L1LossMasked(nn.Module):
                                   reduce=False)
         # losses: (batch, max_len, dim)
         losses = losses_flat.view(*target.size())
+        # mask: (batch, max_len, 1)
         mask = _sequence_mask(sequence_length=length,
                               max_len=target.size(1)).unsqueeze(2)
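The added comment documents the mask shape. For orientation, here is a hedged sketch of how such a mask is typically applied in the rest of forward() (the remainder of the method is not shown in this hunk, so the exact statements are assumptions); it zeroes per-element losses beyond each sequence's true length and averages only over valid positions, consistent with the test further below that expects exactly 1.0 for all-ones inputs against all-zeros targets:

# Sketch only; assumes losses: (B, T, D), mask: (B, T, 1), length: (B,)
losses = losses * mask.float()
# normalize by (number of valid frames) x (feature dim), so a constant
# absolute error of 1 yields a loss of exactly 1.0
loss = losses.sum() / (length.float().sum() * target.size(2))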

layers/tacotron.py

@@ -333,5 +333,4 @@ class StopNet(nn.Module):
         outputs = self.relu(rnn_hidden)
         outputs = self.linear(outputs)
         outputs = self.sigmoid(outputs)
-        return outputs, rnn_hidden
+        return outputs, rnn_hidden

tests/__init__.py (new, empty file)

tests/layers_tests.py

@@ -37,11 +37,14 @@ class DecoderTests(unittest.TestCase):
         dummy_input = T.rand(4, 8, 256)
         dummy_memory = T.rand(4, 2, 80)

-        output, alignment = layer(dummy_input, dummy_memory)
+        output, alignment, stop_tokens = layer(dummy_input, dummy_memory)

         assert output.shape[0] == 4
         assert output.shape[1] == 1, "size not {}".format(output.shape[1])
         assert output.shape[2] == 80 * 2, "size not {}".format(output.shape[2])
+        assert stop_tokens.shape[0] == 4
+        assert stop_tokens.max() <= 1.0
+        assert stop_tokens.min() >= 0


 class EncoderTests(unittest.TestCase):

@@ -73,7 +76,6 @@ class L1LossMaskedTests(unittest.TestCase):
         dummy_length = (T.ones(4) * 8).long()
         output = layer(dummy_input, dummy_target, dummy_length)
         assert output.item() == 1.0, "1.0 vs {}".format(output.data[0])

         dummy_input = T.ones(4, 8, 128).float()
         dummy_target = T.zeros(4, 8, 128).float()
         dummy_length = (T.arange(5, 9)).long()

tests/loader_tests.py

@@ -5,8 +5,6 @@ import numpy as np
 from torch.utils.data import DataLoader
 from TTS.utils.generic_utils import load_config
 from TTS.datasets.LJSpeech import LJSpeechDataset
+# from TTS.datasets.TWEB import TWEBDataset

 file_path = os.path.dirname(os.path.realpath(__file__))
 c = load_config(os.path.join(file_path, 'test_config.json'))
@@ -19,8 +17,8 @@ class TestLJSpeechDataset(unittest.TestCase):
         self.max_loader_iter = 4

     def test_loader(self):
-        dataset = LJSpeechDataset(os.path.join(c.data_path, 'metadata.csv'),
-                                  os.path.join(c.data_path, 'wavs'),
+        dataset = LJSpeechDataset(os.path.join(c.data_path_LJSpeech, 'metadata.csv'),
+                                  os.path.join(c.data_path_LJSpeech, 'wavs'),
                                   c.r,
                                   c.sample_rate,
                                   c.text_cleaner,
@@ -59,8 +57,8 @@ class TestLJSpeechDataset(unittest.TestCase):
         assert mel_input.shape[2] == c.num_mels

     def test_padding(self):
-        dataset = LJSpeechDataset(os.path.join(c.data_path, 'metadata.csv'),
-                                  os.path.join(c.data_path, 'wavs'),
+        dataset = LJSpeechDataset(os.path.join(c.data_path_LJSpeech, 'metadata.csv'),
+                                  os.path.join(c.data_path_LJSpeech, 'wavs'),
                                   1,
                                   c.sample_rate,
                                   c.text_cleaner,
@@ -274,4 +272,4 @@ class TestLJSpeechDataset(unittest.TestCase):
         # # check batch conditions
         # assert (mel_input * stop_target.unsqueeze(2)).sum() == 0
-        # assert (linear_input * stop_target.unsqueeze(2)).sum() == 0
+        # assert (linear_input * stop_target.unsqueeze(2)).sum() == 0

tests/tacotron_tests.py

@@ -26,8 +26,13 @@ class TacotronTrainTest(unittest.TestCase):
         linear_spec = torch.rand(8, 30, c.num_freq).to(device)
         mel_lengths = torch.randint(20, 30, (8,)).long().to(device)
         stop_targets = torch.zeros(8, 30, 1).float().to(device)

+        for idx in mel_lengths:
+            stop_targets[:, int(idx.item()):, 0] = 1.0
+
+        stop_targets = stop_targets.view(input.shape[0], stop_targets.size(1) // c.r, -1)
+        stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float()

         criterion = L1LossMasked().to(device)
+        criterion_st = nn.BCELoss().to(device)
         model = Tacotron(c.embedding_size,
@@ -42,21 +47,20 @@
             count += 1
         optimizer = optim.Adam(model.parameters(), lr=c.lr)
         for i in range(5):
-            mel_out, linear_out, align = model.forward(input, mel_spec)
-            # mel_out, linear_out, align, stop_tokens = model.forward(input, mel_spec)
-            # assert stop_tokens.data.max() <= 1.0
-            # assert stop_tokens.data.min() >= 0.0
+            mel_out, linear_out, align, stop_tokens = model.forward(input, mel_spec)
+            assert stop_tokens.data.max() <= 1.0
+            assert stop_tokens.data.min() >= 0.0
             optimizer.zero_grad()
             loss = criterion(mel_out, mel_spec, mel_lengths)
-            # stop_loss = criterion_st(stop_tokens, stop_targets)
-            loss = loss + criterion(linear_out, linear_spec, mel_lengths)
+            stop_loss = criterion_st(stop_tokens, stop_targets)
+            loss = loss + criterion(linear_out, linear_spec, mel_lengths) + stop_loss
             loss.backward()
             optimizer.step()
         # check parameter changes
         count = 0
         for param, param_ref in zip(model.parameters(), model_ref.parameters()):
             # ignore pre-higway layer since it works conditional
-            if count not in [139, 59]:
+            if count not in [145, 59]:
                 assert (param != param_ref).any(), "param {} with shape {} not updated!! \n{}\n{}".format(count, param.shape, param, param_ref)
             count += 1
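The new stop-target setup above mirrors what the data loader produces: per-frame stop flags are grouped into chunks of r frames (the decoder emits r frames per step), and a chunk counts as a stop step if any of its frames is flagged. A small worked example with illustrative values:

# Illustrative only: collapsing frame-level stop flags to decoder steps.
import torch

r = 5
stop_targets = torch.zeros(8, 30, 1)   # B x T x 1
stop_targets[0, 24:, 0] = 1.0          # say item 0 ends at frame 24

chunks = stop_targets.view(8, 30 // r, -1)          # B x (T//r) x r
steps = (chunks.sum(2) > 0.0).unsqueeze(2).float()  # B x (T//r) x 1
print(steps[0].squeeze())   # tensor([0., 0., 0., 0., 1., 1.])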

tests/test_config.json

@@ -1,30 +1,34 @@
 {
     "num_mels": 80,
     "num_freq": 1025,
-    "sample_rate": 20000,
+    "sample_rate": 22050,
     "frame_length_ms": 50,
     "frame_shift_ms": 12.5,
     "preemphasis": 0.97,
     "min_level_db": -100,
     "ref_level_db": 20,
     "hidden_size": 128,
     "embedding_size": 256,
     "text_cleaner": "english_cleaners",

     "epochs": 2000,
     "lr": 0.003,
     "lr_patience": 5,
     "lr_decay": 0.5,
     "batch_size": 2,
     "r": 5,
+    "mk": 1.0,
+    "priority_freq": false,

     "griffin_lim_iters": 60,
     "power": 1.5,

     "num_loader_workers": 4,

     "save_step": 200,
-    "data_path": "/data/shared/KeithIto/LJSpeech-1.0",
+    "data_path_LJSpeech": "/data/shared/KeithIto/LJSpeech-1.0",
+    "data_path_TWEB": "/data/shared/BibleSpeech",
     "output_path": "result",
     "log_dir": "/home/erogol/projects/TTS/logs/"
 }

train.py

@@ -13,18 +13,16 @@ import numpy as np
 import torch.nn as nn
 from torch import optim
-from torch import onnx
 from torch.utils.data import DataLoader
 from torch.optim.lr_scheduler import ReduceLROnPlateau
 from tensorboardX import SummaryWriter

 from utils.generic_utils import (Progbar, remove_experiment_folder,
                                  create_experiment_folder, save_checkpoint,
                                  save_best_model, load_config, lr_decay,
-                                 count_parameters, check_update, get_commit_hash)
+                                 count_parameters, check_update, get_commit_hash,
+                                 create_attn_mask, mk_decay)
 from utils.model import get_param_size
 from utils.visual import plot_alignment, plot_spectrogram
-from datasets.LJSpeech import LJSpeechDataset
 from models.tacotron import Tacotron
 from layers.losses import L1LossMasked
@@ -67,15 +65,15 @@ def train(model, criterion, criterion_st, data_loader, optimizer, optimizer_st,
     avg_stop_loss = 0
     print(" | > Epoch {}/{}".format(epoch, c.epochs))
     progbar = Progbar(len(data_loader.dataset) / c.batch_size)
-    n_priority_freq = int(3000 / (c.sample_rate * 0.5) * c.num_freq)
+    progbar_display = {}
     for num_iter, data in enumerate(data_loader):
         start_time = time.time()

         # setup input data
         text_input = data[0]
         text_lengths = data[1]
-        linear_input = data[2]
-        mel_input = data[3]
+        linear_spec = data[2]
+        mel_spec = data[3]
         mel_lengths = data[4]
         stop_targets = data[5]
@@ -140,6 +138,12 @@
         step_time = time.time() - start_time
         epoch_time += step_time

+        progbar_display['total_loss'] = loss.item()
+        progbar_display['linear_loss'] = linear_loss.item()
+        progbar_display['mel_loss'] = mel_loss.item()
+        progbar_display['stop_loss'] = stop_loss.item()
+        progbar_display['grad_norm'] = grad_norm.item()

         # update
         progbar.update(num_iter+1, values=[('total_loss', loss.item()),
@@ -204,6 +208,7 @@
     # Plot Training Epoch Stats
     tb.add_scalar('TrainEpochLoss/TotalLoss', avg_total_loss, current_step)
     tb.add_scalar('TrainEpochLoss/LinearLoss', avg_linear_loss, current_step)
+    tb.add_scalar('TrainEpochLoss/StopLoss', avg_stop_loss, current_step)
     tb.add_scalar('TrainEpochLoss/MelLoss', avg_mel_loss, current_step)
     tb.add_scalar('TrainEpochLoss/StopLoss', avg_stop_loss, current_step)
     tb.add_scalar('Time/EpochTime', epoch_time, epoch)
@@ -272,7 +277,7 @@ def evaluate(model, criterion, criterion_st, data_loader, current_step):
             avg_stop_loss += stop_loss.item()

         # Diagnostic visualizations
-        idx = np.random.randint(mel_input.shape[0])
+        idx = np.random.randint(mel_spec.shape[0])
         const_spec = linear_output[idx].data.cpu().numpy()
         gt_spec = linear_input[idx].data.cpu().numpy()
         align_img = alignments[idx].data.cpu().numpy()
@@ -309,48 +314,50 @@ def evaluate(model, criterion, criterion_st, data_loader, current_step):
     tb.add_scalar('ValEpochLoss/LinearLoss', avg_linear_loss, current_step)
     tb.add_scalar('ValEpochLoss/MelLoss', avg_mel_loss, current_step)
     tb.add_scalar('ValEpochLoss/Stop_loss', avg_stop_loss, current_step)
     return avg_linear_loss


 def main(args):
+    print(" > Using dataset: {}".format(c.dataset))
+    mod = importlib.import_module('datasets.{}'.format(c.dataset))
+    Dataset = getattr(mod, c.dataset+"Dataset")

     # Setup the dataset
-    train_dataset = LJSpeechDataset(os.path.join(c.data_path, 'metadata_train.csv'),
-                                    os.path.join(c.data_path, 'wavs'),
-                                    c.r,
-                                    c.sample_rate,
-                                    c.text_cleaner,
-                                    c.num_mels,
-                                    c.min_level_db,
-                                    c.frame_shift_ms,
-                                    c.frame_length_ms,
-                                    c.preemphasis,
-                                    c.ref_level_db,
-                                    c.num_freq,
-                                    c.power,
-                                    min_seq_len=c.min_seq_len
-                                    )
+    train_dataset = Dataset(os.path.join(c.data_path, c.meta_file_train),
+                            os.path.join(c.data_path, 'wavs'),
+                            c.r,
+                            c.sample_rate,
+                            c.text_cleaner,
+                            c.num_mels,
+                            c.min_level_db,
+                            c.frame_shift_ms,
+                            c.frame_length_ms,
+                            c.preemphasis,
+                            c.ref_level_db,
+                            c.num_freq,
+                            c.power,
+                            min_seq_len=c.min_seq_len
+                            )

     train_loader = DataLoader(train_dataset, batch_size=c.batch_size,
                               shuffle=False, collate_fn=train_dataset.collate_fn,
-                              drop_last=False, num_workers=c.num_loader_workers,
+                              drop_last=True, num_workers=c.num_loader_workers,
                               pin_memory=True)

-    val_dataset = LJSpeechDataset(os.path.join(c.data_path, 'metadata_val.csv'),
-                                  os.path.join(c.data_path, 'wavs'),
-                                  c.r,
-                                  c.sample_rate,
-                                  c.text_cleaner,
-                                  c.num_mels,
-                                  c.min_level_db,
-                                  c.frame_shift_ms,
-                                  c.frame_length_ms,
-                                  c.preemphasis,
-                                  c.ref_level_db,
-                                  c.num_freq,
-                                  c.power
-                                  )
+    val_dataset = Dataset(os.path.join(c.data_path, c.meta_file_val),
+                          os.path.join(c.data_path, 'wavs'),
+                          c.r,
+                          c.sample_rate,
+                          c.text_cleaner,
+                          c.num_mels,
+                          c.min_level_db,
+                          c.frame_shift_ms,
+                          c.frame_length_ms,
+                          c.preemphasis,
+                          c.ref_level_db,
+                          c.num_freq,
+                          c.power
+                          )

     val_loader = DataLoader(val_dataset, batch_size=c.eval_batch_size,
                             shuffle=False, collate_fn=val_dataset.collate_fn,

@@ -385,7 +392,8 @@ def main(args):
         optimizer_st = optim.Adam(model.decoder.stopnet.parameters(), lr=c.lr)
     else:
         args.restore_step = 0
-        print("\n > Starting a new training")
+        optimizer = optim.Adam(model.parameters(), lr=c.lr)
+        print(" > Starting a new training")

     if use_cuda:
         model = nn.DataParallel(model.cuda())
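The importlib lines at the top of main() turn the dataset into a config-selected plug-in: c.dataset names a module under datasets/ and the class is looked up as <name>Dataset. A minimal sketch of the mechanism (the config stub here is illustrative, standing in for the loaded config object):

# Sketch of the dynamic dataset lookup; the config stub is illustrative.
import importlib

class cfg:
    dataset = "TWEB"   # or "LJSpeech"

mod = importlib.import_module('datasets.{}'.format(cfg.dataset))  # datasets/TWEB.py
Dataset = getattr(mod, cfg.dataset + "Dataset")                   # -> TWEBDataset
# Both dataset classes share the same constructor signature, so main()
# can build either one without branching on the corpus.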

utils/audio.py

@@ -1,6 +1,7 @@
 import os
 import librosa
 import pickle
+import copy
 import numpy as np
 from scipy import signal
@@ -38,15 +39,15 @@ class AudioProcessor(object):
         return librosa.filters.mel(self.sample_rate, n_fft, n_mels=self.num_mels)

     def _normalize(self, S):
-        return np.clip((S - self.min_level_db) / -self.min_level_db, 0, 1)
+        return np.clip((S - self.min_level_db) / -self.min_level_db, 1e-8, 1)

     def _denormalize(self, S):
         return (np.clip(S, 0, 1) * -self.min_level_db) + self.min_level_db

     def _stft_parameters(self, ):
         n_fft = (self.num_freq - 1) * 2
-        hop_length = int(self.frame_shift_ms / 1000 * self.sample_rate)
-        win_length = int(self.frame_length_ms / 1000 * self.sample_rate)
+        hop_length = int(self.frame_shift_ms / 1000.0 * self.sample_rate)
+        win_length = int(self.frame_length_ms / 1000.0 * self.sample_rate)
         return n_fft, hop_length, win_length

     def _amp_to_db(self, x):
@@ -73,16 +74,29 @@
         # Reconstruct phase
         return self.apply_inv_preemphasis(self._griffin_lim(S ** self.power))

+    # def _griffin_lim(self, S):
+    #     '''librosa implementation of Griffin-Lim
+    #     Based on https://github.com/librosa/librosa/issues/434
+    #     '''
+    #     angles = np.exp(2j * np.pi * np.random.rand(*S.shape))
+    #     S_complex = np.abs(S).astype(np.complex)
+    #     y = self._istft(S_complex * angles)
+    #     for i in range(self.griffin_lim_iters):
+    #         angles = np.exp(1j * np.angle(self._stft(y)))
+    #         y = self._istft(S_complex * angles)
+    #     return y
+
     def _griffin_lim(self, S):
-        '''librosa implementation of Griffin-Lim
-        Based on https://github.com/librosa/librosa/issues/434
+        '''Applies Griffin-Lim's raw.
         '''
-        angles = np.exp(2j * np.pi * np.random.rand(*S.shape))
-        S_complex = np.abs(S).astype(np.complex)
-        y = self._istft(S_complex * angles)
+        S_best = copy.deepcopy(S)
         for i in range(self.griffin_lim_iters):
-            angles = np.exp(1j * np.angle(self._stft(y)))
-            y = self._istft(S_complex * angles)
+            S_t = self._istft(S_best)
+            est = self._stft(S_t)
+            phase = est / np.maximum(1e-8, np.abs(est))
+            S_best = S * phase
+        S_t = self._istft(S_best)
+        y = np.real(S_t)
         return y

     def melspectrogram(self, y):
@@ -96,7 +110,7 @@
     def _istft(self, y):
         _, hop_length, win_length = self._stft_parameters()
-        return librosa.istft(y, hop_length=hop_length, win_length=win_length)
+        return librosa.istft(y, hop_length=hop_length, win_length=win_length, window='hann')

     def find_endpoint(self, wav, threshold_db=-40, min_silence_sec=0.8):
         window_length = int(self.sample_rate * min_silence_sec)
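Two notes on the audio changes. With the config values above (num_freq 1025, sample_rate 22050), _stft_parameters() gives n_fft = (1025 - 1) * 2 = 2048, hop_length = int(12.5 / 1000.0 * 22050) = 275, and win_length = int(50 / 1000.0 * 22050) = 1102; the 1000.0 change only guards against integer division under Python 2. The rewritten _griffin_lim keeps the target magnitude S fixed and, on each iteration, re-attaches the phase obtained from an ISTFT-then-STFT round trip of the current estimate. A hedged usage sketch (the constructor call follows the positional order used in datasets/TWEB.py; the input waveform is a stand-in):

# Sketch; AudioProcessor arguments follow the order used in datasets/TWEB.py.
import numpy as np
from TTS.utils.audio import AudioProcessor

ap = AudioProcessor(22050, 80, -100, 12.5, 50, 0.97, 20, 1025, 1.5)

wav = np.random.randn(22050).astype(np.float32)  # 1 s of noise as stand-in audio
S = np.abs(ap._stft(wav))                        # magnitude spectrogram, freq x time
y = ap._griffin_lim(S ** 1.5)                    # phase recovered over griffin_lim_iters loops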

utils/generic_utils.py

@@ -9,6 +9,7 @@ import torch
 import subprocess
 import numpy as np
 from collections import OrderedDict
+from torch.autograd import Variable


 class AttrDict(dict):

@@ -134,6 +135,25 @@ def lr_decay(init_lr, global_step, warmup_steps):
     return lr
+def create_attn_mask(N, T, g=0.05):
+    r'''creating attn mask for guided attention
+    TODO: vectorize'''
+    M = np.zeros([N, T])
+    for t in range(T):
+        for n in range(N):
+            val = 20 * np.exp(-pow((n/N)-(t/T), 2.0)/g)
+            M[n, t] = val
+    e_x = np.exp(M - np.max(M))
+    M = e_x / e_x.sum(axis=0)  # only difference
+    M = torch.FloatTensor(M).t().cuda()
+    M = torch.stack([M]*32)
+    return M
+
+
+def mk_decay(init_mk, max_epoch, n_epoch):
+    return init_mk * ((max_epoch - n_epoch) / max_epoch)


 def count_parameters(model):
     r"""Count number of trainable parameters in a network"""
     return sum(p.numel() for p in model.parameters() if p.requires_grad)
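create_attn_mask builds a soft diagonal prior over (text, time) positions for guided attention, with sharpness g, a softmax over the text axis, and a hard-coded batch replication of 32; mk (1.0 in the config) weights that guidance, and mk_decay anneals it linearly to zero, e.g. init_mk=1.0 at epoch 500 of 2000 gives 1.0 * (2000 - 500) / 2000 = 0.75. The TODO about vectorizing the double loop could be addressed with broadcasting; a hedged sketch, not part of this commit:

# Hypothetical vectorized equivalent of create_attn_mask's inner loops.
import numpy as np

def attn_prior(N, T, g=0.05):
    n = np.arange(N)[:, None] / N         # N x 1
    t = np.arange(T)[None, :] / T         # 1 x T
    M = 20 * np.exp(-((n - t) ** 2) / g)  # N x T, peaked along the diagonal
    e_x = np.exp(M - np.max(M))
    return e_x / e_x.sum(axis=0)          # softmax over the text axis, as above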

utils/visual.py

@@ -32,4 +32,4 @@ def plot_spectrogram(linear_output, audio):
     data = np.fromstring(fig.canvas.tostring_rgb(), dtype=np.uint8, sep='')
     data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,))
     plt.close()
-    return data
+    return data