mirror of https://github.com/coqui-ai/TTS.git
split train and validation steps
This commit is contained in:
parent
793563b586
commit
021ac3978d
|
@ -20,11 +20,10 @@
|
||||||
"griffin_lim_iters": 60,
|
"griffin_lim_iters": 60,
|
||||||
"power": 1.5,
|
"power": 1.5,
|
||||||
|
|
||||||
"num_loader_workers": 32,
|
"num_loader_workers": 16,
|
||||||
|
|
||||||
"checkpoint": false,
|
"checkpoint": false,
|
||||||
"save_step": 69,
|
"save_step": 69,
|
||||||
"data_path": "/data/shared/KeithIto/LJSpeech-1.0",
|
"data_path": "/run/shm/erogol/LJSpeech-1.0",
|
||||||
"output_path": "result",
|
"output_path": "result",
|
||||||
"log_dir": "/home/erogol/projects/TTS/logs/"
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -16,16 +16,15 @@ class LJSpeechDataset(Dataset):
|
||||||
text_cleaner, num_mels, min_level_db, frame_shift_ms,
|
text_cleaner, num_mels, min_level_db, frame_shift_ms,
|
||||||
frame_length_ms, preemphasis, ref_level_db, num_freq, power):
|
frame_length_ms, preemphasis, ref_level_db, num_freq, power):
|
||||||
|
|
||||||
f = open(csv_file, "r")
|
with open(csv_file, "r") as f:
|
||||||
self.frames = [line.split('|') for line in f]
|
self.frames = [line.split('|') for line in f]
|
||||||
f.close()
|
self.frames = self.frames[:256]
|
||||||
self.root_dir = root_dir
|
self.root_dir = root_dir
|
||||||
self.outputs_per_step = outputs_per_step
|
self.outputs_per_step = outputs_per_step
|
||||||
self.sample_rate = sample_rate
|
self.sample_rate = sample_rate
|
||||||
self.cleaners = text_cleaner
|
self.cleaners = text_cleaner
|
||||||
self.ap = AudioProcessor(sample_rate, num_mels, min_level_db, frame_shift_ms,
|
self.ap = AudioProcessor(sample_rate, num_mels, min_level_db, frame_shift_ms,
|
||||||
frame_length_ms, preemphasis, ref_level_db, num_freq, power
|
frame_length_ms, preemphasis, ref_level_db, num_freq, power)
|
||||||
)
|
|
||||||
print(" > Reading LJSpeech from - {}".format(root_dir))
|
print(" > Reading LJSpeech from - {}".format(root_dir))
|
||||||
print(" | > Number of instances : {}".format(len(self.frames)))
|
print(" | > Number of instances : {}".format(len(self.frames)))
|
||||||
|
|
||||||
|
@ -41,11 +40,11 @@ class LJSpeechDataset(Dataset):
|
||||||
|
|
||||||
def __getitem__(self, idx):
|
def __getitem__(self, idx):
|
||||||
wav_name = os.path.join(self.root_dir,
|
wav_name = os.path.join(self.root_dir,
|
||||||
self.frames.ix[idx, 0]) + '.wav'
|
self.frames[idx][0]) + '.wav'
|
||||||
text = self.frames[idx][1]
|
text = self.frames[idx][1]
|
||||||
text = np.asarray(text_to_sequence(text, [self.cleaners]), dtype=np.int32)
|
text = np.asarray(text_to_sequence(text, [self.cleaners]), dtype=np.int32)
|
||||||
wav = np.asarray(self.load_wav(wav_name)[0], dtype=np.float32)
|
wav = np.asarray(self.load_wav(wav_name)[0], dtype=np.float32)
|
||||||
sample = {'text': text, 'wav': wav, 'item_idx': self.frames.ix[idx, 0]}
|
sample = {'text': text, 'wav': wav, 'item_idx': self.frames[idx][0]}
|
||||||
return sample
|
return sample
|
||||||
|
|
||||||
def get_dummy_data(self):
|
def get_dummy_data(self):
|
||||||
|
|
479
train.py
479
train.py
|
@ -27,36 +27,265 @@ from utils.visual import plot_alignment, plot_spectrogram
|
||||||
from datasets.LJSpeech import LJSpeechDataset
|
from datasets.LJSpeech import LJSpeechDataset
|
||||||
from models.tacotron import Tacotron
|
from models.tacotron import Tacotron
|
||||||
|
|
||||||
|
|
||||||
use_cuda = torch.cuda.is_available()
|
use_cuda = torch.cuda.is_available()
|
||||||
|
|
||||||
|
parser = argparse.ArgumentParser()
|
||||||
|
parser.add_argument('--restore_step', type=int,
|
||||||
|
help='Global step to restore checkpoint', default=0)
|
||||||
|
parser.add_argument('--restore_path', type=str,
|
||||||
|
help='Folder path to checkpoints', default=0)
|
||||||
|
parser.add_argument('--config_path', type=str,
|
||||||
|
help='path to config file for training',)
|
||||||
|
args = parser.parse_args()
|
||||||
|
|
||||||
|
# setup output paths and read configs
|
||||||
|
c = load_config(args.config_path)
|
||||||
|
_ = os.path.dirname(os.path.realpath(__file__))
|
||||||
|
OUT_PATH = os.path.join(_, c.output_path)
|
||||||
|
OUT_PATH = create_experiment_folder(OUT_PATH)
|
||||||
|
CHECKPOINT_PATH = os.path.join(OUT_PATH, 'checkpoints')
|
||||||
|
shutil.copyfile(args.config_path, os.path.join(OUT_PATH, 'config.json'))
|
||||||
|
|
||||||
|
# save config to tmp place to be loaded by subsequent modules.
|
||||||
|
file_name = str(os.getpid())
|
||||||
|
tmp_path = os.path.join("/tmp/", file_name+'_tts')
|
||||||
|
pickle.dump(c, open(tmp_path, "wb"))
|
||||||
|
|
||||||
|
# setup tensorboard
|
||||||
|
LOG_DIR = OUT_PATH
|
||||||
|
tb = SummaryWriter(LOG_DIR)
|
||||||
|
|
||||||
|
|
||||||
|
def signal_handler(signal, frame):
|
||||||
|
"""Ctrl+C handler to remove empty experiment folder"""
|
||||||
|
print(" !! Pressed Ctrl+C !!")
|
||||||
|
remove_experiment_folder(OUT_PATH)
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
|
|
||||||
|
def train(model, criterion, data_loader, optimizer, epoch):
|
||||||
|
model = model.train()
|
||||||
|
epoch_time = 0
|
||||||
|
|
||||||
|
print(" | > Epoch {}/{}".format(epoch, c.epochs))
|
||||||
|
progbar = Progbar(len(data_loader.dataset) / c.batch_size)
|
||||||
|
n_priority_freq = int(3000 / (c.sample_rate * 0.5) * c.num_freq)
|
||||||
|
for num_iter, data in enumerate(data_loader):
|
||||||
|
start_time = time.time()
|
||||||
|
|
||||||
|
# setup input data
|
||||||
|
text_input = data[0]
|
||||||
|
text_lengths = data[1]
|
||||||
|
linear_input = data[2]
|
||||||
|
mel_input = data[3]
|
||||||
|
|
||||||
|
current_step = num_iter + args.restore_step + epoch * len(data_loader) + 1
|
||||||
|
|
||||||
|
# setup lr
|
||||||
|
current_lr = lr_decay(c.lr, current_step, c.warmup_steps)
|
||||||
|
for params_group in optimizer.param_groups:
|
||||||
|
params_group['lr'] = current_lr
|
||||||
|
|
||||||
|
optimizer.zero_grad()
|
||||||
|
|
||||||
|
# convert inputs to variables
|
||||||
|
text_input_var = Variable(text_input)
|
||||||
|
mel_spec_var = Variable(mel_input)
|
||||||
|
linear_spec_var = Variable(linear_input, volatile=True)
|
||||||
|
|
||||||
|
# sort sequence by length for curriculum learning
|
||||||
|
# TODO: might be unnecessary
|
||||||
|
sorted_lengths, indices = torch.sort(
|
||||||
|
text_lengths.view(-1), dim=0, descending=True)
|
||||||
|
sorted_lengths = sorted_lengths.long().numpy()
|
||||||
|
text_input_var = text_input_var[indices]
|
||||||
|
mel_spec_var = mel_spec_var[indices]
|
||||||
|
linear_spec_var = linear_spec_var[indices]
|
||||||
|
|
||||||
|
# dispatch data to GPU
|
||||||
|
if use_cuda:
|
||||||
|
text_input_var = text_input_var.cuda()
|
||||||
|
mel_spec_var = mel_spec_var.cuda()
|
||||||
|
linear_spec_var = linear_spec_var.cuda()
|
||||||
|
|
||||||
|
# forward pass
|
||||||
|
mel_output, linear_output, alignments =\
|
||||||
|
model.forward(text_input_var, mel_spec_var,
|
||||||
|
input_lengths= torch.autograd.Variable(torch.cuda.LongTensor(sorted_lengths)))
|
||||||
|
|
||||||
|
# loss computation
|
||||||
|
mel_loss = criterion(mel_output, mel_spec_var)
|
||||||
|
linear_loss = 0.5 * criterion(linear_output, linear_spec_var) \
|
||||||
|
+ 0.5 * criterion(linear_output[:, :, :n_priority_freq],
|
||||||
|
linear_spec_var[: ,: ,:n_priority_freq])
|
||||||
|
loss = mel_loss + linear_loss
|
||||||
|
|
||||||
|
# backpass and check the grad norm
|
||||||
|
loss.backward()
|
||||||
|
grad_norm, skip_flag = check_update(model, 0.5, 100)
|
||||||
|
if skip_flag:
|
||||||
|
optimizer.zero_grad()
|
||||||
|
print(" | > Iteration skipped!!")
|
||||||
|
continue
|
||||||
|
optimizer.step()
|
||||||
|
|
||||||
|
step_time = time.time() - start_time
|
||||||
|
epoch_time += step_time
|
||||||
|
|
||||||
|
# update
|
||||||
|
progbar.update(num_iter+1, values=[('total_loss', loss.data[0]),
|
||||||
|
('linear_loss', linear_loss.data[0]),
|
||||||
|
('mel_loss', mel_loss.data[0]),
|
||||||
|
('grad_norm', grad_norm)])
|
||||||
|
|
||||||
|
# Plot Training Iter Stats
|
||||||
|
tb.add_scalar('TrainIterLoss/TotalLoss', loss.data[0], current_step)
|
||||||
|
tb.add_scalar('TrainIterLoss/LinearLoss', linear_loss.data[0],
|
||||||
|
current_step)
|
||||||
|
tb.add_scalar('TrainIterLoss/MelLoss', mel_loss.data[0], current_step)
|
||||||
|
tb.add_scalar('Params/LearningRate', optimizer.param_groups[0]['lr'],
|
||||||
|
current_step)
|
||||||
|
tb.add_scalar('Params/GradNorm', grad_norm, current_step)
|
||||||
|
tb.add_scalar('Time/StepTime', step_time, current_step)
|
||||||
|
|
||||||
|
if current_step % c.save_step == 0:
|
||||||
|
if c.checkpoint:
|
||||||
|
# save model
|
||||||
|
save_checkpoint(model, optimizer, linear_loss.data[0],
|
||||||
|
OUT_PATH, current_step, epoch)
|
||||||
|
|
||||||
|
# Diagnostic visualizations
|
||||||
|
const_spec = linear_output[0].data.cpu().numpy()
|
||||||
|
gt_spec = linear_spec_var[0].data.cpu().numpy()
|
||||||
|
|
||||||
|
const_spec = plot_spectrogram(const_spec, dataset.ap)
|
||||||
|
gt_spec = plot_spectrogram(gt_spec, dataset.ap)
|
||||||
|
tb.add_image('Visual/Reconstruction', const_spec, current_step)
|
||||||
|
tb.add_image('Visual/GroundTruth', gt_spec, current_step)
|
||||||
|
|
||||||
|
align_img = alignments[0].data.cpu().numpy()
|
||||||
|
align_img = plot_alignment(align_img)
|
||||||
|
tb.add_image('Visual/Alignment', align_img, current_step)
|
||||||
|
|
||||||
|
# Sample audio
|
||||||
|
audio_signal = linear_output[0].data.cpu().numpy()
|
||||||
|
dataset.ap.griffin_lim_iters = 60
|
||||||
|
audio_signal = dataset.ap.inv_spectrogram(audio_signal.T)
|
||||||
|
try:
|
||||||
|
tb.add_audio('SampleAudio', audio_signal, current_step,
|
||||||
|
sample_rate=c.sample_rate)
|
||||||
|
except:
|
||||||
|
print("\n > Error at audio signal on TB!!")
|
||||||
|
print(audio_signal.max())
|
||||||
|
print(audio_signal.min())
|
||||||
|
|
||||||
|
avg_linear_loss = np.mean(
|
||||||
|
progbar.sum_values['linear_loss'][0] / max(1, progbar.sum_values['linear_loss'][1]))
|
||||||
|
avg_mel_loss = np.mean(
|
||||||
|
progbar.sum_values['mel_loss'][0] / max(1, progbar.sum_values['mel_loss'][1]))
|
||||||
|
avg_total_loss = avg_mel_loss + avg_linear_loss
|
||||||
|
|
||||||
|
# Plot Training Epoch Stats
|
||||||
|
tb.add_scalar('TrainEpochLoss/TotalLoss', loss.data[0], current_step)
|
||||||
|
tb.add_scalar('TrainEpochLoss/LinearLoss', linear_loss.data[0], current_step)
|
||||||
|
tb.add_scalar('TrainEpochLoss/MelLoss', mel_loss.data[0], current_step)
|
||||||
|
tb.add_scalar('Time/EpochTime', epoch_time, epoch)
|
||||||
|
epoch_time = 0
|
||||||
|
|
||||||
|
return avg_linear_loss, current_step
|
||||||
|
|
||||||
|
|
||||||
|
def evaluate(model, criterion, data_loader, current_step):
|
||||||
|
model = model.train()
|
||||||
|
epoch_time = 0
|
||||||
|
|
||||||
|
print("\n | > Validation")
|
||||||
|
n_priority_freq = int(3000 / (c.sample_rate * 0.5) * c.num_freq)
|
||||||
|
progbar = Progbar(len(data_loader.dataset) / c.batch_size)
|
||||||
|
|
||||||
|
for num_iter, data in enumerate(data_loader):
|
||||||
|
start_time = time.time()
|
||||||
|
|
||||||
|
# setup input data
|
||||||
|
text_input = data[0]
|
||||||
|
text_lengths = data[1]
|
||||||
|
linear_input = data[2]
|
||||||
|
mel_input = data[3]
|
||||||
|
|
||||||
|
# convert inputs to variables
|
||||||
|
text_input_var = Variable(text_input)
|
||||||
|
mel_spec_var = Variable(mel_input)
|
||||||
|
linear_spec_var = Variable(linear_input, volatile=True)
|
||||||
|
|
||||||
|
# dispatch data to GPU
|
||||||
|
if use_cuda:
|
||||||
|
text_input_var = text_input_var.cuda()
|
||||||
|
mel_spec_var = mel_spec_var.cuda()
|
||||||
|
linear_spec_var = linear_spec_var.cuda()
|
||||||
|
|
||||||
|
# forward pass
|
||||||
|
mel_output, linear_output, alignments =\
|
||||||
|
model.forward(text_input_var, mel_spec_var)
|
||||||
|
|
||||||
|
# loss computation
|
||||||
|
mel_loss = criterion(mel_output, mel_spec_var)
|
||||||
|
linear_loss = 0.5 * criterion(linear_output, linear_spec_var) \
|
||||||
|
+ 0.5 * criterion(linear_output[:, :, :n_priority_freq],
|
||||||
|
linear_spec_var[: ,: ,:n_priority_freq])
|
||||||
|
loss = mel_loss + linear_loss
|
||||||
|
|
||||||
|
step_time = time.time() - start_time
|
||||||
|
epoch_time += step_time
|
||||||
|
|
||||||
|
# update
|
||||||
|
progbar.update(num_iter+1, values=[('total_loss', loss.data[0]),
|
||||||
|
('linear_loss', linear_loss.data[0]),
|
||||||
|
('mel_loss', mel_loss.data[0])])
|
||||||
|
|
||||||
|
# Diagnostic visualizations
|
||||||
|
idx = np.random.randint(c.batch_size)
|
||||||
|
const_spec = linear_output[idx].data.cpu().numpy()
|
||||||
|
gt_spec = linear_spec_var[idx].data.cpu().numpy()
|
||||||
|
|
||||||
|
const_spec = plot_spectrogram(const_spec, data_loader.dataset.ap)
|
||||||
|
gt_spec = plot_spectrogram(gt_spec, data_loader.dataset.ap)
|
||||||
|
tb.add_image('ValVisual/Reconstruction', const_spec, current_step)
|
||||||
|
tb.add_image('ValVisual/GroundTruth', gt_spec, current_step)
|
||||||
|
|
||||||
|
align_img = alignments[idx].data.cpu().numpy()
|
||||||
|
align_img = plot_alignment(align_img)
|
||||||
|
tb.add_image('ValVisual/ValidationAlignment', align_img, current_step)
|
||||||
|
|
||||||
|
# Sample audio
|
||||||
|
audio_signal = linear_output[idx].data.cpu().numpy()
|
||||||
|
data_loader.dataset.ap.griffin_lim_iters = 60
|
||||||
|
audio_signal = data_loader.dataset.ap.inv_spectrogram(audio_signal.T)
|
||||||
|
try:
|
||||||
|
tb.add_audio('ValSampleAudio', audio_signal, current_step,
|
||||||
|
sample_rate=c.sample_rate)
|
||||||
|
except:
|
||||||
|
print("\n > Error at audio signal on TB!!")
|
||||||
|
print(audio_signal.max())
|
||||||
|
print(audio_signal.min())
|
||||||
|
|
||||||
|
# compute average losses
|
||||||
|
avg_linear_loss = np.mean(
|
||||||
|
progbar.sum_values['linear_loss'][0] / max(1, progbar.sum_values['linear_loss'][1]))
|
||||||
|
avg_mel_loss = np.mean(
|
||||||
|
progbar.sum_values['mel_loss'][0] / max(1, progbar.sum_values['mel_loss'][1]))
|
||||||
|
avg_total_loss = avg_mel_loss + avg_linear_loss
|
||||||
|
|
||||||
|
# Plot Learning Stats
|
||||||
|
tb.add_scalar('ValEpochLoss/TotalLoss', avg_total_loss, current_step)
|
||||||
|
tb.add_scalar('ValEpochLoss/LinearLoss', avg_linear_loss, current_step)
|
||||||
|
tb.add_scalar('ValEpochLoss/MelLoss', avg_mel_loss, current_step)
|
||||||
|
return avg_linear_loss
|
||||||
|
|
||||||
|
|
||||||
def main(args):
|
def main(args):
|
||||||
|
|
||||||
# setup output paths and read configs
|
train_dataset = LJSpeechDataset(os.path.join(c.data_path, 'metadata_train.csv'),
|
||||||
c = load_config(args.config_path)
|
|
||||||
_ = os.path.dirname(os.path.realpath(__file__))
|
|
||||||
OUT_PATH = os.path.join(_, c.output_path)
|
|
||||||
OUT_PATH = create_experiment_folder(OUT_PATH)
|
|
||||||
CHECKPOINT_PATH = os.path.join(OUT_PATH, 'checkpoints')
|
|
||||||
shutil.copyfile(args.config_path, os.path.join(OUT_PATH, 'config.json'))
|
|
||||||
|
|
||||||
# save config to tmp place to be loaded by subsequent modules.
|
|
||||||
file_name = str(os.getpid())
|
|
||||||
tmp_path = os.path.join("/tmp/", file_name+'_tts')
|
|
||||||
pickle.dump(c, open(tmp_path, "wb"))
|
|
||||||
|
|
||||||
# setup tensorboard
|
|
||||||
LOG_DIR = OUT_PATH
|
|
||||||
tb = SummaryWriter(LOG_DIR)
|
|
||||||
|
|
||||||
# Ctrl+C handler to remove empty experiment folder
|
|
||||||
def signal_handler(signal, frame):
|
|
||||||
print(" !! Pressed Ctrl+C !!")
|
|
||||||
remove_experiment_folder(OUT_PATH)
|
|
||||||
sys.exit(1)
|
|
||||||
signal.signal(signal.SIGINT, signal_handler)
|
|
||||||
|
|
||||||
# Setup the dataset
|
|
||||||
dataset = LJSpeechDataset(os.path.join(c.data_path, 'metadata.csv'),
|
|
||||||
os.path.join(c.data_path, 'wavs'),
|
os.path.join(c.data_path, 'wavs'),
|
||||||
c.r,
|
c.r,
|
||||||
c.sample_rate,
|
c.sample_rate,
|
||||||
|
@ -71,28 +300,43 @@ def main(args):
|
||||||
c.power
|
c.power
|
||||||
)
|
)
|
||||||
|
|
||||||
dataloader = DataLoader(dataset, batch_size=c.batch_size,
|
train_loader = DataLoader(train_dataset, batch_size=c.batch_size,
|
||||||
shuffle=True, collate_fn=dataset.collate_fn,
|
shuffle=True, collate_fn=train_dataset.collate_fn,
|
||||||
drop_last=True, num_workers=c.num_loader_workers,
|
drop_last=True, num_workers=c.num_loader_workers,
|
||||||
pin_memory=True)
|
pin_memory=True)
|
||||||
|
|
||||||
# setup the model
|
val_dataset = LJSpeechDataset(os.path.join(c.data_path, 'metadata_val.csv'),
|
||||||
model = Tacotron(c.embedding_size,
|
os.path.join(c.data_path, 'wavs'),
|
||||||
|
c.r,
|
||||||
|
c.sample_rate,
|
||||||
|
c.text_cleaner,
|
||||||
|
c.num_mels,
|
||||||
|
c.min_level_db,
|
||||||
|
c.frame_shift_ms,
|
||||||
|
c.frame_length_ms,
|
||||||
|
c.preemphasis,
|
||||||
|
c.ref_level_db,
|
||||||
|
c.num_freq,
|
||||||
|
c.power
|
||||||
|
)
|
||||||
|
|
||||||
|
val_loader = DataLoader(val_dataset, batch_size=c.batch_size,
|
||||||
|
shuffle=True, collate_fn=val_dataset.collate_fn,
|
||||||
|
drop_last=True, num_workers= 4,
|
||||||
|
pin_memory=True)
|
||||||
|
|
||||||
c.hidden_size,
|
c.hidden_size,
|
||||||
c.num_mels,
|
c.num_mels,
|
||||||
c.num_freq,
|
c.num_freq,
|
||||||
c.r)
|
c.r)
|
||||||
|
|
||||||
# plot model on tensorboard
|
|
||||||
dummy_input = dataset.get_dummy_data()
|
|
||||||
|
|
||||||
## TODO: onnx does not support RNN fully yet
|
|
||||||
# model_proto_path = os.path.join(OUT_PATH, "model.proto")
|
|
||||||
# onnx.export(model, dummy_input, model_proto_path, verbose=True)
|
|
||||||
# tb.add_graph_onnx(model_proto_path)
|
|
||||||
|
|
||||||
optimizer = optim.Adam(model.parameters(), lr=c.lr)
|
optimizer = optim.Adam(model.parameters(), lr=c.lr)
|
||||||
|
|
||||||
|
if use_cuda:
|
||||||
|
criterion = nn.L1Loss().cuda()
|
||||||
|
else:
|
||||||
|
criterion = nn.L1Loss()
|
||||||
|
|
||||||
if args.restore_step:
|
if args.restore_step:
|
||||||
checkpoint = torch.load(os.path.join(
|
checkpoint = torch.load(os.path.join(
|
||||||
args.restore_path, 'checkpoint_%d.pth.tar' % args.restore_step))
|
args.restore_path, 'checkpoint_%d.pth.tar' % args.restore_step))
|
||||||
|
@ -119,168 +363,19 @@ def main(args):
|
||||||
num_params = count_parameters(model)
|
num_params = count_parameters(model)
|
||||||
print(" | > Model has {} parameters".format(num_params))
|
print(" | > Model has {} parameters".format(num_params))
|
||||||
|
|
||||||
model = model.train()
|
|
||||||
|
|
||||||
if not os.path.exists(CHECKPOINT_PATH):
|
if not os.path.exists(CHECKPOINT_PATH):
|
||||||
os.mkdir(CHECKPOINT_PATH)
|
os.mkdir(CHECKPOINT_PATH)
|
||||||
|
|
||||||
if use_cuda:
|
|
||||||
criterion = nn.L1Loss().cuda()
|
|
||||||
else:
|
|
||||||
criterion = nn.L1Loss()
|
|
||||||
|
|
||||||
n_priority_freq = int(3000 / (c.sample_rate * 0.5) * c.num_freq)
|
|
||||||
|
|
||||||
#lr_scheduler = ReduceLROnPlateau(optimizer, factor=c.lr_decay,
|
|
||||||
# patience=c.lr_patience, verbose=True)
|
|
||||||
epoch_time = 0
|
|
||||||
if 'best_loss' not in locals():
|
if 'best_loss' not in locals():
|
||||||
best_loss = float('inf')
|
best_loss = float('inf')
|
||||||
|
|
||||||
for epoch in range(0, c.epochs):
|
for epoch in range(0, c.epochs):
|
||||||
|
train_loss, current_step = train(model, criterion, train_loader, optimizer, epoch)
|
||||||
print("\n | > Epoch {}/{}".format(epoch, c.epochs))
|
val_loss = evaluate(model, criterion, val_loader, current_step)
|
||||||
progbar = Progbar(len(dataset) / c.batch_size)
|
best_loss = save_best_model(model, optimizer, val_loss,
|
||||||
|
|
||||||
for num_iter, data in enumerate(dataloader):
|
|
||||||
start_time = time.time()
|
|
||||||
|
|
||||||
text_input = data[0]
|
|
||||||
text_lengths = data[1]
|
|
||||||
linear_input = data[2]
|
|
||||||
mel_input = data[3]
|
|
||||||
|
|
||||||
current_step = num_iter + args.restore_step + epoch * len(dataloader) + 1
|
|
||||||
|
|
||||||
# setup lr
|
|
||||||
current_lr = lr_decay(c.lr, current_step, c.warmup_steps)
|
|
||||||
for params_group in optimizer.param_groups:
|
|
||||||
params_group['lr'] = current_lr
|
|
||||||
|
|
||||||
optimizer.zero_grad()
|
|
||||||
|
|
||||||
# Add a single frame of zeros to Mel Specs for better end detection
|
|
||||||
#try:
|
|
||||||
# mel_input = np.concatenate((np.zeros(
|
|
||||||
# [c.batch_size, 1, c.num_mels], dtype=np.float32),
|
|
||||||
# mel_input[:, 1:, :]), axis=1)
|
|
||||||
#except:
|
|
||||||
# raise TypeError("not same dimension")
|
|
||||||
|
|
||||||
# convert inputs to variables
|
|
||||||
text_input_var = Variable(text_input)
|
|
||||||
mel_spec_var = Variable(mel_input)
|
|
||||||
linear_spec_var = Variable(linear_input, volatile=True)
|
|
||||||
|
|
||||||
# sort sequence by length.
|
|
||||||
# TODO: might be unnecessary
|
|
||||||
sorted_lengths, indices = torch.sort(
|
|
||||||
text_lengths.view(-1), dim=0, descending=True)
|
|
||||||
sorted_lengths = sorted_lengths.long().numpy()
|
|
||||||
|
|
||||||
text_input_var = text_input_var[indices]
|
|
||||||
mel_spec_var = mel_spec_var[indices]
|
|
||||||
linear_spec_var = linear_spec_var[indices]
|
|
||||||
|
|
||||||
if use_cuda:
|
|
||||||
text_input_var = text_input_var.cuda()
|
|
||||||
mel_spec_var = mel_spec_var.cuda()
|
|
||||||
linear_spec_var = linear_spec_var.cuda()
|
|
||||||
|
|
||||||
mel_output, linear_output, alignments =\
|
|
||||||
model.forward(text_input_var, mel_spec_var,
|
|
||||||
input_lengths= torch.autograd.Variable(torch.cuda.LongTensor(sorted_lengths)))
|
|
||||||
|
|
||||||
mel_loss = criterion(mel_output, mel_spec_var)
|
|
||||||
#linear_loss = torch.abs(linear_output - linear_spec_var)
|
|
||||||
#linear_loss = 0.5 * \
|
|
||||||
#torch.mean(linear_loss) + 0.5 * \
|
|
||||||
#torch.mean(linear_loss[:, :n_priority_freq, :])
|
|
||||||
linear_loss = 0.5 * criterion(linear_output, linear_spec_var) \
|
|
||||||
+ 0.5 * criterion(linear_output[:, :, :n_priority_freq],
|
|
||||||
linear_spec_var[: ,: ,:n_priority_freq])
|
|
||||||
loss = mel_loss + linear_loss
|
|
||||||
|
|
||||||
loss.backward()
|
|
||||||
grad_norm, skip_flag = check_update(model, 0.5, 100)
|
|
||||||
if skip_flag:
|
|
||||||
optimizer.zero_grad()
|
|
||||||
print(" | > Iteration skipped!!")
|
|
||||||
continue
|
|
||||||
optimizer.step()
|
|
||||||
|
|
||||||
step_time = time.time() - start_time
|
|
||||||
epoch_time += step_time
|
|
||||||
|
|
||||||
progbar.update(num_iter+1, values=[('total_loss', loss.data[0]),
|
|
||||||
('linear_loss', linear_loss.data[0]),
|
|
||||||
('mel_loss', mel_loss.data[0]),
|
|
||||||
('grad_norm', grad_norm)])
|
|
||||||
|
|
||||||
# Plot Learning Stats
|
|
||||||
tb.add_scalar('Loss/TotalLoss', loss.data[0], current_step)
|
|
||||||
tb.add_scalar('Loss/LinearLoss', linear_loss.data[0],
|
|
||||||
current_step)
|
|
||||||
tb.add_scalar('Loss/MelLoss', mel_loss.data[0], current_step)
|
|
||||||
tb.add_scalar('Params/LearningRate', optimizer.param_groups[0]['lr'],
|
|
||||||
current_step)
|
|
||||||
tb.add_scalar('Params/GradNorm', grad_norm, current_step)
|
|
||||||
tb.add_scalar('Time/StepTime', step_time, current_step)
|
|
||||||
|
|
||||||
align_img = alignments[0].data.cpu().numpy()
|
|
||||||
align_img = plot_alignment(align_img)
|
|
||||||
tb.add_image('Attn/Alignment', align_img, current_step)
|
|
||||||
|
|
||||||
if current_step % c.save_step == 0:
|
|
||||||
|
|
||||||
if c.checkpoint:
|
|
||||||
# save model
|
|
||||||
save_checkpoint(model, optimizer, linear_loss.data[0],
|
|
||||||
OUT_PATH, current_step, epoch)
|
|
||||||
|
|
||||||
# Diagnostic visualizations
|
|
||||||
const_spec = linear_output[0].data.cpu().numpy()
|
|
||||||
gt_spec = linear_spec_var[0].data.cpu().numpy()
|
|
||||||
|
|
||||||
const_spec = plot_spectrogram(const_spec, dataset.ap)
|
|
||||||
gt_spec = plot_spectrogram(gt_spec, dataset.ap)
|
|
||||||
tb.add_image('Spec/Reconstruction', const_spec, current_step)
|
|
||||||
tb.add_image('Spec/GroundTruth', gt_spec, current_step)
|
|
||||||
|
|
||||||
align_img = alignments[0].data.cpu().numpy()
|
|
||||||
align_img = plot_alignment(align_img)
|
|
||||||
tb.add_image('Attn/Alignment', align_img, current_step)
|
|
||||||
|
|
||||||
# Sample audio
|
|
||||||
audio_signal = linear_output[0].data.cpu().numpy()
|
|
||||||
dataset.ap.griffin_lim_iters = 60
|
|
||||||
audio_signal = dataset.ap.inv_spectrogram(audio_signal.T)
|
|
||||||
try:
|
|
||||||
tb.add_audio('SampleAudio', audio_signal, current_step,
|
|
||||||
sample_rate=c.sample_rate)
|
|
||||||
except:
|
|
||||||
print("\n > Error at audio signal on TB!!")
|
|
||||||
print(audio_signal.max())
|
|
||||||
print(audio_signal.min())
|
|
||||||
|
|
||||||
|
|
||||||
# average loss after the epoch
|
|
||||||
avg_epoch_loss = np.mean(
|
|
||||||
progbar.sum_values['linear_loss'][0] / max(1, progbar.sum_values['linear_loss'][1]))
|
|
||||||
best_loss = save_best_model(model, optimizer, avg_epoch_loss,
|
|
||||||
best_loss, OUT_PATH,
|
best_loss, OUT_PATH,
|
||||||
current_step, epoch)
|
current_step, epoch)
|
||||||
|
|
||||||
tb.add_scalar('Time/EpochTime', epoch_time, epoch)
|
|
||||||
epoch_time = 0
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
parser = argparse.ArgumentParser()
|
signal.signal(signal.SIGINT, signal_handler)
|
||||||
parser.add_argument('--restore_step', type=int,
|
|
||||||
help='Global step to restore checkpoint', default=0)
|
|
||||||
parser.add_argument('--restore_path', type=str,
|
|
||||||
help='Folder path to checkpoints', default=0)
|
|
||||||
parser.add_argument('--config_path', type=str,
|
|
||||||
help='path to config file for training',)
|
|
||||||
args = parser.parse_args()
|
|
||||||
main(args)
|
main(args)
|
||||||
|
|
Loading…
Reference in New Issue