From d7592621d14ae91ad945e1a2eb62946173befc15 Mon Sep 17 00:00:00 2001 From: Eren Golge Date: Fri, 2 Mar 2018 07:54:35 -0800 Subject: [PATCH] split train and validation steps --- config.json | 5 +- datasets/LJSpeech.py | 13 +- train.py | 483 ++++++++++++++++++++++++++----------------- 3 files changed, 297 insertions(+), 204 deletions(-) diff --git a/config.json b/config.json index d9ffc5c8..cd3aef72 100644 --- a/config.json +++ b/config.json @@ -20,11 +20,10 @@ "griffin_lim_iters": 60, "power": 1.5, - "num_loader_workers": 32, + "num_loader_workers": 16, "checkpoint": false, "save_step": 69, - "data_path": "/data/shared/KeithIto/LJSpeech-1.0", + "data_path": "/run/shm/erogol/LJSpeech-1.0", "output_path": "result", - "log_dir": "/home/erogol/projects/TTS/logs/" } diff --git a/datasets/LJSpeech.py b/datasets/LJSpeech.py index 81c2c9e9..ded16ed5 100644 --- a/datasets/LJSpeech.py +++ b/datasets/LJSpeech.py @@ -16,16 +16,15 @@ class LJSpeechDataset(Dataset): text_cleaner, num_mels, min_level_db, frame_shift_ms, frame_length_ms, preemphasis, ref_level_db, num_freq, power): - f = open(csv_file, "r") - self.frames = [line.split('|') for line in f] - f.close() + with open(csv_file, "r") as f: + self.frames = [line.split('|') for line in f] + self.frames = self.frames[:256] self.root_dir = root_dir self.outputs_per_step = outputs_per_step self.sample_rate = sample_rate self.cleaners = text_cleaner self.ap = AudioProcessor(sample_rate, num_mels, min_level_db, frame_shift_ms, - frame_length_ms, preemphasis, ref_level_db, num_freq, power - ) + frame_length_ms, preemphasis, ref_level_db, num_freq, power) print(" > Reading LJSpeech from - {}".format(root_dir)) print(" | > Number of instances : {}".format(len(self.frames))) @@ -41,11 +40,11 @@ class LJSpeechDataset(Dataset): def __getitem__(self, idx): wav_name = os.path.join(self.root_dir, - self.frames.ix[idx, 0]) + '.wav' + self.frames[idx][0]) + '.wav' text = self.frames[idx][1] text = np.asarray(text_to_sequence(text, [self.cleaners]), dtype=np.int32) wav = np.asarray(self.load_wav(wav_name)[0], dtype=np.float32) - sample = {'text': text, 'wav': wav, 'item_idx': self.frames.ix[idx, 0]} + sample = {'text': text, 'wav': wav, 'item_idx': self.frames[idx][0]} return sample def get_dummy_data(self): diff --git a/train.py b/train.py index 09c0e402..c806f965 100644 --- a/train.py +++ b/train.py @@ -27,36 +27,265 @@ from utils.visual import plot_alignment, plot_spectrogram from datasets.LJSpeech import LJSpeechDataset from models.tacotron import Tacotron + use_cuda = torch.cuda.is_available() +parser = argparse.ArgumentParser() +parser.add_argument('--restore_step', type=int, + help='Global step to restore checkpoint', default=0) +parser.add_argument('--restore_path', type=str, + help='Folder path to checkpoints', default=0) +parser.add_argument('--config_path', type=str, + help='path to config file for training',) +args = parser.parse_args() + +# setup output paths and read configs +c = load_config(args.config_path) +_ = os.path.dirname(os.path.realpath(__file__)) +OUT_PATH = os.path.join(_, c.output_path) +OUT_PATH = create_experiment_folder(OUT_PATH) +CHECKPOINT_PATH = os.path.join(OUT_PATH, 'checkpoints') +shutil.copyfile(args.config_path, os.path.join(OUT_PATH, 'config.json')) + +# save config to tmp place to be loaded by subsequent modules. +file_name = str(os.getpid()) +tmp_path = os.path.join("/tmp/", file_name+'_tts') +pickle.dump(c, open(tmp_path, "wb")) + +# setup tensorboard +LOG_DIR = OUT_PATH +tb = SummaryWriter(LOG_DIR) + + +def signal_handler(signal, frame): + """Ctrl+C handler to remove empty experiment folder""" + print(" !! Pressed Ctrl+C !!") + remove_experiment_folder(OUT_PATH) + sys.exit(1) + + +def train(model, criterion, data_loader, optimizer, epoch): + model = model.train() + epoch_time = 0 + + print(" | > Epoch {}/{}".format(epoch, c.epochs)) + progbar = Progbar(len(data_loader.dataset) / c.batch_size) + n_priority_freq = int(3000 / (c.sample_rate * 0.5) * c.num_freq) + for num_iter, data in enumerate(data_loader): + start_time = time.time() + + # setup input data + text_input = data[0] + text_lengths = data[1] + linear_input = data[2] + mel_input = data[3] + + current_step = num_iter + args.restore_step + epoch * len(data_loader) + 1 + + # setup lr + current_lr = lr_decay(c.lr, current_step, c.warmup_steps) + for params_group in optimizer.param_groups: + params_group['lr'] = current_lr + + optimizer.zero_grad() + + # convert inputs to variables + text_input_var = Variable(text_input) + mel_spec_var = Variable(mel_input) + linear_spec_var = Variable(linear_input, volatile=True) + + # sort sequence by length for curriculum learning + # TODO: might be unnecessary + sorted_lengths, indices = torch.sort( + text_lengths.view(-1), dim=0, descending=True) + sorted_lengths = sorted_lengths.long().numpy() + text_input_var = text_input_var[indices] + mel_spec_var = mel_spec_var[indices] + linear_spec_var = linear_spec_var[indices] + + # dispatch data to GPU + if use_cuda: + text_input_var = text_input_var.cuda() + mel_spec_var = mel_spec_var.cuda() + linear_spec_var = linear_spec_var.cuda() + + # forward pass + mel_output, linear_output, alignments =\ + model.forward(text_input_var, mel_spec_var, + input_lengths= torch.autograd.Variable(torch.cuda.LongTensor(sorted_lengths))) + + # loss computation + mel_loss = criterion(mel_output, mel_spec_var) + linear_loss = 0.5 * criterion(linear_output, linear_spec_var) \ + + 0.5 * criterion(linear_output[:, :, :n_priority_freq], + linear_spec_var[: ,: ,:n_priority_freq]) + loss = mel_loss + linear_loss + + # backpass and check the grad norm + loss.backward() + grad_norm, skip_flag = check_update(model, 0.5, 100) + if skip_flag: + optimizer.zero_grad() + print(" | > Iteration skipped!!") + continue + optimizer.step() + + step_time = time.time() - start_time + epoch_time += step_time + + # update + progbar.update(num_iter+1, values=[('total_loss', loss.data[0]), + ('linear_loss', linear_loss.data[0]), + ('mel_loss', mel_loss.data[0]), + ('grad_norm', grad_norm)]) + + # Plot Training Iter Stats + tb.add_scalar('TrainIterLoss/TotalLoss', loss.data[0], current_step) + tb.add_scalar('TrainIterLoss/LinearLoss', linear_loss.data[0], + current_step) + tb.add_scalar('TrainIterLoss/MelLoss', mel_loss.data[0], current_step) + tb.add_scalar('Params/LearningRate', optimizer.param_groups[0]['lr'], + current_step) + tb.add_scalar('Params/GradNorm', grad_norm, current_step) + tb.add_scalar('Time/StepTime', step_time, current_step) + + if current_step % c.save_step == 0: + if c.checkpoint: + # save model + save_checkpoint(model, optimizer, linear_loss.data[0], + OUT_PATH, current_step, epoch) + + # Diagnostic visualizations + const_spec = linear_output[0].data.cpu().numpy() + gt_spec = linear_spec_var[0].data.cpu().numpy() + + const_spec = plot_spectrogram(const_spec, dataset.ap) + gt_spec = plot_spectrogram(gt_spec, dataset.ap) + tb.add_image('Visual/Reconstruction', const_spec, current_step) + tb.add_image('Visual/GroundTruth', gt_spec, current_step) + + align_img = alignments[0].data.cpu().numpy() + align_img = plot_alignment(align_img) + tb.add_image('Visual/Alignment', align_img, current_step) + + # Sample audio + audio_signal = linear_output[0].data.cpu().numpy() + dataset.ap.griffin_lim_iters = 60 + audio_signal = dataset.ap.inv_spectrogram(audio_signal.T) + try: + tb.add_audio('SampleAudio', audio_signal, current_step, + sample_rate=c.sample_rate) + except: + print("\n > Error at audio signal on TB!!") + print(audio_signal.max()) + print(audio_signal.min()) + + avg_linear_loss = np.mean( + progbar.sum_values['linear_loss'][0] / max(1, progbar.sum_values['linear_loss'][1])) + avg_mel_loss = np.mean( + progbar.sum_values['mel_loss'][0] / max(1, progbar.sum_values['mel_loss'][1])) + avg_total_loss = avg_mel_loss + avg_linear_loss + + # Plot Training Epoch Stats + tb.add_scalar('TrainEpochLoss/TotalLoss', loss.data[0], current_step) + tb.add_scalar('TrainEpochLoss/LinearLoss', linear_loss.data[0], current_step) + tb.add_scalar('TrainEpochLoss/MelLoss', mel_loss.data[0], current_step) + tb.add_scalar('Time/EpochTime', epoch_time, epoch) + epoch_time = 0 + + return avg_linear_loss, current_step + + +def evaluate(model, criterion, data_loader, current_step): + model = model.train() + epoch_time = 0 + + print("\n | > Validation") + n_priority_freq = int(3000 / (c.sample_rate * 0.5) * c.num_freq) + progbar = Progbar(len(data_loader.dataset) / c.batch_size) + + for num_iter, data in enumerate(data_loader): + start_time = time.time() + + # setup input data + text_input = data[0] + text_lengths = data[1] + linear_input = data[2] + mel_input = data[3] + + # convert inputs to variables + text_input_var = Variable(text_input) + mel_spec_var = Variable(mel_input) + linear_spec_var = Variable(linear_input, volatile=True) + + # dispatch data to GPU + if use_cuda: + text_input_var = text_input_var.cuda() + mel_spec_var = mel_spec_var.cuda() + linear_spec_var = linear_spec_var.cuda() + + # forward pass + mel_output, linear_output, alignments =\ + model.forward(text_input_var, mel_spec_var) + + # loss computation + mel_loss = criterion(mel_output, mel_spec_var) + linear_loss = 0.5 * criterion(linear_output, linear_spec_var) \ + + 0.5 * criterion(linear_output[:, :, :n_priority_freq], + linear_spec_var[: ,: ,:n_priority_freq]) + loss = mel_loss + linear_loss + + step_time = time.time() - start_time + epoch_time += step_time + + # update + progbar.update(num_iter+1, values=[('total_loss', loss.data[0]), + ('linear_loss', linear_loss.data[0]), + ('mel_loss', mel_loss.data[0])]) + + # Diagnostic visualizations + idx = np.random.randint(c.batch_size) + const_spec = linear_output[idx].data.cpu().numpy() + gt_spec = linear_spec_var[idx].data.cpu().numpy() + + const_spec = plot_spectrogram(const_spec, data_loader.dataset.ap) + gt_spec = plot_spectrogram(gt_spec, data_loader.dataset.ap) + tb.add_image('ValVisual/Reconstruction', const_spec, current_step) + tb.add_image('ValVisual/GroundTruth', gt_spec, current_step) + + align_img = alignments[idx].data.cpu().numpy() + align_img = plot_alignment(align_img) + tb.add_image('ValVisual/ValidationAlignment', align_img, current_step) + + # Sample audio + audio_signal = linear_output[idx].data.cpu().numpy() + data_loader.dataset.ap.griffin_lim_iters = 60 + audio_signal = data_loader.dataset.ap.inv_spectrogram(audio_signal.T) + try: + tb.add_audio('ValSampleAudio', audio_signal, current_step, + sample_rate=c.sample_rate) + except: + print("\n > Error at audio signal on TB!!") + print(audio_signal.max()) + print(audio_signal.min()) + + # compute average losses + avg_linear_loss = np.mean( + progbar.sum_values['linear_loss'][0] / max(1, progbar.sum_values['linear_loss'][1])) + avg_mel_loss = np.mean( + progbar.sum_values['mel_loss'][0] / max(1, progbar.sum_values['mel_loss'][1])) + avg_total_loss = avg_mel_loss + avg_linear_loss + + # Plot Learning Stats + tb.add_scalar('ValEpochLoss/TotalLoss', avg_total_loss, current_step) + tb.add_scalar('ValEpochLoss/LinearLoss', avg_linear_loss, current_step) + tb.add_scalar('ValEpochLoss/MelLoss', avg_mel_loss, current_step) + return avg_linear_loss + + def main(args): - # setup output paths and read configs - c = load_config(args.config_path) - _ = os.path.dirname(os.path.realpath(__file__)) - OUT_PATH = os.path.join(_, c.output_path) - OUT_PATH = create_experiment_folder(OUT_PATH) - CHECKPOINT_PATH = os.path.join(OUT_PATH, 'checkpoints') - shutil.copyfile(args.config_path, os.path.join(OUT_PATH, 'config.json')) - - # save config to tmp place to be loaded by subsequent modules. - file_name = str(os.getpid()) - tmp_path = os.path.join("/tmp/", file_name+'_tts') - pickle.dump(c, open(tmp_path, "wb")) - - # setup tensorboard - LOG_DIR = OUT_PATH - tb = SummaryWriter(LOG_DIR) - - # Ctrl+C handler to remove empty experiment folder - def signal_handler(signal, frame): - print(" !! Pressed Ctrl+C !!") - remove_experiment_folder(OUT_PATH) - sys.exit(1) - signal.signal(signal.SIGINT, signal_handler) - - # Setup the dataset - dataset = LJSpeechDataset(os.path.join(c.data_path, 'metadata.csv'), + train_dataset = LJSpeechDataset(os.path.join(c.data_path, 'metadata_train.csv'), os.path.join(c.data_path, 'wavs'), c.r, c.sample_rate, @@ -71,27 +300,42 @@ def main(args): c.power ) - dataloader = DataLoader(dataset, batch_size=c.batch_size, - shuffle=True, collate_fn=dataset.collate_fn, + train_loader = DataLoader(train_dataset, batch_size=c.batch_size, + shuffle=True, collate_fn=train_dataset.collate_fn, drop_last=True, num_workers=c.num_loader_workers, pin_memory=True) + + val_dataset = LJSpeechDataset(os.path.join(c.data_path, 'metadata_val.csv'), + os.path.join(c.data_path, 'wavs'), + c.r, + c.sample_rate, + c.text_cleaner, + c.num_mels, + c.min_level_db, + c.frame_shift_ms, + c.frame_length_ms, + c.preemphasis, + c.ref_level_db, + c.num_freq, + c.power + ) + + val_loader = DataLoader(val_dataset, batch_size=c.batch_size, + shuffle=True, collate_fn=val_dataset.collate_fn, + drop_last=True, num_workers= 4, + pin_memory=True) - # setup the model - model = Tacotron(c.embedding_size, c.hidden_size, c.num_mels, c.num_freq, c.r) - # plot model on tensorboard - dummy_input = dataset.get_dummy_data() - - ## TODO: onnx does not support RNN fully yet - # model_proto_path = os.path.join(OUT_PATH, "model.proto") - # onnx.export(model, dummy_input, model_proto_path, verbose=True) - # tb.add_graph_onnx(model_proto_path) - optimizer = optim.Adam(model.parameters(), lr=c.lr) + + if use_cuda: + criterion = nn.L1Loss().cuda() + else: + criterion = nn.L1Loss() if args.restore_step: checkpoint = torch.load(os.path.join( @@ -118,169 +362,20 @@ def main(args): num_params = count_parameters(model) print(" | > Model has {} parameters".format(num_params)) - - model = model.train() - + if not os.path.exists(CHECKPOINT_PATH): os.mkdir(CHECKPOINT_PATH) - - if use_cuda: - criterion = nn.L1Loss().cuda() - else: - criterion = nn.L1Loss() - - n_priority_freq = int(3000 / (c.sample_rate * 0.5) * c.num_freq) - - #lr_scheduler = ReduceLROnPlateau(optimizer, factor=c.lr_decay, - # patience=c.lr_patience, verbose=True) - epoch_time = 0 + if 'best_loss' not in locals(): best_loss = float('inf') + for epoch in range(0, c.epochs): - - print("\n | > Epoch {}/{}".format(epoch, c.epochs)) - progbar = Progbar(len(dataset) / c.batch_size) - - for num_iter, data in enumerate(dataloader): - start_time = time.time() - - text_input = data[0] - text_lengths = data[1] - linear_input = data[2] - mel_input = data[3] - - current_step = num_iter + args.restore_step + epoch * len(dataloader) + 1 - - # setup lr - current_lr = lr_decay(c.lr, current_step, c.warmup_steps) - for params_group in optimizer.param_groups: - params_group['lr'] = current_lr - - optimizer.zero_grad() - - # Add a single frame of zeros to Mel Specs for better end detection - #try: - # mel_input = np.concatenate((np.zeros( - # [c.batch_size, 1, c.num_mels], dtype=np.float32), - # mel_input[:, 1:, :]), axis=1) - #except: - # raise TypeError("not same dimension") - - # convert inputs to variables - text_input_var = Variable(text_input) - mel_spec_var = Variable(mel_input) - linear_spec_var = Variable(linear_input, volatile=True) - - # sort sequence by length. - # TODO: might be unnecessary - sorted_lengths, indices = torch.sort( - text_lengths.view(-1), dim=0, descending=True) - sorted_lengths = sorted_lengths.long().numpy() - - text_input_var = text_input_var[indices] - mel_spec_var = mel_spec_var[indices] - linear_spec_var = linear_spec_var[indices] - - if use_cuda: - text_input_var = text_input_var.cuda() - mel_spec_var = mel_spec_var.cuda() - linear_spec_var = linear_spec_var.cuda() - - mel_output, linear_output, alignments =\ - model.forward(text_input_var, mel_spec_var, - input_lengths= torch.autograd.Variable(torch.cuda.LongTensor(sorted_lengths))) - - mel_loss = criterion(mel_output, mel_spec_var) - #linear_loss = torch.abs(linear_output - linear_spec_var) - #linear_loss = 0.5 * \ - #torch.mean(linear_loss) + 0.5 * \ - #torch.mean(linear_loss[:, :n_priority_freq, :]) - linear_loss = 0.5 * criterion(linear_output, linear_spec_var) \ - + 0.5 * criterion(linear_output[:, :, :n_priority_freq], - linear_spec_var[: ,: ,:n_priority_freq]) - loss = mel_loss + linear_loss - - loss.backward() - grad_norm, skip_flag = check_update(model, 0.5, 100) - if skip_flag: - optimizer.zero_grad() - print(" | > Iteration skipped!!") - continue - optimizer.step() - - step_time = time.time() - start_time - epoch_time += step_time - - progbar.update(num_iter+1, values=[('total_loss', loss.data[0]), - ('linear_loss', linear_loss.data[0]), - ('mel_loss', mel_loss.data[0]), - ('grad_norm', grad_norm)]) - - # Plot Learning Stats - tb.add_scalar('Loss/TotalLoss', loss.data[0], current_step) - tb.add_scalar('Loss/LinearLoss', linear_loss.data[0], - current_step) - tb.add_scalar('Loss/MelLoss', mel_loss.data[0], current_step) - tb.add_scalar('Params/LearningRate', optimizer.param_groups[0]['lr'], - current_step) - tb.add_scalar('Params/GradNorm', grad_norm, current_step) - tb.add_scalar('Time/StepTime', step_time, current_step) - - align_img = alignments[0].data.cpu().numpy() - align_img = plot_alignment(align_img) - tb.add_image('Attn/Alignment', align_img, current_step) - - if current_step % c.save_step == 0: - - if c.checkpoint: - # save model - save_checkpoint(model, optimizer, linear_loss.data[0], - OUT_PATH, current_step, epoch) - - # Diagnostic visualizations - const_spec = linear_output[0].data.cpu().numpy() - gt_spec = linear_spec_var[0].data.cpu().numpy() - - const_spec = plot_spectrogram(const_spec, dataset.ap) - gt_spec = plot_spectrogram(gt_spec, dataset.ap) - tb.add_image('Spec/Reconstruction', const_spec, current_step) - tb.add_image('Spec/GroundTruth', gt_spec, current_step) - - align_img = alignments[0].data.cpu().numpy() - align_img = plot_alignment(align_img) - tb.add_image('Attn/Alignment', align_img, current_step) - - # Sample audio - audio_signal = linear_output[0].data.cpu().numpy() - dataset.ap.griffin_lim_iters = 60 - audio_signal = dataset.ap.inv_spectrogram(audio_signal.T) - try: - tb.add_audio('SampleAudio', audio_signal, current_step, - sample_rate=c.sample_rate) - except: - print("\n > Error at audio signal on TB!!") - print(audio_signal.max()) - print(audio_signal.min()) - - - # average loss after the epoch - avg_epoch_loss = np.mean( - progbar.sum_values['linear_loss'][0] / max(1, progbar.sum_values['linear_loss'][1])) - best_loss = save_best_model(model, optimizer, avg_epoch_loss, + train_loss, current_step = train(model, criterion, train_loader, optimizer, epoch) + val_loss = evaluate(model, criterion, val_loader, current_step) + best_loss = save_best_model(model, optimizer, val_loss, best_loss, OUT_PATH, current_step, epoch) - tb.add_scalar('Time/EpochTime', epoch_time, epoch) - epoch_time = 0 - - if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument('--restore_step', type=int, - help='Global step to restore checkpoint', default=0) - parser.add_argument('--restore_path', type=str, - help='Folder path to checkpoints', default=0) - parser.add_argument('--config_path', type=str, - help='path to config file for training',) - args = parser.parse_args() + signal.signal(signal.SIGINT, signal_handler) main(args)