diff --git a/config.json b/config.json
index e19ea9de..fa72b9ab 100644
--- a/config.json
+++ b/config.json
@@ -103,6 +103,7 @@
     // TENSORBOARD and LOGGING
     "print_step": 25,       // Number of steps to log training on console.
+    "print_eval": false,    // If true, it prints loss values during evaluation.
     "save_step": 10000,     // Number of training steps expected to save training stats and checkpoints.
     "checkpoint": true,     // If true, it saves checkpoints per "save_step"
     "tb_model_param_stats": false,     // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging.
diff --git a/train.py b/train.py
index 1fc997ef..46e2c43f 100644
--- a/train.py
+++ b/train.py
@@ -20,8 +20,9 @@
 from TTS.utils.generic_utils import (
     get_git_branch, load_config, remove_experiment_folder, save_best_model,
     save_checkpoint, adam_weight_decay, set_init_dict, copy_config_file,
     setup_model, gradual_training_scheduler, KeepAverage,
     set_weight_decay, check_config)
-from TTS.utils.logger import Logger
+from TTS.utils.tensorboard_logger import TensorboardLogger
+from TTS.utils.console_logger import ConsoleLogger
 from TTS.utils.speakers import load_speaker_mapping, save_speaker_mapping, \
     get_speakers
 from TTS.utils.synthesis import synthesis
@@ -125,8 +126,8 @@ def train(model, criterion, optimizer, optimizer_st, scheduler,
     train_values = {
         'avg_postnet_loss': 0,
         'avg_decoder_loss': 0,
-        'avg_stop_loss': 0,
-        'avg_align_score': 0,
+        'avg_stopnet_loss': 0,
+        'avg_align_error': 0,
         'avg_step_time': 0,
         'avg_loader_time': 0,
         'avg_alignment_score': 0
@@ -138,13 +139,13 @@ def train(model, criterion, optimizer, optimizer_st, scheduler,
         train_values['avg_ga_loss'] = 0  # guided attention loss
     keep_avg = KeepAverage()
     keep_avg.add_values(train_values)
-    print("\n > Epoch {}/{}".format(epoch, c.epochs), flush=True)
     if use_cuda:
         batch_n_iter = int(
             len(data_loader.dataset) / (c.batch_size * num_gpus))
     else:
         batch_n_iter = int(len(data_loader.dataset) / c.batch_size)
     end_time = time.time()
+    c_logger.print_train_start()
     for num_iter, data in enumerate(data_loader):
         start_time = time.time()
@@ -193,9 +194,10 @@ def train(model, criterion, optimizer, optimizer_st, scheduler,
             grad_norm, grad_flag = check_update(model, c.grad_clip, ignore_stopnet=True)
             optimizer.step()
 
-        # compute alignment score
-        align_score = alignment_diagonal_score(alignments)
-        keep_avg.update_value('avg_align_score', align_score)
+        # compute alignment error (the lower the better)
+        align_error = 1 - alignment_diagonal_score(alignments)
+        keep_avg.update_value('avg_align_error', align_error)
+        loss_dict['align_error'] = align_error
 
         # backpass and check the grad norm for stop loss
         if c.separate_stopnet:
@@ -209,17 +211,22 @@ def train(model, criterion, optimizer, optimizer_st, scheduler,
         step_time = time.time() - start_time
         epoch_time += step_time
 
+        # update avg stats
+        update_train_values = {
+            'avg_postnet_loss': float(loss_dict['postnet_loss'].item()),
+            'avg_decoder_loss': float(loss_dict['decoder_loss'].item()),
+            'avg_stopnet_loss': loss_dict['stopnet_loss']
+            if isinstance(loss_dict['stopnet_loss'], float) else float(loss_dict['stopnet_loss'].item()),
+            'avg_step_time': step_time,
+            'avg_loader_time': loader_time
+        }
+        keep_avg.update_values(update_train_values)
+
         if global_step % c.print_step == 0:
-            print(
-                " | > Step:{}/{} GlobalStep:{} PostnetLoss:{:.5f} "
-                "DecoderLoss:{:.5f} StopLoss:{:.5f} GALoss:{:.5f} GradNorm:{:.5f} "
-                "GradNormST:{:.5f} AvgTextLen:{:.1f} AvgSpecLen:{:.1f} StepTime:{:.2f} "
-                "LoaderTime:{:.2f} LR:{:.6f}".format(
-                    num_iter, batch_n_iter, global_step, loss_dict['postnet_loss'].item(),
-                    loss_dict['decoder_loss'].item(), loss_dict['stopnet_loss'].item(),
-                    loss_dict['ga_loss'].item(), grad_norm, grad_norm_st, avg_text_length,
-                    avg_spec_length, step_time, loader_time, current_lr),
-                flush=True)
+            c_logger.print_train_step(batch_n_iter, num_iter, global_step,
+                                      avg_spec_length, avg_text_length,
+                                      step_time, loader_time, current_lr,
+                                      loss_dict, keep_avg.avg_values)
 
         # aggregate losses from processes
         if num_gpus > 1:
@@ -230,16 +237,6 @@ def train(model, criterion, optimizer, optimizer_st, scheduler,
                 num_gpus) if c.stopnet else loss_dict['stopnet_loss']
 
         if args.rank == 0:
-            update_train_values = {
-                'avg_postnet_loss': float(loss_dict['postnet_loss'].item()),
-                'avg_decoder_loss': float(loss_dict['decoder_loss'].item()),
-                'avg_stop_loss': loss_dict['stopnet_loss'].item()
-                if isinstance(loss_dict['stopnet_loss'], float) else float(loss_dict['stopnet_loss'].item()),
-                'avg_step_time': step_time,
-                'avg_loader_time': loader_time
-            }
-            keep_avg.update_values(update_train_values)
-
             # Plot Training Iter Stats
             # reduce TB load
             if global_step % 10 == 0:
@@ -289,23 +286,16 @@ def train(model, criterion, optimizer, optimizer_st, scheduler,
     end_time = time.time()
 
     # print epoch stats
-    print(" | > EPOCH END -- GlobalStep:{} "
-          "AvgPostnetLoss:{:.5f} AvgDecoderLoss:{:.5f} "
-          "AvgStopLoss:{:.5f} AvgGALoss:{:3f} EpochTime:{:.2f} "
-          "AvgStepTime:{:.2f} AvgLoaderTime:{:.2f}".format(
-              global_step, keep_avg['avg_postnet_loss'],
-              keep_avg['avg_decoder_loss'], keep_avg['avg_stop_loss'],
-              keep_avg['avg_ga_loss'], epoch_time,
-              keep_avg['avg_step_time'], keep_avg['avg_loader_time']),
-          flush=True)
+    c_logger.print_train_epoch_end(global_step, epoch, epoch_time, keep_avg)
+
     # Plot Epoch Stats
     if args.rank == 0:
         # Plot Training Epoch Stats
         epoch_stats = {
             "loss_postnet": keep_avg['avg_postnet_loss'],
             "loss_decoder": keep_avg['avg_decoder_loss'],
-            "stop_loss": keep_avg['avg_stop_loss'],
-            "alignment_score": keep_avg['avg_align_score'],
+            "stopnet_loss": keep_avg['avg_stopnet_loss'],
+            "alignment_error": keep_avg['avg_align_error'],
             "epoch_time": epoch_time
         }
         if c.ga_alpha > 0:
@@ -313,7 +303,7 @@ def train(model, criterion, optimizer, optimizer_st, scheduler,
         tb_logger.tb_train_epoch_stats(global_step, epoch_stats)
         if c.tb_model_param_stats:
             tb_logger.tb_model_weights(model, global_step)
-    return keep_avg['avg_postnet_loss'], global_step
+    return keep_avg.avg_values, global_step
 
 
 @torch.no_grad()
@@ -326,8 +316,8 @@ def evaluate(model, criterion, ap, global_step, epoch):
     eval_values_dict = {
         'avg_postnet_loss': 0,
         'avg_decoder_loss': 0,
-        'avg_stop_loss': 0,
-        'avg_align_score': 0
+        'avg_stopnet_loss': 0,
+        'avg_align_error': 0
     }
     if c.bidirectional_decoder:
         eval_values_dict['avg_decoder_b_loss'] = 0  # decoder backward loss
@@ -336,8 +326,8 @@ def evaluate(model, criterion, ap, global_step, epoch):
         eval_values_dict['avg_ga_loss'] = 0  # guided attention loss
     keep_avg = KeepAverage()
     keep_avg.add_values(eval_values_dict)
-    print("\n > Validation")
+    c_logger.print_eval_start()
     if data_loader is not None:
         for num_iter, data in enumerate(data_loader):
             start_time = time.time()
@@ -377,40 +367,27 @@ def evaluate(model, criterion, ap, global_step, epoch):
             epoch_time += step_time
 
-            # compute alignment score
-            align_score = alignment_diagonal_score(alignments)
-            keep_avg.update_value('avg_align_score', align_score)
+            # compute alignment error (the lower the better)
+            align_error = 1 - alignment_diagonal_score(alignments)
+            keep_avg.update_value('avg_align_error', align_error)
 
             # aggregate losses from processes
             if num_gpus > 1:
-                postnet_loss = reduce_tensor(postnet_loss.data, num_gpus)
-                decoder_loss = reduce_tensor(decoder_loss.data, num_gpus)
+                loss_dict['postnet_loss'] = reduce_tensor(loss_dict['postnet_loss'].data, num_gpus)
+                loss_dict['decoder_loss'] = reduce_tensor(loss_dict['decoder_loss'].data, num_gpus)
                 if c.stopnet:
-                    stop_loss = reduce_tensor(stop_loss.data, num_gpus)
+                    loss_dict['stopnet_loss'] = reduce_tensor(loss_dict['stopnet_loss'].data, num_gpus)
 
             keep_avg.update_values({
                 'avg_postnet_loss':
                 float(loss_dict['postnet_loss'].item()),
                 'avg_decoder_loss':
                 float(loss_dict['decoder_loss'].item()),
-                'avg_stop_loss':
+                'avg_stopnet_loss':
                 float(loss_dict['stopnet_loss'].item()),
             })
 
-            if num_iter % c.print_step == 0:
-                print(
-                    " | > TotalLoss: {:.5f} PostnetLoss: {:.5f} - {:.5f} DecoderLoss:{:.5f} - {:.5f} "
-                    "StopLoss: {:.5f} - {:.5f} GALoss: {:.5f} - {:.5f} AlignScore: {:.4f} - {:.4f}"
-                    .format(loss_dict['loss'].item(),
-                            loss_dict['postnet_loss'].item(),
-                            keep_avg['avg_postnet_loss'],
-                            loss_dict['decoder_loss'].item(),
-                            keep_avg['avg_decoder_loss'],
-                            loss_dict['stopnet_loss'].item(),
-                            keep_avg['avg_stop_loss'],
-                            loss_dict['ga_loss'].item(),
-                            keep_avg['avg_ga_loss'],
-                            align_score, keep_avg['avg_align_score']),
-                    flush=True)
+            if c.print_eval:
+                c_logger.print_eval_step(num_iter, loss_dict, keep_avg.avg_values)
 
         if args.rank == 0:
             # Diagnostic visualizations
@@ -439,8 +416,8 @@ def evaluate(model, criterion, ap, global_step, epoch):
             epoch_stats = {
                 "loss_postnet": keep_avg['avg_postnet_loss'],
                 "loss_decoder": keep_avg['avg_decoder_loss'],
-                "stop_loss": keep_avg['avg_stop_loss'],
-                "alignment_score": keep_avg['avg_align_score'],
+                "stopnet_loss": keep_avg['avg_stopnet_loss'],
+                "alignment_error": keep_avg['avg_align_error'],
             }
 
             if c.bidirectional_decoder:
@@ -501,7 +478,7 @@ def evaluate(model, criterion, ap, global_step, epoch):
         tb_logger.tb_test_audios(global_step, test_audios,
                                  c.audio['sample_rate'])
         tb_logger.tb_test_figures(global_step, test_figures)
-    return keep_avg['avg_postnet_loss']
+    return keep_avg.avg_values
 
 
 # FIXME: move args definition/parsing inside of main?
@@ -603,6 +580,7 @@ def main(args):  # pylint: disable=redefined-outer-name
 
     global_step = args.restore_step
     for epoch in range(0, c.epochs):
+        c_logger.print_epoch_start(epoch, c.epochs)
         # set gradual training
         if c.gradual_training is not None:
             r, c.batch_size = gradual_training_scheduler(global_step, c)
@@ -610,18 +588,16 @@ def main(args):  # pylint: disable=redefined-outer-name
             model.decoder.set_r(r)
             if c.bidirectional_decoder:
                 model.decoder_backward.set_r(r)
-            print(" > Number of outputs per iteration:", model.decoder.r)
+            print("\n > Number of output frames:", model.decoder.r)
 
-        train_loss, global_step = train(model, criterion, optimizer,
-                                        optimizer_st, scheduler, ap,
-                                        global_step, epoch)
-        val_loss = evaluate(model, criterion, ap, global_step, epoch)
-        print(" | > Training Loss: {:.5f} Validation Loss: {:.5f}".format(
-            train_loss, val_loss),
-            flush=True)
-        target_loss = train_loss
+        train_avg_loss_dict, global_step = train(model, criterion, optimizer,
+                                                 optimizer_st, scheduler, ap,
+                                                 global_step, epoch)
+        eval_avg_loss_dict = evaluate(model, criterion, ap, global_step, epoch)
+        c_logger.print_epoch_end(epoch, eval_avg_loss_dict)
+        target_loss = train_avg_loss_dict['avg_postnet_loss']
         if c.run_eval:
-            target_loss = val_loss
+            target_loss = eval_avg_loss_dict['avg_postnet_loss']
         best_loss = save_best_model(model, optimizer, target_loss, best_loss,
                                     OUT_PATH, global_step, epoch)
@@ -681,6 +657,8 @@ if __name__ == '__main__':
 
     AUDIO_PATH = os.path.join(OUT_PATH, 'test_audios')
 
+    c_logger = ConsoleLogger()
+
     if args.rank == 0:
         os.makedirs(AUDIO_PATH, exist_ok=True)
         new_fields = {}
@@ -693,7 +671,7 @@ if __name__ == '__main__':
             os.chmod(OUT_PATH, 0o775)
 
     LOG_DIR = OUT_PATH
-    tb_logger = Logger(LOG_DIR)
+    tb_logger = TensorboardLogger(LOG_DIR)
 
     # write model desc to tensorboard
     tb_logger.tb_add_text('model-description', c['run_description'], 0)
diff --git a/utils/console_logger.py b/utils/console_logger.py
new file mode 100644
index 00000000..fad963fd
--- /dev/null
+++ b/utils/console_logger.py
@@ -0,0 +1,88 @@
+import datetime
+from TTS.utils.generic_utils import AttrDict
+
+
+tcolors = AttrDict({
+    'OKBLUE': '\033[94m',
+    'HEADER': '\033[95m',
+    'OKGREEN': '\033[92m',
+    'WARNING': '\033[93m',
+    'FAIL': '\033[91m',
+    'ENDC': '\033[0m',
+    'BOLD': '\033[1m',
+    'UNDERLINE': '\033[4m'
+})
+
+
+class ConsoleLogger():
+    def __init__(self):
+        # TODO: color code for value changes
+        # use these to compare values between iterations
+        self.old_train_loss_dict = None
+        self.old_epoch_loss_dict = None
+        self.old_eval_loss_dict = None
+
+    def get_time(self):
+        now = datetime.datetime.now()
+        return now.strftime("%Y-%m-%d %H:%M:%S")
+
+    def print_epoch_start(self, epoch, max_epoch):
+        print("\n{}{} > EPOCH: {}/{}{}".format(tcolors.UNDERLINE, tcolors.BOLD,
+                                               epoch, max_epoch, tcolors.ENDC),
+              flush=True)
+
+    def print_train_start(self):
+        print(f"\n{tcolors.BOLD} > TRAINING ({self.get_time()}) {tcolors.ENDC}")
+
+    def print_train_step(self, batch_steps, step, global_step, avg_spec_length,
+                         avg_text_length, step_time, loader_time, lr,
+                         loss_dict, avg_loss_dict):
+        indent = "   | > "
+        print()
+        log_text = "{} --> STEP: {}/{} -- GLOBAL_STEP: {}{}\n".format(
+            tcolors.BOLD, step, batch_steps, global_step, tcolors.ENDC)
+        for key, value in loss_dict.items():
+            # print the avg value if given
+            if f'avg_{key}' in avg_loss_dict.keys():
+                log_text += "{}{}: {:.5f} ({:.5f})\n".format(indent, key, value, avg_loss_dict[f'avg_{key}'])
+            else:
+                log_text += "{}{}: {:.5f}\n".format(indent, key, value)
+        log_text += "{}avg_spec_len: {}\n{}avg_text_len: {}\n{}step_time: {:.2f}\n{}loader_time: {:.2f}\n{}lr: {:.5f}"\
+            .format(indent, avg_spec_length, indent, avg_text_length, indent, step_time, indent, loader_time, indent, lr)
+        print(log_text, flush=True)
+
+    def print_train_epoch_end(self, global_step, epoch, epoch_time,
+                              print_dict):
+        indent = "   | > "
+        log_text = f"\n{tcolors.BOLD} --> TRAIN PERFORMANCE -- EPOCH TIME: {epoch_time:.2f} sec -- GLOBAL_STEP: {global_step}{tcolors.ENDC}\n"
+        for key, value in print_dict.items():
+            log_text += "{}{}: {:.5f}\n".format(indent, key, value)
+        print(log_text, flush=True)
+
+    def print_eval_start(self):
+        print(f"{tcolors.BOLD} > EVALUATION {tcolors.ENDC}\n")
+
+    def print_eval_step(self, step, loss_dict, avg_loss_dict):
+        indent = "   | > "
+        log_text = f"{tcolors.BOLD} --> STEP: {step}{tcolors.ENDC}\n"
+        for key, value in loss_dict.items():
+            # print the avg value if given
+            if f'avg_{key}' in avg_loss_dict.keys():
+                log_text += "{}{}: {:.5f} ({:.5f})\n".format(indent, key, value, avg_loss_dict[f'avg_{key}'])
+            else:
+                log_text += "{}{}: {:.5f}\n".format(indent, key, value)
+        print(log_text, flush=True)
+
+    def print_epoch_end(self, epoch, avg_loss_dict):
+        indent = "   | > "
+        log_text = " {}--> EVAL PERFORMANCE{}\n".format(
+            tcolors.BOLD, tcolors.ENDC)
+        for key, value in avg_loss_dict.items():
+            # color the loss red if it got worse since the last evaluation
+            color = tcolors.OKGREEN
+            if self.old_eval_loss_dict is not None:
+                if self.old_eval_loss_dict[key] < value:
+                    color = tcolors.FAIL
+            log_text += "{}{}: {}{:.5f}{}\n".format(indent, key, color, value, tcolors.ENDC)
+        self.old_eval_loss_dict = avg_loss_dict
+        print(log_text, flush=True)
diff --git a/utils/generic_utils.py b/utils/generic_utils.py
index 9af0bb70..55f0b38f 100644
--- a/utils/generic_utils.py
+++ b/utils/generic_utils.py
@@ -144,7 +144,7 @@ def save_best_model(model, optimizer, model_loss, best_loss, out_path,
         best_loss = model_loss
         bestmodel_path = 'best_model.pth.tar'
         bestmodel_path = os.path.join(out_path, bestmodel_path)
-        print("\n > BEST MODEL ({0:.5f}) : {1:}".format(
+        print(" > BEST MODEL ({0:.5f}) : {1:}".format(
             model_loss, bestmodel_path))
         torch.save(state, bestmodel_path)
     return best_loss
@@ -368,6 +368,9 @@ class KeepAverage():
     def __getitem__(self, key):
         return self.avg_values[key]
 
+    def items(self):
+        return self.avg_values.items()
+
     def add_value(self, name, init_val=0, init_iter=0):
         self.avg_values[name] = init_val
         self.iters[name] = init_iter
diff --git a/utils/logger.py b/utils/tensorboard_logger.py
similarity index 98%
rename from utils/logger.py
rename to utils/tensorboard_logger.py
index e5faeda4..15fe04e4 100644
--- a/utils/logger.py
+++ b/utils/tensorboard_logger.py
@@ -2,7 +2,7 @@ import traceback
 from tensorboardX import SummaryWriter
 
 
-class Logger(object):
+class TensorboardLogger(object):
     def __init__(self, log_dir):
         self.writer = SummaryWriter(log_dir)
         self.train_stats = {}
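
Reviewer note: the snippet below is a standalone smoke test, not part of the patch, showing how the new ConsoleLogger and KeepAverage pieces fit together; it mirrors the calls the patched train.py makes. It assumes the patch is applied and the TTS package is importable, and every loss value in it is fabricated purely for illustration.

    # Standalone smoke test for the new console logging flow. Assumes this
    # patch is applied and TTS is on PYTHONPATH; loss values are fabricated.
    from TTS.utils.console_logger import ConsoleLogger
    from TTS.utils.generic_utils import KeepAverage

    c_logger = ConsoleLogger()
    keep_avg = KeepAverage()
    keep_avg.add_values({'avg_postnet_loss': 0, 'avg_decoder_loss': 0,
                         'avg_stopnet_loss': 0, 'avg_align_error': 0})

    c_logger.print_epoch_start(epoch=0, max_epoch=1000)
    c_logger.print_train_start()
    for step in range(3):
        # fabricated per-step losses standing in for loss_dict in train()
        loss_dict = {'postnet_loss': 0.5 - 0.1 * step,
                     'decoder_loss': 0.4 - 0.1 * step,
                     'stopnet_loss': 0.2,
                     'align_error': 0.3 - 0.05 * step}
        keep_avg.update_values({f'avg_{k}': v for k, v in loss_dict.items()})
        c_logger.print_train_step(batch_steps=3, step=step, global_step=step,
                                  avg_spec_length=250, avg_text_length=60,
                                  step_time=0.51, loader_time=0.01, lr=1e-4,
                                  loss_dict=loss_dict,
                                  avg_loss_dict=keep_avg.avg_values)
    c_logger.print_train_epoch_end(global_step=3, epoch=0, epoch_time=1.53,
                                   print_dict=keep_avg.avg_values)
    c_logger.print_eval_start()
    c_logger.print_eval_step(0, loss_dict, keep_avg.avg_values)
    # a second print_epoch_end call would color any loss that increased red
    c_logger.print_epoch_end(0, keep_avg.avg_values)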