From 3472a41255f02a9ac367e617f94183bc1811f623 Mon Sep 17 00:00:00 2001 From: erogol Date: Tue, 10 Mar 2020 18:17:35 +0100 Subject: [PATCH 1/2] make it optional to load linear specs in dataloader and fix tests respectively --- datasets/TTSDataset.py | 18 ++++++++++++------ tests/test_loader.py | 1 + train.py | 1 + 3 files changed, 14 insertions(+), 6 deletions(-) diff --git a/datasets/TTSDataset.py b/datasets/TTSDataset.py index d3a6f486..ae75f3cf 100644 --- a/datasets/TTSDataset.py +++ b/datasets/TTSDataset.py @@ -13,6 +13,7 @@ class MyDataset(Dataset): def __init__(self, outputs_per_step, text_cleaner, + compute_linear_spec, ap, meta_data, tp=None, @@ -28,6 +29,7 @@ class MyDataset(Dataset): Args: outputs_per_step (int): number of time frames predicted per step. text_cleaner (str): text cleaner used for the dataset. + compute_linear_spec (bool): compute linear spectrogram if True. ap (TTS.utils.AudioProcessor): audio processor object. meta_data (list): list of dataset instances. batch_group_size (int): (0) range of batch randomization after sorting @@ -47,6 +49,7 @@ class MyDataset(Dataset): self.outputs_per_step = outputs_per_step self.sample_rate = ap.sample_rate self.cleaners = text_cleaner + self.compute_linear_spec = compute_linear_spec self.min_seq_len = min_seq_len self.max_seq_len = max_seq_len self.ap = ap @@ -193,7 +196,6 @@ class MyDataset(Dataset): # compute features mel = [self.ap.melspectrogram(w).astype('float32') for w in wav] - linear = [self.ap.spectrogram(w).astype('float32') for w in wav] mel_lengths = [m.shape[1] for m in mel] @@ -208,25 +210,29 @@ class MyDataset(Dataset): # PAD sequences with longest instance in the batch text = prepare_data(text).astype(np.int32) - wav = prepare_data(wav) # PAD features with longest instance - linear = prepare_tensor(linear, self.outputs_per_step) mel = prepare_tensor(mel, self.outputs_per_step) - assert mel.shape[2] == linear.shape[2] # B x D x T --> B x T x D - linear = linear.transpose(0, 2, 1) mel = mel.transpose(0, 2, 1) # convert things to pytorch text_lenghts = torch.LongTensor(text_lenghts) text = torch.LongTensor(text) - linear = torch.FloatTensor(linear).contiguous() mel = torch.FloatTensor(mel).contiguous() mel_lengths = torch.LongTensor(mel_lengths) stop_targets = torch.FloatTensor(stop_targets) + # compute linear spectrogram + if self.compute_linear_spec: + linear = [self.ap.spectrogram(w).astype('float32') for w in wav] + linear = prepare_tensor(linear, self.outputs_per_step) + linear = linear.transpose(0, 2, 1) + assert mel.shape[1] == linear.shape[1] + linear = torch.FloatTensor(linear).contiguous() + else: + linear = None return text, text_lenghts, speaker_name, linear, mel, mel_lengths, \ stop_targets, item_idxs diff --git a/tests/test_loader.py b/tests/test_loader.py index f2bec24c..447c7b38 100644 --- a/tests/test_loader.py +++ b/tests/test_loader.py @@ -36,6 +36,7 @@ class TestTTSDataset(unittest.TestCase): dataset = TTSDataset.MyDataset( r, c.text_cleaner, + compute_linear_spec=True, ap=self.ap, meta_data=items, tp=c.characters if 'characters' in c.keys() else None, diff --git a/train.py b/train.py index b3a0589b..15c65f64 100644 --- a/train.py +++ b/train.py @@ -47,6 +47,7 @@ def setup_loader(ap, r, is_val=False, verbose=False): dataset = MyDataset( r, c.text_cleaner, + compute_linear_spec=True if c.model.lower() is 'tacotron' else False meta_data=meta_data_eval if is_val else meta_data_train, ap=ap, tp=c.characters if 'characters' in c.keys() else None, From 2a15e391669f9073ba10ef7ff20bb54ec5246977 Mon Sep 17 00:00:00 2001 From: erogol Date: Tue, 10 Mar 2020 22:38:51 +0100 Subject: [PATCH 2/2] bug fix and run desc in tensorboard --- config.json | 4 ++-- train.py | 7 +++++-- utils/logger.py | 3 +++ 3 files changed, 10 insertions(+), 4 deletions(-) diff --git a/config.json b/config.json index e525ec31..efc96c9e 100644 --- a/config.json +++ b/config.json @@ -1,7 +1,7 @@ { "model": "Tacotron2", // one of the model in models/ - "run_name": "ljspeech-stft_params", - "run_description": "tacotron2 cosntant stf parameters", + "run_name": "ljspeech", + "run_description": "tacotron2 with guided attention and -1 1 normalization and no preemphasis", // AUDIO PARAMETERS "audio":{ diff --git a/train.py b/train.py index 15c65f64..ea6d391c 100644 --- a/train.py +++ b/train.py @@ -47,7 +47,7 @@ def setup_loader(ap, r, is_val=False, verbose=False): dataset = MyDataset( r, c.text_cleaner, - compute_linear_spec=True if c.model.lower() is 'tacotron' else False + compute_linear_spec=True if c.model.lower() is 'tacotron' else False, meta_data=meta_data_eval if is_val else meta_data_train, ap=ap, tp=c.characters if 'characters' in c.keys() else None, @@ -410,7 +410,7 @@ def evaluate(model, criterion, ap, global_step, epoch): loss_dict['ga_loss'].item(), keep_avg['avg_ga_loss'], align_score, keep_avg['avg_align_score']), - flush=Tr ue) + flush=True) if args.rank == 0: # Diagnostic visualizations @@ -696,6 +696,9 @@ if __name__ == '__main__': LOG_DIR = OUT_PATH tb_logger = Logger(LOG_DIR) + # write model desc to tensorboard + tb_logger.tb_add_text('model-description', c['run_description'], 0) + try: main(args) except KeyboardInterrupt: diff --git a/utils/logger.py b/utils/logger.py index 51a10422..e5faeda4 100644 --- a/utils/logger.py +++ b/utils/logger.py @@ -75,3 +75,6 @@ class Logger(object): def tb_test_figures(self, step, figures): self.dict_to_tb_figure("TestFigures", figures, step) + + def tb_add_text(self, title, text, step): + self.writer.add_text(title, text, step)