From 3472a41255f02a9ac367e617f94183bc1811f623 Mon Sep 17 00:00:00 2001
From: erogol <erogol@hotmail.com>
Date: Tue, 10 Mar 2020 18:17:35 +0100
Subject: [PATCH 1/2] make it optional to load linear specs in dataloader and
 fix tests respectively

---
 datasets/TTSDataset.py | 18 ++++++++++++------
 tests/test_loader.py   |  1 +
 train.py               |  1 +
 3 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/datasets/TTSDataset.py b/datasets/TTSDataset.py
index d3a6f486..ae75f3cf 100644
--- a/datasets/TTSDataset.py
+++ b/datasets/TTSDataset.py
@@ -13,6 +13,7 @@ class MyDataset(Dataset):
     def __init__(self,
                  outputs_per_step,
                  text_cleaner,
+                 compute_linear_spec,
                  ap,
                  meta_data,
                  tp=None,
@@ -28,6 +29,7 @@ class MyDataset(Dataset):
         Args:
             outputs_per_step (int): number of time frames predicted per step.
             text_cleaner (str): text cleaner used for the dataset.
+            compute_linear_spec (bool): compute linear spectrogram if True.
             ap (TTS.utils.AudioProcessor): audio processor object.
             meta_data (list): list of dataset instances.
             batch_group_size (int): (0) range of batch randomization after sorting
@@ -47,6 +49,7 @@ class MyDataset(Dataset):
         self.outputs_per_step = outputs_per_step
         self.sample_rate = ap.sample_rate
         self.cleaners = text_cleaner
+        self.compute_linear_spec = compute_linear_spec
         self.min_seq_len = min_seq_len
         self.max_seq_len = max_seq_len
         self.ap = ap
@@ -193,7 +196,6 @@ class MyDataset(Dataset):
 
             # compute features
             mel = [self.ap.melspectrogram(w).astype('float32') for w in wav]
-            linear = [self.ap.spectrogram(w).astype('float32') for w in wav]
 
             mel_lengths = [m.shape[1] for m in mel]
 
@@ -208,25 +210,29 @@ class MyDataset(Dataset):
 
             # PAD sequences with longest instance in the batch
             text = prepare_data(text).astype(np.int32)
-            wav = prepare_data(wav)
 
             # PAD features with longest instance
-            linear = prepare_tensor(linear, self.outputs_per_step)
             mel = prepare_tensor(mel, self.outputs_per_step)
-            assert mel.shape[2] == linear.shape[2]
 
             # B x D x T --> B x T x D
-            linear = linear.transpose(0, 2, 1)
             mel = mel.transpose(0, 2, 1)
 
             # convert things to pytorch
             text_lenghts = torch.LongTensor(text_lenghts)
             text = torch.LongTensor(text)
-            linear = torch.FloatTensor(linear).contiguous()
             mel = torch.FloatTensor(mel).contiguous()
             mel_lengths = torch.LongTensor(mel_lengths)
             stop_targets = torch.FloatTensor(stop_targets)
 
+            # compute linear spectrogram 
+            if self.compute_linear_spec:
+                linear = [self.ap.spectrogram(w).astype('float32') for w in wav]
+                linear = prepare_tensor(linear, self.outputs_per_step)
+                linear = linear.transpose(0, 2, 1)
+                assert mel.shape[1] == linear.shape[1]
+                linear = torch.FloatTensor(linear).contiguous()
+            else:
+                linear = None
             return text, text_lenghts, speaker_name, linear, mel, mel_lengths, \
                    stop_targets, item_idxs
 
diff --git a/tests/test_loader.py b/tests/test_loader.py
index f2bec24c..447c7b38 100644
--- a/tests/test_loader.py
+++ b/tests/test_loader.py
@@ -36,6 +36,7 @@ class TestTTSDataset(unittest.TestCase):
         dataset = TTSDataset.MyDataset(
             r,
             c.text_cleaner,
+            compute_linear_spec=True,
             ap=self.ap,
             meta_data=items,
             tp=c.characters if 'characters' in c.keys() else None,
diff --git a/train.py b/train.py
index b3a0589b..15c65f64 100644
--- a/train.py
+++ b/train.py
@@ -47,6 +47,7 @@ def setup_loader(ap, r, is_val=False, verbose=False):
         dataset = MyDataset(
             r,
             c.text_cleaner,
+            compute_linear_spec=True if c.model.lower() is 'tacotron' else False
             meta_data=meta_data_eval if is_val else meta_data_train,
             ap=ap,
             tp=c.characters if 'characters' in c.keys() else None,

From 2a15e391669f9073ba10ef7ff20bb54ec5246977 Mon Sep 17 00:00:00 2001
From: erogol <erogol@hotmail.com>
Date: Tue, 10 Mar 2020 22:38:51 +0100
Subject: [PATCH 2/2] bug fix and run desc in tensorboard

---
 config.json     | 4 ++--
 train.py        | 7 +++++--
 utils/logger.py | 3 +++
 3 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/config.json b/config.json
index e525ec31..efc96c9e 100644
--- a/config.json
+++ b/config.json
@@ -1,7 +1,7 @@
 {
     "model": "Tacotron2",          // one of the model in models/  
-    "run_name": "ljspeech-stft_params",
-    "run_description": "tacotron2 cosntant stf parameters",
+    "run_name": "ljspeech",
+    "run_description": "tacotron2 with guided attention and -1 1 normalization and no preemphasis",
 
     // AUDIO PARAMETERS
     "audio":{
diff --git a/train.py b/train.py
index 15c65f64..ea6d391c 100644
--- a/train.py
+++ b/train.py
@@ -47,7 +47,7 @@ def setup_loader(ap, r, is_val=False, verbose=False):
         dataset = MyDataset(
             r,
             c.text_cleaner,
-            compute_linear_spec=True if c.model.lower() is 'tacotron' else False
+            compute_linear_spec=True if c.model.lower() is 'tacotron' else False,
             meta_data=meta_data_eval if is_val else meta_data_train,
             ap=ap,
             tp=c.characters if 'characters' in c.keys() else None,
@@ -410,7 +410,7 @@ def evaluate(model, criterion, ap, global_step, epoch):
                              loss_dict['ga_loss'].item(),
                              keep_avg['avg_ga_loss'],
                              align_score, keep_avg['avg_align_score']),
-                    flush=Tr ue)
+                    flush=True)
 
         if args.rank == 0:
             # Diagnostic visualizations
@@ -696,6 +696,9 @@ if __name__ == '__main__':
         LOG_DIR = OUT_PATH
         tb_logger = Logger(LOG_DIR)
 
+    # write model desc to tensorboard
+    tb_logger.tb_add_text('model-description', c['run_description'], 0)
+
     try:
         main(args)
     except KeyboardInterrupt:
diff --git a/utils/logger.py b/utils/logger.py
index 51a10422..e5faeda4 100644
--- a/utils/logger.py
+++ b/utils/logger.py
@@ -75,3 +75,6 @@ class Logger(object):
 
     def tb_test_figures(self, step, figures):
         self.dict_to_tb_figure("TestFigures", figures, step)
+
+    def tb_add_text(self, title, text, step):
+        self.writer.add_text(title, text, step)