Remove zero-frame padding in dataloader

2019-11-19 12:58:54 +01:00 · 2019-11-19 12:58:54 +01:00 · 4873620bc2
parent ee788bc558
commit 4873620bc2
2 changed files with 7 additions and 6 deletions
--- a/datasets/TTSDataset.py
+++ b/datasets/TTSDataset.py
@ -193,22 +193,22 @@ class MyDataset(Dataset):
            mel = [self.ap.melspectrogram(w).astype('float32') for w in wav]
            linear = [self.ap.spectrogram(w).astype('float32') for w in wav]

-            mel_lengths = [m.shape[1] + 1 for m in mel]  # +1 for zero-frame
+            mel_lengths = [m.shape[1] for m in mel] 

            # compute 'stop token' targets
            stop_targets = [
-                np.array([0.] * (mel_len - 1)) for mel_len in mel_lengths
+                np.array([0.] * (mel_len - 1) + [1.]) for mel_len in mel_lengths
            ]

            # PAD stop targets
            stop_targets = prepare_stop_target(stop_targets,
                                               self.outputs_per_step)

-            # PAD sequences with largest length of the batch
+            # PAD sequences to the length of the longest instance in the batch
            text = prepare_data(text).astype(np.int32)
            wav = prepare_data(wav)

-            # PAD features with largest length + a zero frame
+            # PAD features to the length of the longest instance
            linear = prepare_tensor(linear, self.outputs_per_step)
            mel = prepare_tensor(mel, self.outputs_per_step)
            assert mel.shape[2] == linear.shape[2]
--- a/utils/data.py
+++ b/utils/data.py
@ -24,7 +24,7 @@ def _pad_tensor(x, length):


 def prepare_tensor(inputs, out_steps):
-    max_len = max((x.shape[1] for x in inputs)) + 1  # zero-frame
+    max_len = max((x.shape[1] for x in inputs))
    remainder = max_len % out_steps
    pad_len = max_len + (out_steps - remainder) if remainder > 0 else max_len
    return np.stack([_pad_tensor(x, pad_len) for x in inputs])
@ -38,7 +38,8 @@ def _pad_stop_target(x, length):


 def prepare_stop_target(inputs, out_steps):
-    max_len = max((x.shape[0] for x in inputs)) + 1  # zero-frame
+    """ Pad row vectors with 1. up to a common length that is a multiple of out_steps, then stack them. """
+    max_len = max((x.shape[0] for x in inputs))
    remainder = max_len % out_steps
    pad_len = max_len + (out_steps - remainder) if remainder > 0 else max_len
    return np.stack([_pad_stop_target(x, pad_len) for x in inputs])