compute stft paddings to correct wav and spec alignment, especially for vocoder training

2020-03-26 21:10:37 +01:00 · 2020-03-26 21:10:37 +01:00 · d5efe040f7
parent 52c0b4e3e1
commit d5efe040f7
1 changed files with 12 additions and 2 deletions
--- a/utils/audio.py
+++ b/utils/audio.py
@@ -114,7 +114,7 @@ class AudioProcessor(object):
                    raise RuntimeError(' [!] Mean-Var stats does not match the given feature dimensions.')
            # range normalization
            S -= self.ref_level_db  # discard certain range of DB assuming it is air noise
-            S_norm = ((S - self.min_level_db) / - self.min_level_db)
+            S_norm = ((S - self.min_level_db) / (-self.min_level_db))
            if self.symmetric_norm:
                S_norm = ((2 * self.max_norm) * S_norm) - self.max_norm
                if self.clip_norm:
@@ -269,7 +269,17 @@ class AudioProcessor(object):
            y = self._istft(S_complex * angles)
        return y

-    ### Audio Processing ###
+    def compute_stft_paddings(x, fsize, fshift, pad_sides=1):
+        '''compute right padding (final frame) or both sides padding (first and final frames)
+        '''
+        assert pad_sides in (1, 2)
+        # return int(fsize // 2)
+        pad = (x.shape[0] // fshift + 1) * fshift - x.shape[0]
+        if pad_sides == 1:
+            return 0, pad
+        else:
+            return pad // 2, pad // 2 + pad % 2
+
    def find_endpoint(self, wav, threshold_db=-40, min_silence_sec=0.8):
        window_length = int(self.sample_rate * min_silence_sec)
        hop_length = int(window_length / 4)