compute stft paddings to correct wav and spec alignment, especially for vocoder training

This commit is contained in:
erogol 2020-03-26 21:10:37 +01:00
parent 52c0b4e3e1
commit d5efe040f7
1 changed files with 12 additions and 2 deletions

View File

@ -114,7 +114,7 @@ class AudioProcessor(object):
raise RuntimeError(' [!] Mean-Var stats does not match the given feature dimensions.')
# range normalization
S -= self.ref_level_db # discard certain range of DB assuming it is air noise
S_norm = ((S - self.min_level_db) / - self.min_level_db)
S_norm = ((S - self.min_level_db) / (-self.min_level_db))
if self.symmetric_norm:
S_norm = ((2 * self.max_norm) * S_norm) - self.max_norm
if self.clip_norm:
@ -269,7 +269,17 @@ class AudioProcessor(object):
y = self._istft(S_complex * angles)
return y
### Audio Processing ###
def compute_stft_paddings(x, fsize, fshift, pad_sides=1):
    """Compute the padding that extends ``x`` to the next multiple of ``fshift``.

    The total padding is ``(n // fshift + 1) * fshift - n`` with
    ``n = x.shape[0]``. For a non-negative ``n`` and a positive ``fshift``
    this is always in ``1..fshift``: an input whose length is already a
    multiple of ``fshift`` still receives one full ``fshift`` of padding.

    Args:
        x: Object exposing ``shape``; only ``x.shape[0]`` is read.
        fsize: Presumably the STFT frame size. It is not used by the
            computation and is kept only so existing callers keep working.
        fshift: Presumably the STFT frame shift (hop length), expressed in
            the same unit as ``x.shape[0]``.
        pad_sides: ``1`` to place all padding on the right (final frame),
            ``2`` to split it between both sides (first and final frames).

    Returns:
        A ``(left_pad, right_pad)`` tuple. With ``pad_sides == 2`` and an odd
        total, the extra unit goes to the right side.

    Raises:
        AssertionError: If ``pad_sides`` is neither 1 nor 2.
    """
    # NOTE(review): the surrounding diff context suggests this is defined
    # inside ``AudioProcessor``; it takes no ``self``, so confirm how callers
    # invoke it (class-level call vs. instance call).
    assert pad_sides in (1, 2), "pad_sides must be 1 or 2"
    length = x.shape[0]
    pad = (length // fshift + 1) * fshift - length
    if pad_sides == 1:
        return 0, pad
    # Split as evenly as possible; the remainder (0 or 1) lands on the right.
    half, remainder = divmod(pad, 2)
    return half, half + remainder

### Audio Processing ###
def find_endpoint(self, wav, threshold_db=-40, min_silence_sec=0.8):
window_length = int(self.sample_rate * min_silence_sec)
hop_length = int(window_length / 4)