From 2f3f43844e80306a5d83e42101cfa5e937ef4583 Mon Sep 17 00:00:00 2001 From: erogol Date: Mon, 22 Jun 2020 14:55:11 +0200 Subject: [PATCH] spec_gain fft_size and stft_pad_mode parameters for audio_processor --- train.py | 3 --- utils/audio.py | 28 +++++++++++++++------------- utils/generic_utils.py | 5 +++-- 3 files changed, 18 insertions(+), 18 deletions(-) diff --git a/train.py b/train.py index 8581132a..bdafaeba 100644 --- a/train.py +++ b/train.py @@ -473,8 +473,6 @@ def main(args): # pylint: disable=redefined-outer-name model = setup_model(num_chars, num_speakers, c) - print(" | > Num output units : {}".format(ap.num_freq), flush=True) - params = set_weight_decay(model, c.wd) optimizer = RAdam(params, lr=c.lr, weight_decay=0) if c.stopnet and c.separate_stopnet: @@ -542,7 +540,6 @@ def main(args): # pylint: disable=redefined-outer-name if c.bidirectional_decoder: model.decoder_backward.set_r(r) print("\n > Number of output frames:", model.decoder.r) - train_avg_loss_dict, global_step = train(model, criterion, optimizer, optimizer_st, scheduler, ap, global_step, epoch) diff --git a/utils/audio.py b/utils/audio.py index f941f609..31c065ff 100644 --- a/utils/audio.py +++ b/utils/audio.py @@ -17,7 +17,7 @@ class AudioProcessor(object): hop_length=None, win_length=None, ref_level_db=None, - num_freq=None, + fft_size=1024, power=None, preemphasis=0.0, signal_norm=None, @@ -25,6 +25,8 @@ class AudioProcessor(object): max_norm=None, mel_fmin=None, mel_fmax=None, + spec_gain=20, + stft_pad_mode='reflect', clip_norm=True, griffin_lim_iters=None, do_trim_silence=False, @@ -41,7 +43,7 @@ class AudioProcessor(object): self.frame_shift_ms = frame_shift_ms self.frame_length_ms = frame_length_ms self.ref_level_db = ref_level_db - self.num_freq = num_freq + self.fft_size = fft_size self.power = power self.preemphasis = preemphasis self.griffin_lim_iters = griffin_lim_iters @@ -49,6 +51,8 @@ class AudioProcessor(object): self.symmetric_norm = symmetric_norm self.mel_fmin = 
mel_fmin or 0 self.mel_fmax = mel_fmax + self.spec_gain = float(spec_gain) + self.stft_pad_mode = stft_pad_mode self.max_norm = 1.0 if max_norm is None else float(max_norm) self.clip_norm = clip_norm self.do_trim_silence = do_trim_silence @@ -58,12 +62,11 @@ class AudioProcessor(object): # setup stft parameters if hop_length is None: # compute stft parameters from given time values - self.n_fft, self.hop_length, self.win_length = self._stft_parameters() + self.hop_length, self.win_length = self._stft_parameters() else: # use stft parameters from config file self.hop_length = hop_length self.win_length = win_length - self.n_fft = (self.num_freq - 1) * 2 assert min_level_db != 0.0, " [!] min_level_db is 0" members = vars(self) for key, value in members.items(): @@ -86,19 +89,18 @@ class AudioProcessor(object): assert self.mel_fmax <= self.sample_rate // 2 return librosa.filters.mel( self.sample_rate, - self.n_fft, + self.fft_size, n_mels=self.num_mels, fmin=self.mel_fmin, fmax=self.mel_fmax) def _stft_parameters(self, ): """Compute necessary stft parameters with given time values""" - n_fft = (self.num_freq - 1) * 2 factor = self.frame_length_ms / self.frame_shift_ms assert (factor).is_integer(), " [!] frame_shift_ms should divide frame_length_ms" hop_length = int(self.frame_shift_ms / 1000.0 * self.sample_rate) win_length = int(hop_length * factor) - return n_fft, hop_length, win_length + return hop_length, win_length ### normalization ### def _normalize(self, S): @@ -110,7 +112,7 @@ class AudioProcessor(object): if hasattr(self, 'mel_scaler'): if S.shape[0] == self.num_mels: return self.mel_scaler.transform(S.T).T - elif S.shape[0] == self.n_fft / 2: + elif S.shape[0] == self.fft_size // 2 + 1: return self.linear_scaler.transform(S.T).T else: raise RuntimeError(' [!] 
Mean-Var stats does not match the given feature dimensions.') @@ -139,7 +141,7 @@ class AudioProcessor(object): if hasattr(self, 'mel_scaler'): if S_denorm.shape[0] == self.num_mels: return self.mel_scaler.inverse_transform(S_denorm.T).T - elif S_denorm.shape[0] == self.n_fft / 2: + elif S_denorm.shape[0] == self.fft_size // 2 + 1: return self.linear_scaler.inverse_transform(S_denorm.T).T else: raise RuntimeError(' [!] Mean-Var stats does not match the given feature dimensions.') @@ -184,11 +186,11 @@ class AudioProcessor(object): ### DB and AMP conversion ### # pylint: disable=no-self-use def _amp_to_db(self, x): - return 20 * np.log10(np.maximum(1e-5, x)) + return self.spec_gain * np.log10(np.maximum(1e-5, x)) # pylint: disable=no-self-use def _db_to_amp(self, x): - return np.power(10.0, x * 0.05) + return np.power(10.0, x / self.spec_gain) ### Preemphasis ### def apply_preemphasis(self, x): @@ -254,10 +256,10 @@ class AudioProcessor(object): def _stft(self, y): return librosa.stft( y=y, - n_fft=self.n_fft, + n_fft=self.fft_size, hop_length=self.hop_length, win_length=self.win_length, - pad_mode='constant' + pad_mode=self.stft_pad_mode, ) def _istft(self, y): diff --git a/utils/generic_utils.py b/utils/generic_utils.py index c50f8060..c806bdf3 100644 --- a/utils/generic_utils.py +++ b/utils/generic_utils.py @@ -146,7 +146,7 @@ def setup_model(num_chars, num_speakers, c): model = MyModel(num_chars=num_chars, num_speakers=num_speakers, r=c.r, - postnet_output_dim=c.audio['num_freq'], + postnet_output_dim=int(c.audio['fft_size'] / 2 + 1), decoder_output_dim=c.audio['num_mels'], gst=c.use_gst, memory_size=c.memory_size, @@ -252,7 +252,7 @@ def check_config(c): # audio processing parameters _check_argument('num_mels', c['audio'], restricted=True, val_type=int, min_val=10, max_val=2056) - _check_argument('num_freq', c['audio'], restricted=True, val_type=int, min_val=128, max_val=4058) + _check_argument('fft_size', c['audio'], restricted=True, val_type=int, min_val=128, 
max_val=4058) _check_argument('sample_rate', c['audio'], restricted=True, val_type=int, min_val=512, max_val=100000) _check_argument('frame_length_ms', c['audio'], restricted=True, val_type=float, min_val=10, max_val=1000, alternative='win_length') _check_argument('frame_shift_ms', c['audio'], restricted=True, val_type=float, min_val=1, max_val=1000, alternative='hop_length') @@ -278,6 +278,7 @@ def check_config(c): _check_argument('clip_norm', c['audio'], restricted=True, val_type=bool) _check_argument('mel_fmin', c['audio'], restricted=True, val_type=float, min_val=0.0, max_val=1000) _check_argument('mel_fmax', c['audio'], restricted=True, val_type=float, min_val=500.0) + _check_argument('spec_gain', c['audio'], restricted=True, val_type=float, min_val=1, max_val=100) _check_argument('do_trim_silence', c['audio'], restricted=True, val_type=bool) _check_argument('trim_db', c['audio'], restricted=True, val_type=int)