From 060e746e21d895bbe8a3430ff7e5f2df5daa00ac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Mon, 9 Aug 2021 07:56:11 +0000 Subject: [PATCH] Add `do_amp_to_db` option --- TTS/config/shared_configs.py | 6 ++ TTS/utils/audio.py | 146 +++++++++++++++++++++++++++-------- 2 files changed, 121 insertions(+), 31 deletions(-) diff --git a/TTS/config/shared_configs.py b/TTS/config/shared_configs.py index 1dfa7706..dd92da65 100644 --- a/TTS/config/shared_configs.py +++ b/TTS/config/shared_configs.py @@ -36,6 +36,10 @@ class BaseAudioConfig(Coqpit): Enable / Disable sound normalization to reconcile the volume differences among samples. Defaults to False. do_trim_silence (bool): Enable / Disable trimming silences at the beginning and the end of the audio clip. Defaults to ```True```. + do_amp_to_db_linear (bool, optional): + enable/disable amplitude to dB conversion of linear spectrograms. Defaults to True. + do_amp_to_db_mel (bool, optional): + enable/disable amplitude to dB conversion of mel spectrograms. Defaults to True. trim_db (int): Silence threshold used for silence trimming. Defaults to 45. power (float): @@ -91,6 +95,8 @@ class BaseAudioConfig(Coqpit): mel_fmin: float = 0.0 mel_fmax: float = None spec_gain: int = 20 + do_amp_to_db_linear: bool = True + do_amp_to_db_mel: bool = True # normalization params signal_norm: bool = True min_level_db: int = -100 diff --git a/TTS/utils/audio.py b/TTS/utils/audio.py index 27b52bef..1f21369f 100644 --- a/TTS/utils/audio.py +++ b/TTS/utils/audio.py @@ -14,7 +14,10 @@ from TTS.tts.utils.data import StandardScaler class TorchSTFT(nn.Module): # pylint: disable=abstract-method - """TODO: Merge this with audio.py""" + """Some of the audio processing funtions using Torch for faster batch processing. + + TODO: Merge this with audio.py + """ def __init__( self, @@ -28,6 +31,8 @@ class TorchSTFT(nn.Module): # pylint: disable=abstract-method mel_fmax=None, n_mels=80, use_mel=False, + do_amp_to_db=False, + spec_gain=1.0, ): super().__init__() self.n_fft = n_fft @@ -39,6 +44,8 @@ class TorchSTFT(nn.Module): # pylint: disable=abstract-method self.mel_fmax = mel_fmax self.n_mels = n_mels self.use_mel = use_mel + self.do_amp_to_db = do_amp_to_db + self.spec_gain = spec_gain self.window = nn.Parameter(getattr(torch, window)(win_length), requires_grad=False) self.mel_basis = None if use_mel: @@ -79,6 +86,8 @@ class TorchSTFT(nn.Module): # pylint: disable=abstract-method S = torch.sqrt(torch.clamp(M ** 2 + P ** 2, min=1e-8)) if self.use_mel: S = torch.matmul(self.mel_basis.to(x), S) + if self.do_amp_to_db: + S = self._amp_to_db(S, spec_gain=self.spec_gain) return S def _build_mel_basis(self): @@ -87,6 +96,12 @@ class TorchSTFT(nn.Module): # pylint: disable=abstract-method ) self.mel_basis = torch.from_numpy(mel_basis).float() + def _amp_to_db(self, x, spec_gain=1.0): + return torch.log(torch.clamp(x, min=1e-5) * spec_gain) + + def _db_to_amp(self, x, spec_gain=1.0): + return torch.exp(x) / spec_gain + # pylint: disable=too-many-public-methods class AudioProcessor(object): @@ -97,33 +112,93 @@ class AudioProcessor(object): of the class with the model config. They are not meaningful for all the arguments. Args: - sample_rate (int, optional): target audio sampling rate. Defaults to None. - resample (bool, optional): enable/disable resampling of the audio clips when the target sampling rate does not match the original sampling rate. Defaults to False. - num_mels (int, optional): number of melspectrogram dimensions. Defaults to None. - log_func (int, optional): log exponent used for converting spectrogram aplitude to DB. - min_level_db (int, optional): minimum db threshold for the computed melspectrograms. Defaults to None. - frame_shift_ms (int, optional): milliseconds of frames between STFT columns. Defaults to None. - frame_length_ms (int, optional): milliseconds of STFT window length. Defaults to None. - hop_length (int, optional): number of frames between STFT columns. Used if ```frame_shift_ms``` is None. Defaults to None. - win_length (int, optional): STFT window length. Used if ```frame_length_ms``` is None. Defaults to None. - ref_level_db (int, optional): reference DB level to avoid background noise. In general <20DB corresponds to the air noise. Defaults to None. - fft_size (int, optional): FFT window size for STFT. Defaults to 1024. - power (int, optional): Exponent value applied to the spectrogram before GriffinLim. Defaults to None. - preemphasis (float, optional): Preemphasis coefficient. Preemphasis is disabled if == 0.0. Defaults to 0.0. - signal_norm (bool, optional): enable/disable signal normalization. Defaults to None. - symmetric_norm (bool, optional): enable/disable symmetric normalization. If set True normalization is performed in the range [-k, k] else [0, k], Defaults to None. - max_norm (float, optional): ```k``` defining the normalization range. Defaults to None. - mel_fmin (int, optional): minimum filter frequency for computing melspectrograms. Defaults to None. - mel_fmax (int, optional): maximum filter frequency for computing melspectrograms.. Defaults to None. - spec_gain (int, optional): gain applied when converting amplitude to DB. Defaults to 20. - stft_pad_mode (str, optional): Padding mode for STFT. Defaults to 'reflect'. - clip_norm (bool, optional): enable/disable clipping the our of range values in the normalized audio signal. Defaults to True. - griffin_lim_iters (int, optional): Number of GriffinLim iterations. Defaults to None. - do_trim_silence (bool, optional): enable/disable silence trimming when loading the audio signal. Defaults to False. - trim_db (int, optional): DB threshold used for silence trimming. Defaults to 60. - do_sound_norm (bool, optional): enable/disable signal normalization. Defaults to False. - stats_path (str, optional): Path to the computed stats file. Defaults to None. - verbose (bool, optional): enable/disable logging. Defaults to True. + sample_rate (int, optional): + target audio sampling rate. Defaults to None. + + resample (bool, optional): + enable/disable resampling of the audio clips when the target sampling rate does not match the original sampling rate. Defaults to False. + + num_mels (int, optional): + number of melspectrogram dimensions. Defaults to None. + + log_func (int, optional): + log exponent used for converting spectrogram aplitude to DB. + + min_level_db (int, optional): + minimum db threshold for the computed melspectrograms. Defaults to None. + + frame_shift_ms (int, optional): + milliseconds of frames between STFT columns. Defaults to None. + + frame_length_ms (int, optional): + milliseconds of STFT window length. Defaults to None. + + hop_length (int, optional): + number of frames between STFT columns. Used if ```frame_shift_ms``` is None. Defaults to None. + + win_length (int, optional): + STFT window length. Used if ```frame_length_ms``` is None. Defaults to None. + + ref_level_db (int, optional): + reference DB level to avoid background noise. In general <20DB corresponds to the air noise. Defaults to None. + + fft_size (int, optional): + FFT window size for STFT. Defaults to 1024. + + power (int, optional): + Exponent value applied to the spectrogram before GriffinLim. Defaults to None. + + preemphasis (float, optional): + Preemphasis coefficient. Preemphasis is disabled if == 0.0. Defaults to 0.0. + + signal_norm (bool, optional): + enable/disable signal normalization. Defaults to None. + + symmetric_norm (bool, optional): + enable/disable symmetric normalization. If set True normalization is performed in the range [-k, k] else [0, k], Defaults to None. + + max_norm (float, optional): + ```k``` defining the normalization range. Defaults to None. + + mel_fmin (int, optional): + minimum filter frequency for computing melspectrograms. Defaults to None. + + mel_fmax (int, optional): + maximum filter frequency for computing melspectrograms.. Defaults to None. + + spec_gain (int, optional): + gain applied when converting amplitude to DB. Defaults to 20. + + stft_pad_mode (str, optional): + Padding mode for STFT. Defaults to 'reflect'. + + clip_norm (bool, optional): + enable/disable clipping the our of range values in the normalized audio signal. Defaults to True. + + griffin_lim_iters (int, optional): + Number of GriffinLim iterations. Defaults to None. + + do_trim_silence (bool, optional): + enable/disable silence trimming when loading the audio signal. Defaults to False. + + trim_db (int, optional): + DB threshold used for silence trimming. Defaults to 60. + + do_sound_norm (bool, optional): + enable/disable signal normalization. Defaults to False. + + do_amp_to_db_linear (bool, optional): + enable/disable amplitude to dB conversion of linear spectrograms. Defaults to True. + + do_amp_to_db_mel (bool, optional): + enable/disable amplitude to dB conversion of mel spectrograms. Defaults to True. + + stats_path (str, optional): + Path to the computed stats file. Defaults to None. + + verbose (bool, optional): + enable/disable logging. Defaults to True. + """ def __init__( @@ -153,6 +228,8 @@ class AudioProcessor(object): do_trim_silence=False, trim_db=60, do_sound_norm=False, + do_amp_to_db_linear=True, + do_amp_to_db_mel=True, stats_path=None, verbose=True, **_, @@ -181,6 +258,8 @@ class AudioProcessor(object): self.do_trim_silence = do_trim_silence self.trim_db = trim_db self.do_sound_norm = do_sound_norm + self.do_amp_to_db_linear = do_amp_to_db_linear + self.do_amp_to_db_mel = do_amp_to_db_mel self.stats_path = stats_path # setup exp_func for db to amp conversion if log_func == "np.log": @@ -381,7 +460,6 @@ class AudioProcessor(object): Returns: np.ndarray: Decibels spectrogram. """ - return self.spec_gain * _log(np.maximum(1e-5, x), self.base) # pylint: disable=no-self-use @@ -448,7 +526,10 @@ class AudioProcessor(object): D = self._stft(self.apply_preemphasis(y)) else: D = self._stft(y) - S = self._amp_to_db(np.abs(D)) + if self.do_amp_to_db_linear: + S = self._amp_to_db(np.abs(D)) + else: + S = np.abs(D) return self.normalize(S).astype(np.float32) def melspectrogram(self, y: np.ndarray) -> np.ndarray: @@ -457,7 +538,10 @@ class AudioProcessor(object): D = self._stft(self.apply_preemphasis(y)) else: D = self._stft(y) - S = self._amp_to_db(self._linear_to_mel(np.abs(D))) + if self.do_amp_to_db_mel: + S = self._amp_to_db(self._linear_to_mel(np.abs(D))) + else: + S = self._linear_to_mel(np.abs(D)) return self.normalize(S).astype(np.float32) def inv_spectrogram(self, spectrogram: np.ndarray) -> np.ndarray: