From 355dfee98d5d5dd42c34288642cdb66ef221a0ca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Thu, 30 Sep 2021 14:38:10 +0000 Subject: [PATCH] Add mfcc to AudioProcessor --- TTS/utils/audio.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/TTS/utils/audio.py b/TTS/utils/audio.py index f5fb1d7f..c8a7b67d 100644 --- a/TTS/utils/audio.py +++ b/TTS/utils/audio.py @@ -122,6 +122,9 @@ class AudioProcessor(object): num_mels (int, optional): number of melspectrogram dimensions. Defaults to None. + num_mfcc (int, optional): + Number of MFCC values to compute. Defaults to None. + log_func (int, optional): log exponent used for converting spectrogram aplitude to DB. @@ -207,6 +210,7 @@ class AudioProcessor(object): sample_rate=None, resample=False, num_mels=None, + num_mfcc=None, log_func="np.log10", min_level_db=None, frame_shift_ms=None, @@ -240,6 +244,7 @@ class AudioProcessor(object): self.sample_rate = sample_rate self.resample = resample self.num_mels = num_mels + self.num_mfcc = num_mfcc self.log_func = log_func self.min_level_db = min_level_db or 0 self.frame_shift_ms = frame_shift_ms @@ -546,6 +551,22 @@ class AudioProcessor(object): S = self._linear_to_mel(np.abs(D)) return self.normalize(S).astype(np.float32) + def mfcc(self, y: np.ndarray) -> np.ndarray: + """Compute MFCC values from a waveform.""" + mel_args = { + "n_fft": self.fft_size, + "n_mels": self.num_mels, + "hop_length": self.hop_length, + "win_length": self.win_length, + "window": "hann", + "center": True, + "pad_mode": self.stft_pad_mode, + "fmin": self.mel_fmin, + "fmax": self.mel_fmax, + } + mfcc = librosa.feature.mfcc(y=y, sr=self.sample_rate, n_mfcc=self.num_mfcc, **mel_args) + return mfcc + def inv_spectrogram(self, spectrogram: np.ndarray) -> np.ndarray: """Convert a spectrogram to a waveform using Griffi-Lim vocoder.""" S = self.denormalize(spectrogram)