Add docstrings and typing for `audio.py`

This commit is contained in:
Eren Gölge 2021-06-27 20:56:11 +02:00
parent ae6405bb76
commit 51398cd15b
1 changed files with 190 additions and 31 deletions

View File

@ -1,3 +1,5 @@
from typing import Dict, Tuple
import librosa
import numpy as np
import scipy.io.wavfile
@ -217,7 +219,12 @@ class AudioProcessor(object):
### setting up the parameters ###
def _build_mel_basis(
self,
):
) -> np.ndarray:
"""Build melspectrogram basis.
Returns:
np.ndarray: melspectrogram basis.
"""
if self.mel_fmax is not None:
assert self.mel_fmax <= self.sample_rate // 2
return librosa.filters.mel(
@ -226,8 +233,12 @@ class AudioProcessor(object):
def _stft_parameters(
self,
):
"""Compute necessary stft parameters with given time values"""
) -> Tuple[int, int]:
"""Compute the real STFT parameters from the time values.
Returns:
Tuple[int, int]: hop length and window length for STFT.
"""
factor = self.frame_length_ms / self.frame_shift_ms
assert (factor).is_integer(), " [!] frame_shift_ms should divide frame_length_ms"
hop_length = int(self.frame_shift_ms / 1000.0 * self.sample_rate)
@ -235,8 +246,18 @@ class AudioProcessor(object):
return hop_length, win_length
### normalization ###
def normalize(self, S):
"""Put values in [0, self.max_norm] or [-self.max_norm, self.max_norm]"""
def normalize(self, S: np.ndarray) -> np.ndarray:
"""Normalize values into `[0, self.max_norm]` or `[-self.max_norm, self.max_norm]`
Args:
S (np.ndarray): Spectrogram to normalize.
Raises:
RuntimeError: Mean and variance are computed from incompatible parameters.
Returns:
np.ndarray: Normalized spectrogram.
"""
# pylint: disable=no-else-return
S = S.copy()
if self.signal_norm:
@ -266,8 +287,18 @@ class AudioProcessor(object):
else:
return S
def denormalize(self, S):
"""denormalize values"""
def denormalize(self, S: np.ndarray) -> np.ndarray:
"""Denormalize spectrogram values.
Args:
S (np.ndarray): Spectrogram to denormalize.
Raises:
RuntimeError: Mean and variance are incompatible.
Returns:
np.ndarray: Denormalized spectrogram.
"""
# pylint: disable=no-else-return
S_denorm = S.copy()
if self.signal_norm:
@ -295,7 +326,16 @@ class AudioProcessor(object):
return S_denorm
### Mean-STD scaling ###
def load_stats(self, stats_path):
def load_stats(self, stats_path: str) -> Tuple[np.array, np.array, np.array, np.array, Dict]:
"""Loading mean and variance statistics from a `npy` file.
Args:
stats_path (str): Path to the `npy` file containing the mean and standard deviation statistics.
Returns:
Tuple[np.array, np.array, np.array, np.array, Dict]: loaded statistics and the config used to
compute them.
"""
stats = np.load(stats_path, allow_pickle=True).item() # pylint: disable=unexpected-keyword-arg
mel_mean = stats["mel_mean"]
mel_std = stats["mel_std"]
@ -314,7 +354,17 @@ class AudioProcessor(object):
return mel_mean, mel_std, linear_mean, linear_std, stats_config
# pylint: disable=attribute-defined-outside-init
def setup_scaler(self, mel_mean, mel_std, linear_mean, linear_std):
def setup_scaler(
self, mel_mean: np.ndarray, mel_std: np.ndarray, linear_mean: np.ndarray, linear_std: np.ndarray
) -> None:
"""Initialize scaler objects used in mean-std normalization.
Args:
mel_mean (np.ndarray): Mean for melspectrograms.
mel_std (np.ndarray): STD for melspectrograms.
linear_mean (np.ndarray): Mean for full scale spectrograms.
linear_std (np.ndarray): STD for full scale spectrograms.
"""
self.mel_scaler = StandardScaler()
self.mel_scaler.set_stats(mel_mean, mel_std)
self.linear_scaler = StandardScaler()
@ -322,32 +372,78 @@ class AudioProcessor(object):
### DB and AMP conversion ###
# pylint: disable=no-self-use
def _amp_to_db(self, x: np.ndarray) -> np.ndarray:
    """Convert amplitude values to decibels.

    Args:
        x (np.ndarray): Amplitude spectrogram.

    Returns:
        np.ndarray: Decibels spectrogram.
    """
    # Floor the input at 1e-5 so the logarithm never receives zero or a
    # negative value; the result is then scaled by `self.spec_gain`.
    floored = np.maximum(1e-5, x)
    return self.spec_gain * _log(floored, self.base)
# pylint: disable=no-self-use
def _db_to_amp(self, x: np.ndarray) -> np.ndarray:
    """Convert decibels spectrogram to amplitude spectrogram.

    Args:
        x (np.ndarray): Decibels spectrogram.

    Returns:
        np.ndarray: Amplitude spectrogram.
    """
    # Undo the `self.spec_gain` scaling before exponentiating in `self.base`.
    return _exp(x / self.spec_gain, self.base)
### Preemphasis ###
def apply_preemphasis(self, x: np.ndarray) -> np.ndarray:
    """Apply pre-emphasis to the audio signal.

    Useful to reduce the correlation between neighbouring signal values.

    Args:
        x (np.ndarray): Audio signal.

    Raises:
        RuntimeError: Preemphasis coeff is set to 0.

    Returns:
        np.ndarray: Decorrelated audio signal.
    """
    if self.preemphasis == 0:
        raise RuntimeError(" [!] Preemphasis is set 0.0.")
    # FIR filter: y[n] = x[n] - preemphasis * x[n - 1]
    fir_coeffs = [1, -self.preemphasis]
    return scipy.signal.lfilter(fir_coeffs, [1], x)
def apply_inv_preemphasis(self, x: np.ndarray) -> np.ndarray:
    """Reverse pre-emphasis.

    Args:
        x (np.ndarray): Pre-emphasized audio signal.

    Raises:
        RuntimeError: Preemphasis coeff is set to 0.

    Returns:
        np.ndarray: Audio signal with the pre-emphasis filter undone.
    """
    if self.preemphasis == 0:
        raise RuntimeError(" [!] Preemphasis is set 0.0.")
    # IIR filter: y[n] = x[n] + preemphasis * y[n - 1], the inverse of the
    # FIR filter [1, -preemphasis].
    iir_coeffs = [1, -self.preemphasis]
    return scipy.signal.lfilter([1], iir_coeffs, x)
### SPECTROGRAMs ###
def _linear_to_mel(self, spectrogram: np.ndarray) -> np.ndarray:
    """Project a full scale spectrogram to a melspectrogram.

    Args:
        spectrogram (np.ndarray): Full scale spectrogram.

    Returns:
        np.ndarray: Melspectrogram.
    """
    # Matrix product with the mel filterbank stored in `self.mel_basis`.
    return np.dot(self.mel_basis, spectrogram)
def _mel_to_linear(self, mel_spec: np.ndarray) -> np.ndarray:
    """Convert a melspectrogram to full scale spectrogram.

    Args:
        mel_spec (np.ndarray): Melspectrogram.

    Returns:
        np.ndarray: Full scale spectrogram, floored at 1e-10.
    """
    projected = np.dot(self.inv_mel_basis, mel_spec)
    # Clamp to a small positive value so the result is never zero or negative.
    return np.maximum(1e-10, projected)
def spectrogram(self, y):
def spectrogram(self, y: np.ndarray) -> np.ndarray:
"""Compute a spectrogram from a waveform.
Args:
y (np.ndarray): Waveform.
Returns:
np.ndarray: Spectrogram.
"""
if self.preemphasis != 0:
D = self._stft(self.apply_preemphasis(y))
else:
@ -355,7 +451,8 @@ class AudioProcessor(object):
S = self._amp_to_db(np.abs(D))
return self.normalize(S).astype(np.float32)
def melspectrogram(self, y):
def melspectrogram(self, y: np.ndarray) -> np.ndarray:
"""Compute a melspectrogram from a waveform."""
if self.preemphasis != 0:
D = self._stft(self.apply_preemphasis(y))
else:
@ -363,8 +460,8 @@ class AudioProcessor(object):
S = self._amp_to_db(self._linear_to_mel(np.abs(D)))
return self.normalize(S).astype(np.float32)
def inv_spectrogram(self, spectrogram):
"""Converts spectrogram to waveform using librosa"""
def inv_spectrogram(self, spectrogram: np.ndarray) -> np.ndarray:
"""Convert a spectrogram to a waveform using Griffin-Lim vocoder."""
S = self.denormalize(spectrogram)
S = self._db_to_amp(S)
# Reconstruct phase
@ -372,8 +469,8 @@ class AudioProcessor(object):
return self.apply_inv_preemphasis(self._griffin_lim(S ** self.power))
return self._griffin_lim(S ** self.power)
def inv_melspectrogram(self, mel_spectrogram):
"""Converts melspectrogram to waveform using librosa"""
def inv_melspectrogram(self, mel_spectrogram: np.ndarray) -> np.ndarray:
"""Convert a melspectrogram to a waveform using Griffin-Lim vocoder."""
D = self.denormalize(mel_spectrogram)
S = self._db_to_amp(D)
S = self._mel_to_linear(S) # Convert back to linear
@ -381,7 +478,15 @@ class AudioProcessor(object):
return self.apply_inv_preemphasis(self._griffin_lim(S ** self.power))
return self._griffin_lim(S ** self.power)
def out_linear_to_mel(self, linear_spec):
def out_linear_to_mel(self, linear_spec: np.ndarray) -> np.ndarray:
"""Convert a full scale linear spectrogram output of a network to a melspectrogram.
Args:
linear_spec (np.ndarray): Normalized full scale linear spectrogram.
Returns:
np.ndarray: Normalized melspectrogram.
"""
S = self.denormalize(linear_spec)
S = self._db_to_amp(S)
S = self._linear_to_mel(np.abs(S))
@ -390,7 +495,15 @@ class AudioProcessor(object):
return mel
### STFT and ISTFT ###
def _stft(self, y):
def _stft(self, y: np.ndarray) -> np.ndarray:
"""Librosa STFT wrapper.
Args:
y (np.ndarray): Audio signal.
Returns:
np.ndarray: Complex number array.
"""
return librosa.stft(
y=y,
n_fft=self.fft_size,
@ -401,7 +514,8 @@ class AudioProcessor(object):
center=True,
)
def _istft(self, y: np.ndarray) -> np.ndarray:
    """Librosa iSTFT wrapper.

    Args:
        y (np.ndarray): STFT matrix to invert.

    Returns:
        np.ndarray: Output of `librosa.istft` for the given input.
    """
    # Uses the instance's hop and window lengths for the inverse transform.
    return librosa.istft(y, hop_length=self.hop_length, win_length=self.win_length)
def _griffin_lim(self, S):
@ -414,7 +528,8 @@ class AudioProcessor(object):
return y
def compute_stft_paddings(self, x, pad_sides=1):
"""compute right padding (final frame) or both sides padding (first and final frames)"""
"""Compute paddings used by Librosa's STFT. Compute right padding (final frame) or both sides padding
(first and final frames)"""
assert pad_sides in (1, 2)
pad = (x.shape[0] // self.hop_length + 1) * self.hop_length - x.shape[0]
if pad_sides == 1:
@ -434,7 +549,17 @@ class AudioProcessor(object):
# return f0
### Audio Processing ###
def find_endpoint(self, wav, threshold_db=-40, min_silence_sec=0.8):
def find_endpoint(self, wav: np.ndarray, threshold_db=-40, min_silence_sec=0.8) -> int:
"""Find the last point without silence at the end of an audio signal.
Args:
wav (np.ndarray): Audio signal.
threshold_db (int, optional): Silence threshold in decibels. Defaults to -40.
min_silence_sec (float, optional): Ignore silences that are shorter than this in secs. Defaults to 0.8.
Returns:
int: Last point without silence.
"""
window_length = int(self.sample_rate * min_silence_sec)
hop_length = int(window_length / 4)
threshold = self._db_to_amp(threshold_db)
@ -452,11 +577,28 @@ class AudioProcessor(object):
]
@staticmethod
def sound_norm(x: np.ndarray) -> np.ndarray:
    """Normalize the volume of an audio signal.

    The waveform is scaled so that its peak absolute value becomes 0.95.
    An empty or completely silent waveform has no peak to scale by and is
    returned as an unchanged copy instead of producing NaNs or raising.

    Args:
        x (np.ndarray): Raw waveform.

    Returns:
        np.ndarray: Volume normalized waveform.
    """
    if x.size == 0:
        return x.copy()
    peak = np.abs(x).max()
    if peak == 0:
        # Avoid 0 / 0 -> NaN on all-zero (silent) input.
        return x.copy()
    return x / peak * 0.95
### save and load ###
def load_wav(self, filename, sr=None):
def load_wav(self, filename: str, sr: int = None) -> np.ndarray:
"""Read a wav file using Librosa and optionally resample, silence trim, volume normalize.
Args:
filename (str): Path to the wav file.
sr (int, optional): Sampling rate for resampling. Defaults to None.
Returns:
np.ndarray: Loaded waveform.
"""
if self.resample:
x, sr = librosa.load(filename, sr=self.sample_rate)
elif sr is None:
@ -473,12 +615,19 @@ class AudioProcessor(object):
x = self.sound_norm(x)
return x
def save_wav(self, wav: np.ndarray, path: str, sr: int = None) -> None:
    """Save a waveform to a file using Scipy.

    The waveform is rescaled so that its peak maps to 32767 and is written
    as 16-bit integers.

    Args:
        wav (np.ndarray): Waveform to save.
        path (str): Path to an output file.
        sr (int, optional): Sampling rate used for saving to the file. Falls back to
            `self.sample_rate` when not given. Defaults to None.
    """
    # Floor the peak at 0.01 so a silent or near-silent signal is not divided
    # by (almost) zero.
    peak = max(0.01, np.max(np.abs(wav)))
    wav_norm = wav * (32767 / peak)
    scipy.io.wavfile.write(path, sr if sr else self.sample_rate, wav_norm.astype(np.int16))
@staticmethod
def mulaw_encode(wav, qc):
def mulaw_encode(wav: np.ndarray, qc: int) -> np.ndarray:
mu = 2 ** qc - 1
# wav_abs = np.minimum(np.abs(wav), 1.0)
signal = np.sign(wav) * np.log(1 + mu * np.abs(wav)) / np.log(1.0 + mu)
@ -500,11 +649,21 @@ class AudioProcessor(object):
return np.clip(x * 2 ** 15, -(2 ** 15), 2 ** 15 - 1).astype(np.int16)
@staticmethod
def quantize(x: np.ndarray, bits: int) -> np.ndarray:
    """Quantize a waveform to a given number of bits.

    Maps `[-1, 1]` linearly onto `[0, 2 ** bits - 1]`. The result is not
    rounded or cast to an integer dtype here.

    Args:
        x (np.ndarray): Waveform to quantize. Must be normalized into the range `[-1, 1]`.
        bits (int): Number of quantization bits.

    Returns:
        np.ndarray: Quantized waveform.
    """
    max_level = 2 ** bits - 1
    return (x + 1.0) * max_level / 2
@staticmethod
def dequantize(x: np.ndarray, bits: int) -> np.ndarray:
    """Dequantize a waveform from the given number of bits.

    Maps `[0, 2 ** bits - 1]` linearly back onto `[-1, 1]`.

    Args:
        x (np.ndarray): Quantized waveform.
        bits (int): Number of quantization bits.

    Returns:
        np.ndarray: Dequantized waveform.
    """
    max_level = 2 ** bits - 1
    return 2 * x / max_level - 1