Implement RMS volume normalization

This commit is contained in:
Eren Gölge 2021-12-22 15:51:14 +00:00
parent 56378b12f7
commit 633dcc9c56
2 changed files with 46 additions and 15 deletions

View File

@ -60,6 +60,12 @@ class BaseAudioConfig(Coqpit):
trim_db (int):
Silence threshold used for silence trimming. Defaults to 45.
do_rms_norm (bool, optional):
enable/disable RMS volume normalization when loading an audio file. Defaults to False.
db_level (float, optional):
dB level used for rms normalization. The range is -99 to 0. Defaults to None.
power (float):
Exponent used for expanding spectrogram levels before running Griffin Lim. It helps to reduce the
artifacts in the synthesized voice. Defaults to 1.5.
@ -116,6 +122,9 @@ class BaseAudioConfig(Coqpit):
# silence trimming
do_trim_silence: bool = True
trim_db: int = 45
# rms volume normalization
do_rms_norm: bool = False
db_level: float = None
# griffin-lim params
power: float = 1.5
griffin_lim_iters: int = 60

View File

@ -266,6 +266,12 @@ class AudioProcessor(object):
do_amp_to_db_mel (bool, optional):
enable/disable amplitude to dB conversion of mel spectrograms. Defaults to True.
do_rms_norm (bool, optional):
enable/disable RMS volume normalization when loading an audio file. Defaults to False.
db_level (float, optional):
dB level used for rms normalization. The range is -99 to 0. Defaults to None.
stats_path (str, optional):
Path to the computed stats file. Defaults to None.
@ -303,6 +309,8 @@ class AudioProcessor(object):
do_sound_norm=False,
do_amp_to_db_linear=True,
do_amp_to_db_mel=True,
do_rms_norm=False,
db_level=None,
stats_path=None,
verbose=True,
**_,
@ -334,6 +342,8 @@ class AudioProcessor(object):
self.do_sound_norm = do_sound_norm
self.do_amp_to_db_linear = do_amp_to_db_linear
self.do_amp_to_db_mel = do_amp_to_db_mel
self.do_rms_norm = do_rms_norm
self.db_level = db_level
self.stats_path = stats_path
# setup exp_func for db to amp conversion
if log_func == "np.log":
@ -726,21 +736,6 @@ class AudioProcessor(object):
frame_period=1000 * self.hop_length / self.sample_rate,
)
f0 = pw.stonemask(x.astype(np.double), f0, t, self.sample_rate)
# pad = int((self.win_length / self.hop_length) / 2)
# f0 = [0.0] * pad + f0 + [0.0] * pad
# f0 = np.pad(f0, (pad, pad), mode="constant", constant_values=0)
# f0 = np.array(f0, dtype=np.float32)
# f01, _, _ = librosa.pyin(
# x,
# fmin=65 if self.mel_fmin == 0 else self.mel_fmin,
# fmax=self.mel_fmax,
# frame_length=self.win_length,
# sr=self.sample_rate,
# fill_na=0.0,
# )
# spec = self.melspectrogram(x)
return f0
### Audio Processing ###
@ -783,10 +778,33 @@ class AudioProcessor(object):
"""
return x / abs(x).max() * 0.95
@staticmethod
def _rms_norm(wav, db_level=-27):
    """Scale a waveform so that its RMS amplitude equals ``10 ** (db_level / 20)``.

    Args:
        wav (np.ndarray): Waveform samples.
        db_level (float): Target RMS level in dB. Defaults to -27.

    Returns:
        np.ndarray: Scaled waveform. A waveform with zero energy (silent or
            empty) is returned as an unscaled copy, because no finite gain
            can bring it to the target level.
    """
    target_rms = 10 ** (db_level / 20)
    energy = np.sum(wav ** 2)
    # Zero energy would make the gain below inf/NaN and fill the output with NaN.
    if len(wav) == 0 or energy == 0:
        return np.array(wav)
    # gain = target_rms / current_rms, with current_rms = sqrt(energy / len(wav))
    gain = np.sqrt((len(wav) * (target_rms ** 2)) / energy)
    return wav * gain
def rms_volume_norm(self, x: np.ndarray, db_level: float = None) -> np.ndarray:
    """Normalize the volume based on RMS of the signal.

    Args:
        x (np.ndarray): Raw waveform.
        db_level (float, optional): Target level in dB, between -99 and 0.
            When None, ``self.db_level`` is used. Defaults to None.

    Returns:
        np.ndarray: RMS normalized waveform.

    Raises:
        TypeError: If neither ``db_level`` nor ``self.db_level`` is set.
        AssertionError: If the resolved level is outside [-99, 0].
    """
    if db_level is None:
        db_level = self.db_level
    # Without this guard the range check below fails with an opaque
    # "'<=' not supported" TypeError when no level was configured.
    if db_level is None:
        raise TypeError(" [!] db_level is not set; provide a value between -99 and 0")
    assert -99 <= db_level <= 0, " [!] db_level should be between -99 and 0"
    return self._rms_norm(x, db_level)
### save and load ###
def load_wav(self, filename: str, sr: int = None) -> np.ndarray:
"""Read a wav file using Librosa and optionally resample, silence trim, volume normalize.
Resampling slows down loading the file significantly. Therefore it is recommended to resample the file beforehand.
Args:
filename (str): Path to the wav file.
sr (int, optional): Sampling rate for resampling. Defaults to None.
@ -795,8 +813,10 @@ class AudioProcessor(object):
np.ndarray: Loaded waveform.
"""
if self.resample:
# loading with resampling. It is significantly slower.
x, sr = librosa.load(filename, sr=self.sample_rate)
elif sr is None:
# SF is faster than librosa for loading files
x, sr = sf.read(filename)
assert self.sample_rate == sr, "%s vs %s" % (self.sample_rate, sr)
else:
@ -808,6 +828,8 @@ class AudioProcessor(object):
print(f" [!] File cannot be trimmed for silence - {filename}")
if self.do_sound_norm:
x = self.sound_norm(x)
if self.do_rms_norm:
x = self.rms_volume_norm(x, self.db_level)
return x
def save_wav(self, wav: np.ndarray, path: str, sr: int = None) -> None: