Implement RMS volume normalization

This commit is contained in:
Eren Gölge 2021-12-22 15:51:14 +00:00
parent 56378b12f7
commit 633dcc9c56
2 changed files with 46 additions and 15 deletions

View File

@ -60,6 +60,12 @@ class BaseAudioConfig(Coqpit):
trim_db (int): trim_db (int):
Silence threshold used for silence trimming. Defaults to 45. Silence threshold used for silence trimming. Defaults to 45.
do_rms_norm (bool, optional):
enable/disable RMS volume normalization when loading an audio file. Defaults to False.
db_level (float, optional):
dB level used for rms normalization. The range is -99 to 0. Defaults to None.
power (float): power (float):
Exponent used for expanding spectrogram levels before running Griffin Lim. It helps to reduce the Exponent used for expanding spectrogram levels before running Griffin Lim. It helps to reduce the
artifacts in the synthesized voice. Defaults to 1.5. artifacts in the synthesized voice. Defaults to 1.5.
@ -116,6 +122,9 @@ class BaseAudioConfig(Coqpit):
# silence trimming # silence trimming
do_trim_silence: bool = True do_trim_silence: bool = True
trim_db: int = 45 trim_db: int = 45
# rms volume normalization
do_rms_norm: bool = False
db_level: float = None
# griffin-lim params # griffin-lim params
power: float = 1.5 power: float = 1.5
griffin_lim_iters: int = 60 griffin_lim_iters: int = 60

View File

@ -266,6 +266,12 @@ class AudioProcessor(object):
do_amp_to_db_mel (bool, optional): do_amp_to_db_mel (bool, optional):
enable/disable amplitude to dB conversion of mel spectrograms. Defaults to True. enable/disable amplitude to dB conversion of mel spectrograms. Defaults to True.
do_rms_norm (bool, optional):
enable/disable RMS volume normalization when loading an audio file. Defaults to False.
db_level (float, optional):
dB level used for rms normalization. The range is -99 to 0. Defaults to None.
stats_path (str, optional): stats_path (str, optional):
Path to the computed stats file. Defaults to None. Path to the computed stats file. Defaults to None.
@ -303,6 +309,8 @@ class AudioProcessor(object):
do_sound_norm=False, do_sound_norm=False,
do_amp_to_db_linear=True, do_amp_to_db_linear=True,
do_amp_to_db_mel=True, do_amp_to_db_mel=True,
do_rms_norm=False,
db_level=None,
stats_path=None, stats_path=None,
verbose=True, verbose=True,
**_, **_,
@ -334,6 +342,8 @@ class AudioProcessor(object):
self.do_sound_norm = do_sound_norm self.do_sound_norm = do_sound_norm
self.do_amp_to_db_linear = do_amp_to_db_linear self.do_amp_to_db_linear = do_amp_to_db_linear
self.do_amp_to_db_mel = do_amp_to_db_mel self.do_amp_to_db_mel = do_amp_to_db_mel
self.do_rms_norm = do_rms_norm
self.db_level = db_level
self.stats_path = stats_path self.stats_path = stats_path
# setup exp_func for db to amp conversion # setup exp_func for db to amp conversion
if log_func == "np.log": if log_func == "np.log":
@ -726,21 +736,6 @@ class AudioProcessor(object):
frame_period=1000 * self.hop_length / self.sample_rate, frame_period=1000 * self.hop_length / self.sample_rate,
) )
f0 = pw.stonemask(x.astype(np.double), f0, t, self.sample_rate) f0 = pw.stonemask(x.astype(np.double), f0, t, self.sample_rate)
# pad = int((self.win_length / self.hop_length) / 2)
# f0 = [0.0] * pad + f0 + [0.0] * pad
# f0 = np.pad(f0, (pad, pad), mode="constant", constant_values=0)
# f0 = np.array(f0, dtype=np.float32)
# f01, _, _ = librosa.pyin(
# x,
# fmin=65 if self.mel_fmin == 0 else self.mel_fmin,
# fmax=self.mel_fmax,
# frame_length=self.win_length,
# sr=self.sample_rate,
# fill_na=0.0,
# )
# spec = self.melspectrogram(x)
return f0 return f0
### Audio Processing ### ### Audio Processing ###
@ -783,10 +778,33 @@ class AudioProcessor(object):
""" """
return x / abs(x).max() * 0.95 return x / abs(x).max() * 0.95
@staticmethod
def _rms_norm(wav, db_level=-27):
    """Scale a waveform so that its RMS matches the requested dB level.

    The target RMS amplitude is ``10 ** (db_level / 20)``. The waveform is
    multiplied by a single gain factor chosen so that the RMS of the result
    equals that target.

    Args:
        wav (np.ndarray): Waveform samples.
        db_level (float): Target RMS level in dB. Defaults to -27.

    Returns:
        np.ndarray: The scaled waveform. A waveform with zero energy (silent
        or empty) is returned with unit gain, because no finite gain can
        bring it to the target level.
    """
    target_rms = 10 ** (db_level / 20)
    energy = np.sum(wav ** 2)
    if energy == 0:
        # Dividing by zero energy would give an infinite gain and turn the
        # whole signal into NaNs (0 * inf); keep the silence as it is.
        gain = 1.0
    else:
        gain = np.sqrt((len(wav) * (target_rms ** 2)) / energy)
    return wav * gain
def rms_volume_norm(self, x: np.ndarray, db_level: float = None) -> np.ndarray:
    """Normalize the volume based on RMS of the signal.

    Args:
        x (np.ndarray): Raw waveform.
        db_level (float, optional): Target RMS level in dB, between -99 and 0.
            When None, ``self.db_level`` is used instead. Defaults to None.

    Returns:
        np.ndarray: RMS normalized waveform.

    Raises:
        TypeError: If neither ``db_level`` nor ``self.db_level`` is set.
        AssertionError: If the resolved level is outside the range -99 to 0.
    """
    if db_level is None:
        db_level = self.db_level
    if db_level is None:
        # Without this check the range comparison below fails with an opaque
        # "'<=' not supported" TypeError; keep the type, improve the message.
        raise TypeError(" [!] db_level is not set. Pass db_level or set it on the audio processor.")
    if not -99 <= db_level <= 0:
        # Raised explicitly (instead of a bare assert) so the check is still
        # enforced when Python runs with -O.
        raise AssertionError(" [!] db_level should be between -99 and 0")
    return self._rms_norm(x, db_level)
### save and load ### ### save and load ###
def load_wav(self, filename: str, sr: int = None) -> np.ndarray: def load_wav(self, filename: str, sr: int = None) -> np.ndarray:
"""Read a wav file using Librosa and optionally resample, silence trim, volume normalize. """Read a wav file using Librosa and optionally resample, silence trim, volume normalize.
Resampling slows down loading the file significantly. Therefore it is recommended to resample the file before.
Args: Args:
filename (str): Path to the wav file. filename (str): Path to the wav file.
sr (int, optional): Sampling rate for resampling. Defaults to None. sr (int, optional): Sampling rate for resampling. Defaults to None.
@ -795,8 +813,10 @@ class AudioProcessor(object):
np.ndarray: Loaded waveform. np.ndarray: Loaded waveform.
""" """
if self.resample: if self.resample:
# loading with resampling. It is significantly slower.
x, sr = librosa.load(filename, sr=self.sample_rate) x, sr = librosa.load(filename, sr=self.sample_rate)
elif sr is None: elif sr is None:
# SF is faster than librosa for loading files
x, sr = sf.read(filename) x, sr = sf.read(filename)
assert self.sample_rate == sr, "%s vs %s" % (self.sample_rate, sr) assert self.sample_rate == sr, "%s vs %s" % (self.sample_rate, sr)
else: else:
@ -808,6 +828,8 @@ class AudioProcessor(object):
print(f" [!] File cannot be trimmed for silence - {filename}") print(f" [!] File cannot be trimmed for silence - {filename}")
if self.do_sound_norm: if self.do_sound_norm:
x = self.sound_norm(x) x = self.sound_norm(x)
if self.do_rms_norm:
x = self.rms_volume_norm(x, self.db_level)
return x return x
def save_wav(self, wav: np.ndarray, path: str, sr: int = None) -> None: def save_wav(self, wav: np.ndarray, path: str, sr: int = None) -> None: