mirror of https://github.com/coqui-ai/TTS.git
Implement RMS volume normalization
parent 56378b12f7
commit 633dcc9c56
@@ -60,6 +60,12 @@ class BaseAudioConfig(Coqpit):
         trim_db (int):
             Silence threshold used for silence trimming. Defaults to 45.

+        do_rms_norm (bool, optional):
+            enable/disable RMS volume normalization when loading an audio file. Defaults to False.
+
+        db_level (int, optional):
+            dB level used for rms normalization. The range is -99 to 0. Defaults to None.
+
         power (float):
             Exponent used for expanding spectrogram levels before running Griffin Lim. It helps to reduce the
             artifacts in the synthesized voice. Defaults to 1.5.
@@ -116,6 +122,9 @@ class BaseAudioConfig(Coqpit):
     # silence trimming
     do_trim_silence: bool = True
     trim_db: int = 45
+    # rms volume normalization
+    do_rms_norm: bool = False
+    db_level: float = None
     # griffin-lim params
     power: float = 1.5
     griffin_lim_iters: int = 60
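For reference, a minimal sketch (not part of this diff) of setting the new fields from user code; the import path TTS.config.shared_configs is an assumption about where BaseAudioConfig lives, since the file name is not shown above.

from TTS.config.shared_configs import BaseAudioConfig  # assumed location of BaseAudioConfig

# Hypothetical config enabling the new RMS normalization options.
audio_config = BaseAudioConfig(
    do_trim_silence=True,
    trim_db=45,
    do_rms_norm=True,  # normalize volume by RMS when loading audio files
    db_level=-27,      # target level in dB; expected range is -99 to 0
)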
@@ -266,6 +266,12 @@ class AudioProcessor(object):
         do_amp_to_db_mel (bool, optional):
             enable/disable amplitude to dB conversion of mel spectrograms. Defaults to True.

+        do_rms_norm (bool, optional):
+            enable/disable RMS volume normalization when loading an audio file. Defaults to False.
+
+        db_level (int, optional):
+            dB level used for rms normalization. The range is -99 to 0. Defaults to None.
+
         stats_path (str, optional):
             Path to the computed stats file. Defaults to None.
@@ -303,6 +309,8 @@ class AudioProcessor(object):
         do_sound_norm=False,
         do_amp_to_db_linear=True,
         do_amp_to_db_mel=True,
+        do_rms_norm=False,
+        db_level=None,
         stats_path=None,
         verbose=True,
         **_,
@@ -334,6 +342,8 @@ class AudioProcessor(object):
         self.do_sound_norm = do_sound_norm
         self.do_amp_to_db_linear = do_amp_to_db_linear
         self.do_amp_to_db_mel = do_amp_to_db_mel
+        self.do_rms_norm = do_rms_norm
+        self.db_level = db_level
         self.stats_path = stats_path
         # setup exp_func for db to amp conversion
         if log_func == "np.log":
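A usage sketch of the wiring above (again, not part of the commit); TTS.utils.audio is assumed to be the module that defines AudioProcessor.

from TTS.utils.audio import AudioProcessor  # assumed location of AudioProcessor

# Hypothetical instantiation passing the new keyword arguments through __init__.
ap = AudioProcessor(
    sample_rate=22050,
    do_rms_norm=True,  # load_wav() will then apply rms_volume_norm()
    db_level=-27,
)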
@@ -726,21 +736,6 @@ class AudioProcessor(object):
             frame_period=1000 * self.hop_length / self.sample_rate,
         )
         f0 = pw.stonemask(x.astype(np.double), f0, t, self.sample_rate)
-        # pad = int((self.win_length / self.hop_length) / 2)
-        # f0 = [0.0] * pad + f0 + [0.0] * pad
-        # f0 = np.pad(f0, (pad, pad), mode="constant", constant_values=0)
-        # f0 = np.array(f0, dtype=np.float32)
-
-        # f01, _, _ = librosa.pyin(
-        #     x,
-        #     fmin=65 if self.mel_fmin == 0 else self.mel_fmin,
-        #     fmax=self.mel_fmax,
-        #     frame_length=self.win_length,
-        #     sr=self.sample_rate,
-        #     fill_na=0.0,
-        # )
-
-        # spec = self.melspectrogram(x)
         return f0

     ### Audio Processing ###
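For context on the kept lines: the frame_period argument and the pw.stonemask refinement suggest a pyworld DIO-based pitch extractor. A self-contained sketch under that assumption (the full method body is not visible in this hunk, so compute_f0_sketch is illustrative only):

import numpy as np
import pyworld as pw


def compute_f0_sketch(x: np.ndarray, sample_rate: int, hop_length: int) -> np.ndarray:
    # Coarse F0 estimate with DIO, one frame per hop.
    f0, t = pw.dio(
        x.astype(np.double),
        sample_rate,
        frame_period=1000 * hop_length / sample_rate,
    )
    # Refine the DIO estimate with StoneMask, matching the kept line above.
    return pw.stonemask(x.astype(np.double), f0, t, sample_rate)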
@@ -783,10 +778,33 @@ class AudioProcessor(object):
         """
         return x / abs(x).max() * 0.95

+    @staticmethod
+    def _rms_norm(wav, db_level=-27):
+        r = 10 ** (db_level / 20)
+        a = np.sqrt((len(wav) * (r ** 2)) / np.sum(wav ** 2))
+        return wav * a
+
+    def rms_volume_norm(self, x: np.ndarray, db_level: float = None) -> np.ndarray:
+        """Normalize the volume based on RMS of the signal.
+
+        Args:
+            x (np.ndarray): Raw waveform.
+
+        Returns:
+            np.ndarray: RMS normalized waveform.
+        """
+        if db_level is None:
+            db_level = self.db_level
+        assert -99 <= db_level <= 0, " [!] db_level should be between -99 and 0"
+        wav = self._rms_norm(x, db_level)
+        return wav
+
     ### save and load ###
     def load_wav(self, filename: str, sr: int = None) -> np.ndarray:
         """Read a wav file using Librosa and optionally resample, silence trim, volume normalize.

         Resampling slows down loading the file significantly. Therefore it is recommended to resample the file before.

         Args:
             filename (str): Path to the wav file.
             sr (int, optional): Sampling rate for resampling. Defaults to None.
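The arithmetic behind _rms_norm: r = 10 ** (db_level / 20) is the target RMS amplitude, and scaling by a = sqrt(len(wav) * r**2 / sum(wav**2)) makes the RMS of the result exactly r. A standalone check on synthetic data (illustrative only):

import numpy as np

db_level = -27
wav = np.random.uniform(-0.3, 0.3, size=22050).astype(np.float32)

r = 10 ** (db_level / 20)
a = np.sqrt((len(wav) * (r ** 2)) / np.sum(wav ** 2))
normed = wav * a

# RMS of the scaled signal converted back to dB; prints roughly -27.0.
print(20 * np.log10(np.sqrt(np.mean(normed ** 2))))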
@@ -795,8 +813,10 @@ class AudioProcessor(object):
             np.ndarray: Loaded waveform.
         """
         if self.resample:
+            # loading with resampling. It is significantly slower.
             x, sr = librosa.load(filename, sr=self.sample_rate)
         elif sr is None:
+            # SF is faster than librosa for loading files
             x, sr = sf.read(filename)
             assert self.sample_rate == sr, "%s vs %s" % (self.sample_rate, sr)
         else:
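The two comments added above distinguish the load paths: librosa decodes and resamples in one pass, while soundfile only decodes at the file's native rate. A rough illustration, assuming both libraries are installed and "example.wav" is a placeholder path:

import librosa
import soundfile as sf

x_resampled, sr1 = librosa.load("example.wav", sr=22050)  # decode + resample (slower)
x_native, sr2 = sf.read("example.wav")  # decode only, returns the native sample rate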
@@ -808,6 +828,8 @@ class AudioProcessor(object):
                 print(f" [!] File cannot be trimmed for silence - {filename}")
         if self.do_sound_norm:
             x = self.sound_norm(x)
+        if self.do_rms_norm:
+            x = self.rms_volume_norm(x, self.db_level)
         return x

     def save_wav(self, wav: np.ndarray, path: str, sr: int = None) -> None:
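Putting it together, a hedged end-to-end sketch of the new code path, reusing the ap instance from the earlier sketch and a placeholder wav path:

# With do_rms_norm=True, load_wav() trims silence (if enabled) and then
# applies rms_volume_norm() at self.db_level before returning the waveform.
wav = ap.load_wav("example.wav")

# The target level can also be overridden per call; it must stay within [-99, 0].
wav_quiet = ap.rms_volume_norm(wav, db_level=-35)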