mirror of https://github.com/coqui-ai/TTS.git
Implement RMS volume normalization
This commit is contained in:
parent
56378b12f7
commit
633dcc9c56
|
@ -60,6 +60,12 @@ class BaseAudioConfig(Coqpit):
|
||||||
trim_db (int):
|
trim_db (int):
|
||||||
Silence threshold used for silence trimming. Defaults to 45.
|
Silence threshold used for silence trimming. Defaults to 45.
|
||||||
|
|
||||||
|
do_rms_norm (bool, optional):
|
||||||
|
enable/disable RMS volume normalization when loading an audio file. Defaults to False.
|
||||||
|
|
||||||
|
db_level (float, optional):
|
||||||
|
dB level used for rms normalization. The range is -99 to 0. Defaults to None.
|
||||||
|
|
||||||
power (float):
|
power (float):
|
||||||
Exponent used for expanding spectrogram levels before running Griffin Lim. It helps to reduce the
|
Exponent used for expanding spectrogram levels before running Griffin Lim. It helps to reduce the
|
||||||
artifacts in the synthesized voice. Defaults to 1.5.
|
artifacts in the synthesized voice. Defaults to 1.5.
|
||||||
|
@ -116,6 +122,9 @@ class BaseAudioConfig(Coqpit):
|
||||||
# silence trimming
|
# silence trimming
|
||||||
do_trim_silence: bool = True
|
do_trim_silence: bool = True
|
||||||
trim_db: int = 45
|
trim_db: int = 45
|
||||||
|
# rms volume normalization
|
||||||
|
do_rms_norm: bool = False
|
||||||
|
db_level: float = None
|
||||||
# griffin-lim params
|
# griffin-lim params
|
||||||
power: float = 1.5
|
power: float = 1.5
|
||||||
griffin_lim_iters: int = 60
|
griffin_lim_iters: int = 60
|
||||||
|
|
|
@ -266,6 +266,12 @@ class AudioProcessor(object):
|
||||||
do_amp_to_db_mel (bool, optional):
|
do_amp_to_db_mel (bool, optional):
|
||||||
enable/disable amplitude to dB conversion of mel spectrograms. Defaults to True.
|
enable/disable amplitude to dB conversion of mel spectrograms. Defaults to True.
|
||||||
|
|
||||||
|
do_rms_norm (bool, optional):
|
||||||
|
enable/disable RMS volume normalization when loading an audio file. Defaults to False.
|
||||||
|
|
||||||
|
db_level (float, optional):
|
||||||
|
dB level used for rms normalization. The range is -99 to 0. Defaults to None.
|
||||||
|
|
||||||
stats_path (str, optional):
|
stats_path (str, optional):
|
||||||
Path to the computed stats file. Defaults to None.
|
Path to the computed stats file. Defaults to None.
|
||||||
|
|
||||||
|
@ -303,6 +309,8 @@ class AudioProcessor(object):
|
||||||
do_sound_norm=False,
|
do_sound_norm=False,
|
||||||
do_amp_to_db_linear=True,
|
do_amp_to_db_linear=True,
|
||||||
do_amp_to_db_mel=True,
|
do_amp_to_db_mel=True,
|
||||||
|
do_rms_norm=False,
|
||||||
|
db_level=None,
|
||||||
stats_path=None,
|
stats_path=None,
|
||||||
verbose=True,
|
verbose=True,
|
||||||
**_,
|
**_,
|
||||||
|
@ -334,6 +342,8 @@ class AudioProcessor(object):
|
||||||
self.do_sound_norm = do_sound_norm
|
self.do_sound_norm = do_sound_norm
|
||||||
self.do_amp_to_db_linear = do_amp_to_db_linear
|
self.do_amp_to_db_linear = do_amp_to_db_linear
|
||||||
self.do_amp_to_db_mel = do_amp_to_db_mel
|
self.do_amp_to_db_mel = do_amp_to_db_mel
|
||||||
|
self.do_rms_norm = do_rms_norm
|
||||||
|
self.db_level = db_level
|
||||||
self.stats_path = stats_path
|
self.stats_path = stats_path
|
||||||
# setup exp_func for db to amp conversion
|
# setup exp_func for db to amp conversion
|
||||||
if log_func == "np.log":
|
if log_func == "np.log":
|
||||||
|
@ -726,21 +736,6 @@ class AudioProcessor(object):
|
||||||
frame_period=1000 * self.hop_length / self.sample_rate,
|
frame_period=1000 * self.hop_length / self.sample_rate,
|
||||||
)
|
)
|
||||||
f0 = pw.stonemask(x.astype(np.double), f0, t, self.sample_rate)
|
f0 = pw.stonemask(x.astype(np.double), f0, t, self.sample_rate)
|
||||||
# pad = int((self.win_length / self.hop_length) / 2)
|
|
||||||
# f0 = [0.0] * pad + f0 + [0.0] * pad
|
|
||||||
# f0 = np.pad(f0, (pad, pad), mode="constant", constant_values=0)
|
|
||||||
# f0 = np.array(f0, dtype=np.float32)
|
|
||||||
|
|
||||||
# f01, _, _ = librosa.pyin(
|
|
||||||
# x,
|
|
||||||
# fmin=65 if self.mel_fmin == 0 else self.mel_fmin,
|
|
||||||
# fmax=self.mel_fmax,
|
|
||||||
# frame_length=self.win_length,
|
|
||||||
# sr=self.sample_rate,
|
|
||||||
# fill_na=0.0,
|
|
||||||
# )
|
|
||||||
|
|
||||||
# spec = self.melspectrogram(x)
|
|
||||||
return f0
|
return f0
|
||||||
|
|
||||||
### Audio Processing ###
|
### Audio Processing ###
|
||||||
|
@ -783,10 +778,33 @@ class AudioProcessor(object):
|
||||||
"""
|
"""
|
||||||
return x / abs(x).max() * 0.95
|
return x / abs(x).max() * 0.95
|
||||||
|
|
||||||
|
@staticmethod
def _rms_norm(wav, db_level=-27):
    """Scale a waveform so that its RMS matches a target level given in dB.

    Args:
        wav (np.ndarray): Waveform samples.
        db_level (float): Target RMS level in dB; converted to a linear
            amplitude with ``10 ** (db_level / 20)``. Defaults to -27.

    Returns:
        np.ndarray: A new array holding the scaled waveform. A waveform with
            zero energy (all-zero or empty) is returned as an unchanged copy.
    """
    energy = np.sum(wav ** 2)
    if energy == 0:
        # No finite gain can lift a silent signal to the target RMS; dividing by
        # zero here would turn every sample into NaN, so leave the samples as is.
        return wav.copy()
    target_rms = 10 ** (db_level / 20)
    # Gain that maps the current RMS, sqrt(energy / len(wav)), onto target_rms.
    gain = np.sqrt((len(wav) * (target_rms ** 2)) / energy)
    return wav * gain
|
||||||
|
|
||||||
|
def rms_volume_norm(self, x: np.ndarray, db_level: float = None) -> np.ndarray:
    """Normalize the volume based on RMS of the signal.

    Args:
        x (np.ndarray): Raw waveform.
        db_level (float, optional): Target RMS level in dB, between -99 and 0.
            When None, ``self.db_level`` is used. Defaults to None.

    Returns:
        np.ndarray: RMS normalized waveform.

    Raises:
        ValueError: If no dB level is available (both the argument and
            ``self.db_level`` are None) or the level is outside [-99, 0].
    """
    if db_level is None:
        db_level = self.db_level
    if db_level is None:
        # Without this guard the range comparison below fails with an opaque
        # TypeError when RMS normalization is enabled but no level was configured.
        raise ValueError(" [!] db_level must be set to use RMS volume normalization")
    # Explicit raise instead of ``assert`` so the check survives ``python -O``.
    if not -99 <= db_level <= 0:
        raise ValueError(" [!] db_level should be between -99 and 0")
    return self._rms_norm(x, db_level)
|
||||||
|
|
||||||
### save and load ###
|
### save and load ###
|
||||||
def load_wav(self, filename: str, sr: int = None) -> np.ndarray:
|
def load_wav(self, filename: str, sr: int = None) -> np.ndarray:
|
||||||
"""Read a wav file using Librosa and optionally resample, silence trim, volume normalize.
|
"""Read a wav file using Librosa and optionally resample, silence trim, volume normalize.
|
||||||
|
|
||||||
|
Resampling slows down loading the file significantly. Therefore it is recommended to resample the file before.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
filename (str): Path to the wav file.
|
filename (str): Path to the wav file.
|
||||||
sr (int, optional): Sampling rate for resampling. Defaults to None.
|
sr (int, optional): Sampling rate for resampling. Defaults to None.
|
||||||
|
@ -795,8 +813,10 @@ class AudioProcessor(object):
|
||||||
np.ndarray: Loaded waveform.
|
np.ndarray: Loaded waveform.
|
||||||
"""
|
"""
|
||||||
if self.resample:
|
if self.resample:
|
||||||
|
# loading with resampling. It is significantly slower.
|
||||||
x, sr = librosa.load(filename, sr=self.sample_rate)
|
x, sr = librosa.load(filename, sr=self.sample_rate)
|
||||||
elif sr is None:
|
elif sr is None:
|
||||||
|
# SF is faster than librosa for loading files
|
||||||
x, sr = sf.read(filename)
|
x, sr = sf.read(filename)
|
||||||
assert self.sample_rate == sr, "%s vs %s" % (self.sample_rate, sr)
|
assert self.sample_rate == sr, "%s vs %s" % (self.sample_rate, sr)
|
||||||
else:
|
else:
|
||||||
|
@ -808,6 +828,8 @@ class AudioProcessor(object):
|
||||||
print(f" [!] File cannot be trimmed for silence - {filename}")
|
print(f" [!] File cannot be trimmed for silence - {filename}")
|
||||||
if self.do_sound_norm:
|
if self.do_sound_norm:
|
||||||
x = self.sound_norm(x)
|
x = self.sound_norm(x)
|
||||||
|
if self.do_rms_norm:
|
||||||
|
x = self.rms_volume_norm(x, self.db_level)
|
||||||
return x
|
return x
|
||||||
|
|
||||||
def save_wav(self, wav: np.ndarray, path: str, sr: int = None) -> None:
|
def save_wav(self, wav: np.ndarray, path: str, sr: int = None) -> None:
|
||||||
|
|
Loading…
Reference in New Issue