diff --git a/TTS/config/shared_configs.py b/TTS/config/shared_configs.py index c52cfe8a..9e9d4692 100644 --- a/TTS/config/shared_configs.py +++ b/TTS/config/shared_configs.py @@ -60,6 +60,12 @@ class BaseAudioConfig(Coqpit): trim_db (int): Silence threshold used for silence trimming. Defaults to 45. + do_rms_norm (bool, optional): + enable/disable RMS volume normalization when loading an audio file. Defaults to False. + + db_level (float, optional): + dB level used for RMS normalization. The range is -99 to 0. Defaults to None. + power (float): Exponent used for expanding spectrogram levels before running Griffin Lim. It helps to reduce the artifacts in the synthesized voice. Defaults to 1.5. @@ -116,6 +122,9 @@ class BaseAudioConfig(Coqpit): # silence trimming do_trim_silence: bool = True trim_db: int = 45 + # rms volume normalization + do_rms_norm: bool = False + db_level: float = None # griffin-lim params power: float = 1.5 griffin_lim_iters: int = 60 diff --git a/TTS/utils/audio.py b/TTS/utils/audio.py index d01196c4..25f93c34 100644 --- a/TTS/utils/audio.py +++ b/TTS/utils/audio.py @@ -266,6 +266,12 @@ class AudioProcessor(object): do_amp_to_db_mel (bool, optional): enable/disable amplitude to dB conversion of mel spectrograms. Defaults to True. + do_rms_norm (bool, optional): + enable/disable RMS volume normalization when loading an audio file. Defaults to False. + + db_level (float, optional): + dB level used for RMS normalization. The range is -99 to 0. Defaults to None. + stats_path (str, optional): Path to the computed stats file. Defaults to None. 
@@ -303,6 +309,8 @@ class AudioProcessor(object):
         do_sound_norm=False,
         do_amp_to_db_linear=True,
         do_amp_to_db_mel=True,
+        do_rms_norm=False,
+        db_level=None,
         stats_path=None,
         verbose=True,
         **_,
@@ -334,6 +342,8 @@ class AudioProcessor(object):
         self.do_sound_norm = do_sound_norm
         self.do_amp_to_db_linear = do_amp_to_db_linear
         self.do_amp_to_db_mel = do_amp_to_db_mel
+        self.do_rms_norm = do_rms_norm
+        self.db_level = db_level
         self.stats_path = stats_path
         # setup exp_func for db to amp conversion
         if log_func == "np.log":
@@ -726,21 +736,6 @@ class AudioProcessor(object):
             frame_period=1000 * self.hop_length / self.sample_rate,
         )
         f0 = pw.stonemask(x.astype(np.double), f0, t, self.sample_rate)
-        # pad = int((self.win_length / self.hop_length) / 2)
-        # f0 = [0.0] * pad + f0 + [0.0] * pad
-        # f0 = np.pad(f0, (pad, pad), mode="constant", constant_values=0)
-        # f0 = np.array(f0, dtype=np.float32)
-
-        # f01, _, _ = librosa.pyin(
-        #     x,
-        #     fmin=65 if self.mel_fmin == 0 else self.mel_fmin,
-        #     fmax=self.mel_fmax,
-        #     frame_length=self.win_length,
-        #     sr=self.sample_rate,
-        #     fill_na=0.0,
-        # )
-
-        # spec = self.melspectrogram(x)
         return f0
 
     ### Audio Processing ###
@@ -783,10 +778,46 @@ class AudioProcessor(object):
         """
         return x / abs(x).max() * 0.95
 
+    @staticmethod
+    def _rms_norm(wav, db_level=-27):
+        """Scale ``wav`` by a single gain so that its RMS equals ``10 ** (db_level / 20)``."""
+        r = 10 ** (db_level / 20)
+        energy = np.sum(wav ** 2)
+        if energy == 0:
+            # Silent or empty input: the gain would be a division by zero and
+            # every sample would turn into NaN/inf, so return it untouched.
+            return wav
+        a = np.sqrt((len(wav) * (r ** 2)) / energy)
+        return wav * a
+
+    def rms_volume_norm(self, x: np.ndarray, db_level: float = None) -> np.ndarray:
+        """Normalize the volume based on RMS of the signal.
+
+        Args:
+            x (np.ndarray): Raw waveform.
+            db_level (float, optional): Target RMS level in dB, between -99 and 0.
+                Defaults to None, in which case ``self.db_level`` is used.
+
+        Returns:
+            np.ndarray: RMS normalized waveform.
+
+        Raises:
+            ValueError: If no dB level is configured, or it is outside [-99, 0].
+        """
+        if db_level is None:
+            db_level = self.db_level
+        if db_level is None:
+            raise ValueError(" [!] db_level must be set to use RMS volume normalization")
+        if not -99 <= db_level <= 0:
+            raise ValueError(" [!] db_level should be between -99 and 0")
+        return self._rms_norm(x, db_level)
+
     ### save and load ###
     def load_wav(self, filename: str, sr: int = None) -> np.ndarray:
         """Read a wav file using Librosa and optionally resample, silence trim, volume normalize.
 
+        Resampling slows down loading the file significantly. Therefore it is recommended to resample the files beforehand.
+
         Args:
             filename (str): Path to the wav file.
             sr (int, optional): Sampling rate for resampling. Defaults to None.
@@ -795,8 +826,10 @@ class AudioProcessor(object):
             np.ndarray: Loaded waveform.
         """
         if self.resample:
+            # Loading with resampling is significantly slower.
             x, sr = librosa.load(filename, sr=self.sample_rate)
         elif sr is None:
+            # sf.read is faster than librosa.load for loading files
             x, sr = sf.read(filename)
             assert self.sample_rate == sr, "%s vs %s" % (self.sample_rate, sr)
         else:
@@ -808,6 +841,8 @@ class AudioProcessor(object):
                 print(f" [!] File cannot be trimmed for silence - {filename}")
         if self.do_sound_norm:
             x = self.sound_norm(x)
+        if self.do_rms_norm:
+            x = self.rms_volume_norm(x, self.db_level)
         return x
 
     def save_wav(self, wav: np.ndarray, path: str, sr: int = None) -> None: