mirror of https://github.com/coqui-ai/TTS.git
Implement RMS volume normalization
parent 56378b12f7
commit 633dcc9c56
@@ -60,6 +60,12 @@ class BaseAudioConfig(Coqpit):
         trim_db (int):
             Silence threshold used for silence trimming. Defaults to 45.

+        do_rms_norm (bool, optional):
+            enable/disable RMS volume normalization when loading an audio file. Defaults to False.
+
+        db_level (int, optional):
+            dB level used for rms normalization. The range is -99 to 0. Defaults to None.
+
         power (float):
             Exponent used for expanding spectrogram levels before running Griffin Lim. It helps to reduce the
             artifacts in the synthesized voice. Defaults to 1.5.
@@ -116,6 +122,9 @@ class BaseAudioConfig(Coqpit):
     # silence trimming
     do_trim_silence: bool = True
     trim_db: int = 45
+    # rms volume normalization
+    do_rms_norm: bool = False
+    db_level: float = None
     # griffin-lim params
     power: float = 1.5
     griffin_lim_iters: int = 60
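For reference, a minimal sketch (not part of this diff) of setting the new fields from user code; the import path TTS.config.shared_configs is an assumption about where BaseAudioConfig lives, since the file name is not shown above.

from TTS.config.shared_configs import BaseAudioConfig  # assumed location of BaseAudioConfig

# Hypothetical config enabling the new RMS normalization options.
audio_config = BaseAudioConfig(
    do_trim_silence=True,
    trim_db=45,
    do_rms_norm=True,  # normalize volume by RMS when loading audio files
    db_level=-27,      # target level in dB; expected range is -99 to 0
)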
@@ -266,6 +266,12 @@ class AudioProcessor(object):
         do_amp_to_db_mel (bool, optional):
             enable/disable amplitude to dB conversion of mel spectrograms. Defaults to True.

+        do_rms_norm (bool, optional):
+            enable/disable RMS volume normalization when loading an audio file. Defaults to False.
+
+        db_level (int, optional):
+            dB level used for rms normalization. The range is -99 to 0. Defaults to None.
+
         stats_path (str, optional):
             Path to the computed stats file. Defaults to None.
@@ -303,6 +309,8 @@ class AudioProcessor(object):
         do_sound_norm=False,
         do_amp_to_db_linear=True,
         do_amp_to_db_mel=True,
+        do_rms_norm=False,
+        db_level=None,
         stats_path=None,
         verbose=True,
         **_,
@@ -334,6 +342,8 @@ class AudioProcessor(object):
         self.do_sound_norm = do_sound_norm
         self.do_amp_to_db_linear = do_amp_to_db_linear
         self.do_amp_to_db_mel = do_amp_to_db_mel
+        self.do_rms_norm = do_rms_norm
+        self.db_level = db_level
         self.stats_path = stats_path
         # setup exp_func for db to amp conversion
         if log_func == "np.log":
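A usage sketch of the wiring above (again, not part of the commit); TTS.utils.audio is assumed to be the module that defines AudioProcessor.

from TTS.utils.audio import AudioProcessor  # assumed location of AudioProcessor

# Hypothetical instantiation passing the new keyword arguments through __init__.
ap = AudioProcessor(
    sample_rate=22050,
    do_rms_norm=True,  # load_wav() will then apply rms_volume_norm()
    db_level=-27,
)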
@@ -726,21 +736,6 @@ class AudioProcessor(object):
             frame_period=1000 * self.hop_length / self.sample_rate,
         )
         f0 = pw.stonemask(x.astype(np.double), f0, t, self.sample_rate)
-        # pad = int((self.win_length / self.hop_length) / 2)
-        # f0 = [0.0] * pad + f0 + [0.0] * pad
-        # f0 = np.pad(f0, (pad, pad), mode="constant", constant_values=0)
-        # f0 = np.array(f0, dtype=np.float32)
-
-        # f01, _, _ = librosa.pyin(
-        #     x,
-        #     fmin=65 if self.mel_fmin == 0 else self.mel_fmin,
-        #     fmax=self.mel_fmax,
-        #     frame_length=self.win_length,
-        #     sr=self.sample_rate,
-        #     fill_na=0.0,
-        # )
-
-        # spec = self.melspectrogram(x)
         return f0

     ### Audio Processing ###
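For context on the kept lines: the frame_period argument and the pw.stonemask refinement suggest a pyworld DIO-based pitch extractor. A self-contained sketch under that assumption (the full method body is not visible in this hunk, so compute_f0_sketch is illustrative only):

import numpy as np
import pyworld as pw


def compute_f0_sketch(x: np.ndarray, sample_rate: int, hop_length: int) -> np.ndarray:
    # Coarse F0 estimate with DIO, one frame per hop.
    f0, t = pw.dio(
        x.astype(np.double),
        sample_rate,
        frame_period=1000 * hop_length / sample_rate,
    )
    # Refine the DIO estimate with StoneMask, matching the kept line above.
    return pw.stonemask(x.astype(np.double), f0, t, sample_rate)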
@@ -783,10 +778,33 @@ class AudioProcessor(object):
         """
         return x / abs(x).max() * 0.95

+    @staticmethod
+    def _rms_norm(wav, db_level=-27):
+        r = 10 ** (db_level / 20)
+        a = np.sqrt((len(wav) * (r ** 2)) / np.sum(wav ** 2))
+        return wav * a
+
+    def rms_volume_norm(self, x: np.ndarray, db_level: float = None) -> np.ndarray:
+        """Normalize the volume based on RMS of the signal.
+
+        Args:
+            x (np.ndarray): Raw waveform.
+
+        Returns:
+            np.ndarray: RMS normalized waveform.
+        """
+        if db_level is None:
+            db_level = self.db_level
+        assert -99 <= db_level <= 0, " [!] db_level should be between -99 and 0"
+        wav = self._rms_norm(x, db_level)
+        return wav
+
     ### save and load ###
     def load_wav(self, filename: str, sr: int = None) -> np.ndarray:
         """Read a wav file using Librosa and optionally resample, silence trim, volume normalize.

         Resampling slows down loading the file significantly. Therefore it is recommended to resample the file before.

         Args:
             filename (str): Path to the wav file.
             sr (int, optional): Sampling rate for resampling. Defaults to None.
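The arithmetic behind _rms_norm: r = 10 ** (db_level / 20) is the target RMS amplitude, and scaling by a = sqrt(len(wav) * r**2 / sum(wav**2)) makes the RMS of the result exactly r. A standalone check on synthetic data (illustrative only):

import numpy as np

db_level = -27
wav = np.random.uniform(-0.3, 0.3, size=22050).astype(np.float32)

r = 10 ** (db_level / 20)
a = np.sqrt((len(wav) * (r ** 2)) / np.sum(wav ** 2))
normed = wav * a

# RMS of the scaled signal converted back to dB; prints roughly -27.0.
print(20 * np.log10(np.sqrt(np.mean(normed ** 2))))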
@@ -795,8 +813,10 @@ class AudioProcessor(object):
             np.ndarray: Loaded waveform.
         """
         if self.resample:
+            # loading with resampling. It is significantly slower.
             x, sr = librosa.load(filename, sr=self.sample_rate)
         elif sr is None:
+            # SF is faster than librosa for loading files
             x, sr = sf.read(filename)
             assert self.sample_rate == sr, "%s vs %s" % (self.sample_rate, sr)
         else:
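The two comments added above distinguish the load paths: librosa decodes and resamples in one pass, while soundfile only decodes at the file's native rate. A rough illustration, assuming both libraries are installed and "example.wav" is a placeholder path:

import librosa
import soundfile as sf

x_resampled, sr1 = librosa.load("example.wav", sr=22050)  # decode + resample (slower)
x_native, sr2 = sf.read("example.wav")  # decode only, returns the native sample rate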
@@ -808,6 +828,8 @@ class AudioProcessor(object):
                 print(f" [!] File cannot be trimmed for silence - {filename}")
         if self.do_sound_norm:
             x = self.sound_norm(x)
+        if self.do_rms_norm:
+            x = self.rms_volume_norm(x, self.db_level)
         return x

     def save_wav(self, wav: np.ndarray, path: str, sr: int = None) -> None:
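Putting it together, a hedged end-to-end sketch of the new code path, reusing the ap instance from the earlier sketch and a placeholder wav path:

# With do_rms_norm=True, load_wav() trims silence (if enabled) and then
# applies rms_volume_norm() at self.db_level before returning the waveform.
wav = ap.load_wav("example.wav")

# The target level can also be overridden per call; it must stay within [-99, 0].
wav_quiet = ap.rms_volume_norm(wav, db_level=-35)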