refactor(audio): remove duplicate save_wav code

2024-11-18 00:54:26 +01:00 · 2024-11-18 00:54:26 +01:00 · 8ba3233ec6
parent 5784f6705a
commit 8ba3233ec6
2 changed files with 29 additions and 18 deletions
--- a/TTS/utils/audio/numpy_transforms.py
+++ b/TTS/utils/audio/numpy_transforms.py
@ -431,7 +431,16 @@ def load_wav(*, filename: str, sample_rate: Optional[int] = None, resample: bool
    return x
-def save_wav(*, wav: np.ndarray, path: str, sample_rate: int, pipe_out=None, **kwargs) -> None:
+def save_wav(
    *,
    wav: np.ndarray,
    path: str,
    sample_rate: int,
    pipe_out=None,
    do_rms_norm: bool = False,
    db_level: float = -27.0,
    **kwargs,
 ) -> None:
    """Save float waveform to a file using Scipy.
    Args:
@ -439,8 +448,16 @@ def save_wav(*, wav: np.ndarray, path: str, sample_rate: int, pipe_out=None, **k
        path (str): Path to an output file.
        sample_rate (int): Sampling rate used for saving to the file.
        pipe_out (BytesIO, optional): Flag to write the generated TTS wav file to stdout for shell piping.
        do_rms_norm (bool): Whether to apply RMS normalization
        db_level (float): Target dB level in RMS.
    """
-    wav_norm = wav * (32767 / max(0.01, np.max(np.abs(wav))))
+    if do_rms_norm:
        if db_level is None:
            msg = "`db_level` cannot be None with `do_rms_norm=True`"
            raise ValueError(msg)
        wav_norm = rms_volume_norm(x=wav, db_level=db_level)
    else:
        wav_norm = wav * (32767 / max(0.01, np.max(np.abs(wav))))
    wav_norm = wav_norm.astype(np.int16)
    if pipe_out:
--- a/TTS/utils/audio/processor.py
+++ b/TTS/utils/audio/processor.py
@ -1,11 +1,8 @@
 import logging
 from io import BytesIO
 from typing import Optional
 import librosa
 import numpy as np
 import scipy.io.wavfile
 import scipy.signal
 from TTS.tts.utils.helpers import StandardScaler
 from TTS.utils.audio.numpy_transforms import (
@ -21,6 +18,7 @@ from TTS.utils.audio.numpy_transforms import (
    millisec_to_length,
    preemphasis,
    rms_volume_norm,
    save_wav,
    spec_to_mel,
    stft,
    trim_silence,
@ -590,7 +588,7 @@ class AudioProcessor:
            x = self.rms_volume_norm(x, self.db_level)
        return x
-    def save_wav(self, wav: np.ndarray, path: str, sr: int = None, pipe_out=None) -> None:
+    def save_wav(self, wav: np.ndarray, path: str, sr: Optional[int] = None, pipe_out=None) -> None:
        """Save a waveform to a file using Scipy.
        Args:
@ -599,18 +597,14 @@ class AudioProcessor:
            sr (int, optional): Sampling rate used for saving to the file. Defaults to None.
            pipe_out (BytesIO, optional): Flag to write the generated TTS wav file to stdout for shell piping.
        """
-        if self.do_rms_norm:
+        save_wav(
-            wav_norm = self.rms_volume_norm(wav, self.db_level) * 32767
+            wav=wav,
-        else:
+            path=path,
-            wav_norm = wav * (32767 / max(0.01, np.max(np.abs(wav))))
+            sample_rate=sr if sr else self.sample_rate,
-
+            pipe_out=pipe_out,
-        wav_norm = wav_norm.astype(np.int16)
+            do_rms_norm=self.do_rms_norm,
-        if pipe_out:
+            db_level=self.db_level,
-            wav_buffer = BytesIO()
+        )
            scipy.io.wavfile.write(wav_buffer, sr if sr else self.sample_rate, wav_norm)
            wav_buffer.seek(0)
            pipe_out.buffer.write(wav_buffer.read())
        scipy.io.wavfile.write(path, sr if sr else self.sample_rate, wav_norm)
    def get_duration(self, filename: str) -> float:
        """Get the duration of a wav file using Librosa.