Merge pull request #166 from idiap/error-messages

Automatically convert audio to mono, add more helpful error messages
Enno Hermann 2024-11-20 11:21:28 +01:00 committed by GitHub
commit 312593e119
4 changed files with 132 additions and 132 deletions

View File

@@ -166,6 +166,11 @@ def load_attention_mask_meta_data(metafile_path):
 def _get_formatter_by_name(name):
     """Returns the respective preprocessing function."""
     thismodule = sys.modules[__name__]
+    if not hasattr(thismodule, name.lower()):
+        msg = (
+            f"{name} formatter not found. If it is a custom formatter, pass the function to load_tts_samples() instead."
+        )
+        raise ValueError(msg)
     return getattr(thismodule, name.lower())
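
The new message points users with custom datasets at `load_tts_samples()`. A minimal sketch of that route, assuming the usual formatter contract (a callable taking `root_path` and `meta_file` and returning sample dicts); the metadata layout, field names, and paths below are illustrative:

import os

from TTS.config.shared_configs import BaseDatasetConfig
from TTS.tts.datasets import load_tts_samples


def my_formatter(root_path, meta_file, **kwargs):
    """Hypothetical formatter: one '<wav path>|<text>' pair per line."""
    samples = []
    with open(os.path.join(root_path, meta_file), encoding="utf-8") as f:
        for line in f:
            wav_file, text = line.strip().split("|")
            samples.append({
                "text": text,
                "audio_file": os.path.join(root_path, wav_file),
                "speaker_name": "speaker0",
                "root_path": root_path,
            })
    return samples


dataset_config = BaseDatasetConfig(meta_file_train="metadata.csv", path="/data/my_dataset/")
# Pass the function itself instead of a built-in formatter name.
train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True, formatter=my_formatter)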

View File

@@ -779,6 +779,12 @@ class Xtts(BaseTTS):
         if os.path.exists(vocab_path):
             self.tokenizer = VoiceBpeTokenizer(vocab_file=vocab_path)
+        else:
+            msg = (
+                f"`vocab.json` file not found in `{checkpoint_dir}`. Move the file there or "
+                "specify alternative path in `model_args.tokenizer_file` in `config.json`"
+            )
+            raise FileNotFoundError(msg)
         self.init_models()
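
The new error spells out where the tokenizer path comes from. A hypothetical sketch of pointing at a relocated `vocab.json` via the config instead of moving the file (attribute names follow the error message and the usual XTTS loading pattern; treat them as assumptions):

from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts

config = XttsConfig()
config.load_json("/path/to/config.json")
# Point model_args.tokenizer_file at the relocated vocab.json instead of moving the file.
config.model_args.tokenizer_file = "/path/to/vocab.json"

model = Xtts.init_from_config(config)
model.load_checkpoint(config, checkpoint_dir="/path/to/checkpoint_dir", eval=True)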

View File

@@ -1,6 +1,6 @@
 import logging
 from io import BytesIO
-from typing import Tuple
+from typing import Optional

 import librosa
 import numpy as np
@@ -16,11 +16,11 @@ logger = logging.getLogger(__name__)
 def build_mel_basis(
     *,
-    sample_rate: int = None,
-    fft_size: int = None,
-    num_mels: int = None,
-    mel_fmax: int = None,
-    mel_fmin: int = None,
+    sample_rate: int,
+    fft_size: int,
+    num_mels: int,
+    mel_fmin: int,
+    mel_fmax: Optional[int] = None,
     **kwargs,
 ) -> np.ndarray:
     """Build melspectrogram basis.
@@ -34,9 +34,7 @@ def build_mel_basis(
     return librosa.filters.mel(sr=sample_rate, n_fft=fft_size, n_mels=num_mels, fmin=mel_fmin, fmax=mel_fmax)

-def millisec_to_length(
-    *, frame_length_ms: int = None, frame_shift_ms: int = None, sample_rate: int = None, **kwargs
-) -> Tuple[int, int]:
+def millisec_to_length(*, frame_length_ms: float, frame_shift_ms: float, sample_rate: int, **kwargs) -> tuple[int, int]:
     """Compute hop and window length from milliseconds.

     Returns:
@@ -61,7 +59,7 @@ def _exp(x, base):
     return np.exp(x)

-def amp_to_db(*, x: np.ndarray = None, gain: float = 1, base: int = 10, **kwargs) -> np.ndarray:
+def amp_to_db(*, x: np.ndarray, gain: float = 1, base: int = 10, **kwargs) -> np.ndarray:
     """Convert amplitude values to decibels.

     Args:
@@ -77,7 +75,7 @@ def amp_to_db(*, x: np.ndarray = None, gain: float = 1, base: int = 10, **kwargs
 # pylint: disable=no-self-use
-def db_to_amp(*, x: np.ndarray = None, gain: float = 1, base: int = 10, **kwargs) -> np.ndarray:
+def db_to_amp(*, x: np.ndarray, gain: float = 1, base: float = 10, **kwargs) -> np.ndarray:
     """Convert decibels spectrogram to amplitude spectrogram.

     Args:
@@ -104,18 +102,20 @@ def preemphasis(*, x: np.ndarray, coef: float = 0.97, **kwargs) -> np.ndarray:
         np.ndarray: Decorrelated audio signal.
     """
     if coef == 0:
-        raise RuntimeError(" [!] Preemphasis is set 0.0.")
+        msg = " [!] Preemphasis is set 0.0."
+        raise RuntimeError(msg)
     return scipy.signal.lfilter([1, -coef], [1], x)

-def deemphasis(*, x: np.ndarray = None, coef: float = 0.97, **kwargs) -> np.ndarray:
+def deemphasis(*, x: np.ndarray, coef: float = 0.97, **kwargs) -> np.ndarray:
     """Reverse pre-emphasis."""
     if coef == 0:
-        raise RuntimeError(" [!] Preemphasis is set 0.0.")
+        msg = " [!] Preemphasis is set 0.0."
+        raise ValueError(msg)
     return scipy.signal.lfilter([1], [1, -coef], x)

-def spec_to_mel(*, spec: np.ndarray, mel_basis: np.ndarray = None, **kwargs) -> np.ndarray:
+def spec_to_mel(*, spec: np.ndarray, mel_basis: np.ndarray, **kwargs) -> np.ndarray:
     """Convert a full scale linear spectrogram output of a network to a melspectrogram.

     Args:
@@ -130,14 +130,14 @@ def spec_to_mel(*, spec: np.ndarray, mel_basis: np.ndarray = None, **kwargs) ->
     return np.dot(mel_basis, spec)

-def mel_to_spec(*, mel: np.ndarray = None, mel_basis: np.ndarray = None, **kwargs) -> np.ndarray:
+def mel_to_spec(*, mel: np.ndarray, mel_basis: np.ndarray, **kwargs) -> np.ndarray:
     """Convert a melspectrogram to full scale spectrogram."""
     assert (mel < 0).sum() == 0, " [!] Input values must be non-negative."
     inv_mel_basis = np.linalg.pinv(mel_basis)
     return np.maximum(1e-10, np.dot(inv_mel_basis, mel))

-def wav_to_spec(*, wav: np.ndarray = None, **kwargs) -> np.ndarray:
+def wav_to_spec(*, wav: np.ndarray, **kwargs) -> np.ndarray:
     """Compute a spectrogram from a waveform.

     Args:
@@ -151,7 +151,7 @@ def wav_to_spec(*, wav: np.ndarray = None, **kwargs) -> np.ndarray:
     return S.astype(np.float32)

-def wav_to_mel(*, wav: np.ndarray = None, mel_basis=None, **kwargs) -> np.ndarray:
+def wav_to_mel(*, wav: np.ndarray, mel_basis: np.ndarray, **kwargs) -> np.ndarray:
     """Compute a melspectrogram from a waveform."""
     D = stft(y=wav, **kwargs)
     S = spec_to_mel(spec=np.abs(D), mel_basis=mel_basis, **kwargs)
@@ -164,20 +164,20 @@ def spec_to_wav(*, spec: np.ndarray, power: float = 1.5, **kwargs) -> np.ndarray
     return griffin_lim(spec=S**power, **kwargs)

-def mel_to_wav(*, mel: np.ndarray = None, power: float = 1.5, **kwargs) -> np.ndarray:
+def mel_to_wav(*, mel: np.ndarray, mel_basis: np.ndarray, power: float = 1.5, **kwargs) -> np.ndarray:
     """Convert a melspectrogram to a waveform using Griffi-Lim vocoder."""
     S = mel.copy()
-    S = mel_to_spec(mel=S, mel_basis=kwargs["mel_basis"])  # Convert back to linear
+    S = mel_to_spec(mel=S, mel_basis=mel_basis)  # Convert back to linear
     return griffin_lim(spec=S**power, **kwargs)

 ### STFT and ISTFT ###
 def stft(
     *,
-    y: np.ndarray = None,
-    fft_size: int = None,
-    hop_length: int = None,
-    win_length: int = None,
+    y: np.ndarray,
+    fft_size: int,
+    hop_length: Optional[int] = None,
+    win_length: Optional[int] = None,
     pad_mode: str = "reflect",
     window: str = "hann",
     center: bool = True,
@@ -203,9 +203,9 @@ def stft(
 def istft(
     *,
-    y: np.ndarray = None,
-    hop_length: int = None,
-    win_length: int = None,
+    y: np.ndarray,
+    hop_length: Optional[int] = None,
+    win_length: Optional[int] = None,
     window: str = "hann",
     center: bool = True,
     **kwargs,
@@ -220,7 +220,7 @@ def istft(
     return librosa.istft(y, hop_length=hop_length, win_length=win_length, center=center, window=window)

-def griffin_lim(*, spec: np.ndarray = None, num_iter=60, **kwargs) -> np.ndarray:
+def griffin_lim(*, spec: np.ndarray, num_iter=60, **kwargs) -> np.ndarray:
     angles = np.exp(2j * np.pi * np.random.rand(*spec.shape))
     S_complex = np.abs(spec).astype(complex)
     y = istft(y=S_complex * angles, **kwargs)
@@ -233,11 +233,11 @@ def griffin_lim(*, spec: np.ndarray = None, num_iter=60, **kwargs) -> np.ndarray
     return y

-def compute_stft_paddings(
-    *, x: np.ndarray = None, hop_length: int = None, pad_two_sides: bool = False, **kwargs
-) -> Tuple[int, int]:
-    """Compute paddings used by Librosa's STFT. Compute right padding (final frame) or both sides padding
-    (first and final frames)"""
+def compute_stft_paddings(*, x: np.ndarray, hop_length: int, pad_two_sides: bool = False, **kwargs) -> tuple[int, int]:
+    """Compute paddings used by Librosa's STFT.
+
+    Compute right padding (final frame) or both sides padding (first and final frames).
+    """
     pad = (x.shape[0] // hop_length + 1) * hop_length - x.shape[0]
     if not pad_two_sides:
         return 0, pad
@@ -246,12 +246,12 @@ def compute_stft_paddings(
 def compute_f0(
     *,
-    x: np.ndarray = None,
-    pitch_fmax: float = None,
-    pitch_fmin: float = None,
-    hop_length: int = None,
-    win_length: int = None,
-    sample_rate: int = None,
+    x: np.ndarray,
+    pitch_fmax: Optional[float] = None,
+    pitch_fmin: Optional[float] = None,
+    hop_length: int,
+    win_length: int,
+    sample_rate: int,
     stft_pad_mode: str = "reflect",
     center: bool = True,
     **kwargs,
@@ -323,19 +323,18 @@ def compute_energy(y: np.ndarray, **kwargs) -> np.ndarray:
     """
     x = stft(y=y, **kwargs)
     mag, _ = magphase(x)
-    energy = np.sqrt(np.sum(mag**2, axis=0))
-    return energy
+    return np.sqrt(np.sum(mag**2, axis=0))

 ### Audio Processing ###
 def find_endpoint(
     *,
-    wav: np.ndarray = None,
+    wav: np.ndarray,
     trim_db: float = -40,
-    sample_rate: int = None,
-    min_silence_sec=0.8,
-    gain: float = None,
-    base: int = None,
+    sample_rate: int,
+    min_silence_sec: float = 0.8,
+    gain: float = 1,
+    base: float = 10,
     **kwargs,
 ) -> int:
     """Find the last point without silence at the end of a audio signal.
@@ -344,8 +343,8 @@ def find_endpoint(
         wav (np.ndarray): Audio signal.
         threshold_db (int, optional): Silence threshold in decibels. Defaults to -40.
         min_silence_sec (float, optional): Ignore silences that are shorter then this in secs. Defaults to 0.8.
-        gian (float, optional): Gain to be used to convert trim_db to trim_amp. Defaults to None.
-        base (int, optional): Base of the logarithm used to convert trim_db to trim_amp. Defaults to 10.
+        gain (float, optional): Gain factor to be used to convert trim_db to trim_amp. Defaults to 1.
+        base (float, optional): Base of the logarithm used to convert trim_db to trim_amp. Defaults to 10.

     Returns:
         int: Last point without silence.
@@ -361,20 +360,20 @@ def find_endpoint(
 def trim_silence(
     *,
-    wav: np.ndarray = None,
-    sample_rate: int = None,
-    trim_db: float = None,
-    win_length: int = None,
-    hop_length: int = None,
+    wav: np.ndarray,
+    sample_rate: int,
+    trim_db: float = 60,
+    win_length: int,
+    hop_length: int,
     **kwargs,
 ) -> np.ndarray:
-    """Trim silent parts with a threshold and 0.01 sec margin"""
+    """Trim silent parts with a threshold and 0.01 sec margin."""
     margin = int(sample_rate * 0.01)
     wav = wav[margin:-margin]
     return librosa.effects.trim(wav, top_db=trim_db, frame_length=win_length, hop_length=hop_length)[0]

-def volume_norm(*, x: np.ndarray = None, coef: float = 0.95, **kwargs) -> np.ndarray:
+def volume_norm(*, x: np.ndarray, coef: float = 0.95, **kwargs) -> np.ndarray:
     """Normalize the volume of an audio signal.

     Args:
@@ -387,7 +386,7 @@ def volume_norm(*, x: np.ndarray = None, coef: float = 0.95, **kwargs) -> np.nda
     return x / abs(x).max() * coef

-def rms_norm(*, wav: np.ndarray = None, db_level: float = -27.0, **kwargs) -> np.ndarray:
+def rms_norm(*, wav: np.ndarray, db_level: float = -27.0, **kwargs) -> np.ndarray:
     r = 10 ** (db_level / 20)
     a = np.sqrt((len(wav) * (r**2)) / np.sum(wav**2))
     return wav * a
@@ -404,11 +403,10 @@ def rms_volume_norm(*, x: np.ndarray, db_level: float = -27.0, **kwargs) -> np.n
         np.ndarray: RMS normalized waveform.
     """
     assert -99 <= db_level <= 0, " [!] db_level should be between -99 and 0"
-    wav = rms_norm(wav=x, db_level=db_level)
-    return wav
+    return rms_norm(wav=x, db_level=db_level)

-def load_wav(*, filename: str, sample_rate: int = None, resample: bool = False, **kwargs) -> np.ndarray:
+def load_wav(*, filename: str, sample_rate: Optional[int] = None, resample: bool = False, **kwargs) -> np.ndarray:
     """Read a wav file using Librosa and optionally resample, silence trim, volume normalize.

     Resampling slows down loading the file significantly. Therefore it is recommended to resample the file before.
@@ -427,19 +425,39 @@ def load_wav(*, filename: str, sample_rate: int = None, resample: bool = False,
     else:
         # SF is faster than librosa for loading files
         x, _ = sf.read(filename)
+    if x.ndim != 1:
+        logger.warning("Found multi-channel audio. Converting to mono: %s", filename)
+        x = librosa.to_mono(x)
     return x

-def save_wav(*, wav: np.ndarray, path: str, sample_rate: int = None, pipe_out=None, **kwargs) -> None:
+def save_wav(
+    *,
+    wav: np.ndarray,
+    path: str,
+    sample_rate: int,
+    pipe_out=None,
+    do_rms_norm: bool = False,
+    db_level: float = -27.0,
+    **kwargs,
+) -> None:
     """Save float waveform to a file using Scipy.

     Args:
         wav (np.ndarray): Waveform with float values in range [-1, 1] to save.
         path (str): Path to a output file.
-        sr (int, optional): Sampling rate used for saving to the file. Defaults to None.
+        sr (int): Sampling rate used for saving to the file. Defaults to None.
         pipe_out (BytesIO, optional): Flag to stdout the generated TTS wav file for shell pipe.
+        do_rms_norm (bool): Whether to apply RMS normalization
+        db_level (float): Target dB level in RMS.
     """
-    wav_norm = wav * (32767 / max(0.01, np.max(np.abs(wav))))
+    if do_rms_norm:
+        if db_level is None:
+            msg = "`db_level` cannot be None with `do_rms_norm=True`"
+            raise ValueError(msg)
+        wav_norm = rms_volume_norm(x=wav, db_level=db_level)
+    else:
+        wav_norm = wav * (32767 / max(0.01, np.max(np.abs(wav))))

     wav_norm = wav_norm.astype(np.int16)
     if pipe_out:
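
A short usage sketch of the two helpers changed above, assuming the signatures shown in this hunk; file paths are placeholders. `load_wav` now downmixes multi-channel audio with a warning, and `save_wav` can optionally RMS-normalize before writing:

from TTS.utils.audio.numpy_transforms import load_wav, save_wav

# Stereo or multi-channel files are downmixed to mono on load (with a warning).
wav = load_wav(filename="/data/example_stereo.wav", sample_rate=22050, resample=True)
assert wav.ndim == 1

# Plain peak-scaled 16-bit output (previous behaviour).
save_wav(wav=wav, path="/tmp/out_peak.wav", sample_rate=22050)

# New: RMS-normalize to a target dB level before writing.
save_wav(wav=wav, path="/tmp/out_rms.wav", sample_rate=22050, do_rms_norm=True, db_level=-27.0)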
@@ -462,8 +480,7 @@ def mulaw_encode(*, wav: np.ndarray, mulaw_qc: int, **kwargs) -> np.ndarray:
 def mulaw_decode(*, wav, mulaw_qc: int, **kwargs) -> np.ndarray:
     """Recovers waveform from quantized values."""
     mu = 2**mulaw_qc - 1
-    x = np.sign(wav) / mu * ((1 + mu) ** np.abs(wav) - 1)
-    return x
+    return np.sign(wav) / mu * ((1 + mu) ** np.abs(wav) - 1)

 def encode_16bits(*, x: np.ndarray, **kwargs) -> np.ndarray:
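
Most of the numpy_transforms helpers above now take required keyword-only arguments instead of `None` defaults. A minimal sketch of the new call convention, assuming the signatures shown in this diff; the parameter values are illustrative:

import numpy as np

from TTS.utils.audio.numpy_transforms import build_mel_basis, wav_to_mel

sample_rate, fft_size, hop_length, win_length, num_mels = 22050, 1024, 256, 1024, 80

# All filter-bank arguments except mel_fmax are now required keywords.
mel_basis = build_mel_basis(sample_rate=sample_rate, fft_size=fft_size, num_mels=num_mels, mel_fmin=0)

wav = np.random.uniform(-1, 1, sample_rate).astype(np.float32)  # one second of noise as a stand-in
mel = wav_to_mel(wav=wav, mel_basis=mel_basis, fft_size=fft_size, hop_length=hop_length, win_length=win_length)
print(mel.shape)  # (num_mels, num_frames)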

View File

@@ -1,11 +1,8 @@
 import logging
-from io import BytesIO
-from typing import Dict, Tuple
+from typing import Optional

 import librosa
 import numpy as np
-import scipy.io.wavfile
-import scipy.signal

 from TTS.tts.utils.helpers import StandardScaler
 from TTS.utils.audio.numpy_transforms import (
@@ -21,6 +18,7 @@ from TTS.utils.audio.numpy_transforms import (
     millisec_to_length,
     preemphasis,
     rms_volume_norm,
+    save_wav,
     spec_to_mel,
     stft,
     trim_silence,
@@ -32,7 +30,7 @@ logger = logging.getLogger(__name__)
 # pylint: disable=too-many-public-methods
-class AudioProcessor(object):
+class AudioProcessor:
     """Audio Processor for TTS.

     Note:
@@ -172,7 +170,7 @@ class AudioProcessor(object):
         db_level=None,
         stats_path=None,
         **_,
-    ):
+    ) -> None:
         # setup class attributed
         self.sample_rate = sample_rate
         self.resample = resample
@@ -210,7 +208,8 @@ class AudioProcessor(object):
         elif log_func == "np.log10":
             self.base = 10
         else:
-            raise ValueError(" [!] unknown `log_func` value.")
+            msg = " [!] unknown `log_func` value."
+            raise ValueError(msg)
         # setup stft parameters
         if hop_length is None:
             # compute stft parameters from given time values
@@ -254,7 +253,7 @@ class AudioProcessor(object):
     ### normalization ###
     def normalize(self, S: np.ndarray) -> np.ndarray:
-        """Normalize values into `[0, self.max_norm]` or `[-self.max_norm, self.max_norm]`
+        """Normalize values into `[0, self.max_norm]` or `[-self.max_norm, self.max_norm]`.

         Args:
             S (np.ndarray): Spectrogram to normalize.
@@ -272,10 +271,10 @@ class AudioProcessor(object):
             if hasattr(self, "mel_scaler"):
                 if S.shape[0] == self.num_mels:
                     return self.mel_scaler.transform(S.T).T
-                elif S.shape[0] == self.fft_size / 2:
+                if S.shape[0] == self.fft_size / 2:
                     return self.linear_scaler.transform(S.T).T
-                else:
-                    raise RuntimeError(" [!] Mean-Var stats does not match the given feature dimensions.")
+                msg = " [!] Mean-Var stats does not match the given feature dimensions."
+                raise RuntimeError(msg)
             # range normalization
             S -= self.ref_level_db  # discard certain range of DB assuming it is air noise
             S_norm = (S - self.min_level_db) / (-self.min_level_db)
@@ -286,13 +285,11 @@ class AudioProcessor(object):
                         S_norm, -self.max_norm, self.max_norm  # pylint: disable=invalid-unary-operand-type
                    )
                 return S_norm
-            else:
-                S_norm = self.max_norm * S_norm
-                if self.clip_norm:
-                    S_norm = np.clip(S_norm, 0, self.max_norm)
-                return S_norm
-        else:
-            return S
+            S_norm = self.max_norm * S_norm
+            if self.clip_norm:
+                S_norm = np.clip(S_norm, 0, self.max_norm)
+            return S_norm
+        return S

     def denormalize(self, S: np.ndarray) -> np.ndarray:
         """Denormalize spectrogram values.
@@ -313,10 +310,10 @@ class AudioProcessor(object):
             if hasattr(self, "mel_scaler"):
                 if S_denorm.shape[0] == self.num_mels:
                     return self.mel_scaler.inverse_transform(S_denorm.T).T
-                elif S_denorm.shape[0] == self.fft_size / 2:
+                if S_denorm.shape[0] == self.fft_size / 2:
                     return self.linear_scaler.inverse_transform(S_denorm.T).T
-                else:
-                    raise RuntimeError(" [!] Mean-Var stats does not match the given feature dimensions.")
+                msg = " [!] Mean-Var stats does not match the given feature dimensions."
+                raise RuntimeError(msg)
             if self.symmetric_norm:
                 if self.clip_norm:
                     S_denorm = np.clip(
@@ -324,16 +321,14 @@ class AudioProcessor(object):
                     )
                 S_denorm = ((S_denorm + self.max_norm) * -self.min_level_db / (2 * self.max_norm)) + self.min_level_db
                 return S_denorm + self.ref_level_db
-            else:
-                if self.clip_norm:
-                    S_denorm = np.clip(S_denorm, 0, self.max_norm)
-                S_denorm = (S_denorm * -self.min_level_db / self.max_norm) + self.min_level_db
-                return S_denorm + self.ref_level_db
-        else:
-            return S_denorm
+            if self.clip_norm:
+                S_denorm = np.clip(S_denorm, 0, self.max_norm)
+            S_denorm = (S_denorm * -self.min_level_db / self.max_norm) + self.min_level_db
+            return S_denorm + self.ref_level_db
+        return S_denorm

     ### Mean-STD scaling ###
-    def load_stats(self, stats_path: str) -> Tuple[np.array, np.array, np.array, np.array, Dict]:
+    def load_stats(self, stats_path: str) -> tuple[np.array, np.array, np.array, np.array, dict]:
         """Loading mean and variance statistics from a `npy` file.

         Args:
@@ -351,7 +346,7 @@ class AudioProcessor(object):
         stats_config = stats["audio_config"]
         # check all audio parameters used for computing stats
         skip_parameters = ["griffin_lim_iters", "stats_path", "do_trim_silence", "ref_level_db", "power"]
-        for key in stats_config.keys():
+        for key in stats_config:
             if key in skip_parameters:
                 continue
             if key not in ["sample_rate", "trim_db"]:
@@ -415,10 +410,7 @@ class AudioProcessor(object):
             win_length=self.win_length,
             pad_mode=self.stft_pad_mode,
         )
-        if self.do_amp_to_db_linear:
-            S = amp_to_db(x=np.abs(D), gain=self.spec_gain, base=self.base)
-        else:
-            S = np.abs(D)
+        S = amp_to_db(x=np.abs(D), gain=self.spec_gain, base=self.base) if self.do_amp_to_db_linear else np.abs(D)
         return self.normalize(S).astype(np.float32)

     def melspectrogram(self, y: np.ndarray) -> np.ndarray:
@@ -467,8 +459,7 @@ class AudioProcessor(object):
         S = db_to_amp(x=S, gain=self.spec_gain, base=self.base)
         S = spec_to_mel(spec=np.abs(S), mel_basis=self.mel_basis)
         S = amp_to_db(x=S, gain=self.spec_gain, base=self.base)
-        mel = self.normalize(S)
-        return mel
+        return self.normalize(S)

     def _griffin_lim(self, S):
         return griffin_lim(
@@ -502,7 +493,7 @@ class AudioProcessor(object):
         if len(x) % self.hop_length == 0:
             x = np.pad(x, (0, self.hop_length // 2), mode=self.stft_pad_mode)

-        f0 = compute_f0(
+        return compute_f0(
             x=x,
             pitch_fmax=self.pitch_fmax,
             pitch_fmin=self.pitch_fmin,
@@ -513,8 +504,6 @@ class AudioProcessor(object):
             center=True,
         )
-        return f0

     ### Audio Processing ###
     def find_endpoint(self, wav: np.ndarray, min_silence_sec=0.8) -> int:
         """Find the last point without silence at the end of a audio signal.
@@ -537,7 +526,7 @@ class AudioProcessor(object):
         )

     def trim_silence(self, wav):
-        """Trim silent parts with a threshold and 0.01 sec margin"""
+        """Trim silent parts with a threshold and 0.01 sec margin."""
         return trim_silence(
             wav=wav,
             sample_rate=self.sample_rate,
@@ -558,21 +547,8 @@ class AudioProcessor(object):
         """
         return volume_norm(x=x)

-    def rms_volume_norm(self, x: np.ndarray, db_level: float = None) -> np.ndarray:
-        """Normalize the volume based on RMS of the signal.
-
-        Args:
-            x (np.ndarray): Raw waveform.
-
-        Returns:
-            np.ndarray: RMS normalized waveform.
-        """
-        if db_level is None:
-            db_level = self.db_level
-        return rms_volume_norm(x=x, db_level=db_level)
-
     ### save and load ###
-    def load_wav(self, filename: str, sr: int = None) -> np.ndarray:
+    def load_wav(self, filename: str, sr: Optional[int] = None) -> np.ndarray:
         """Read a wav file using Librosa and optionally resample, silence trim, volume normalize.

         Resampling slows down loading the file significantly. Therefore it is recommended to resample the file before.
@@ -596,10 +572,10 @@ class AudioProcessor(object):
         if self.do_sound_norm:
             x = self.sound_norm(x)
         if self.do_rms_norm:
-            x = self.rms_volume_norm(x, self.db_level)
+            x = rms_volume_norm(x=x, db_level=self.db_level)
         return x

-    def save_wav(self, wav: np.ndarray, path: str, sr: int = None, pipe_out=None) -> None:
+    def save_wav(self, wav: np.ndarray, path: str, sr: Optional[int] = None, pipe_out=None) -> None:
         """Save a waveform to a file using Scipy.

         Args:
@@ -608,18 +584,14 @@ class AudioProcessor(object):
             sr (int, optional): Sampling rate used for saving to the file. Defaults to None.
             pipe_out (BytesIO, optional): Flag to stdout the generated TTS wav file for shell pipe.
         """
-        if self.do_rms_norm:
-            wav_norm = self.rms_volume_norm(wav, self.db_level) * 32767
-        else:
-            wav_norm = wav * (32767 / max(0.01, np.max(np.abs(wav))))
-
-        wav_norm = wav_norm.astype(np.int16)
-        if pipe_out:
-            wav_buffer = BytesIO()
-            scipy.io.wavfile.write(wav_buffer, sr if sr else self.sample_rate, wav_norm)
-            wav_buffer.seek(0)
-            pipe_out.buffer.write(wav_buffer.read())
-        scipy.io.wavfile.write(path, sr if sr else self.sample_rate, wav_norm)
+        save_wav(
+            wav=wav,
+            path=path,
+            sample_rate=sr if sr else self.sample_rate,
+            pipe_out=pipe_out,
+            do_rms_norm=self.do_rms_norm,
+            db_level=self.db_level,
+        )

     def get_duration(self, filename: str) -> float:
         """Get the duration of a wav file using Librosa.