mirror of https://github.com/coqui-ai/TTS.git
66 lines
2.4 KiB
Python
66 lines
2.4 KiB
Python
import struct
|
|
from pathlib import Path
|
|
from typing import Optional, Union
|
|
|
|
# import webrtcvad
|
|
import librosa
|
|
import numpy as np
|
|
from scipy.ndimage.morphology import binary_dilation
|
|
|
|
from TTS.vc.modules.freevc.speaker_encoder.hparams import *
|
|
|
|
# Largest value representable by a signed 16-bit integer (32767).
int16_max = (1 << 15) - 1
|
|
|
|
|
|
def preprocess_wav(fpath_or_wav: Union[str, Path, np.ndarray], source_sr: Optional[int] = None):
    """
    Applies the preprocessing operations used in training the Speaker Encoder to a waveform
    either on disk or in memory. The waveform will be resampled to match the data hyperparameters.

    :param fpath_or_wav: either a filepath to an audio file (many extensions are supported, not
    just .wav), or the waveform as a numpy array of floats.
    :param source_sr: if passing an audio waveform, the sampling rate of the waveform before
    preprocessing. After preprocessing, the waveform's sampling rate will match the data
    hyperparameters. If passing a filepath, the sampling rate will be automatically detected and
    this argument will be ignored.
    :return: the preprocessed waveform (resampled if needed, volume-normalized, long silences
    trimmed).
    """
    # Load the wav from disk if needed; the detected sampling rate overrides `source_sr`.
    if isinstance(fpath_or_wav, (str, Path)):
        wav, source_sr = librosa.load(fpath_or_wav, sr=None)
    else:
        wav = fpath_or_wav

    # Resample the wav if needed. The rates are passed by keyword because recent librosa
    # releases no longer accept them positionally; older releases accept the keywords too.
    if source_sr is not None and source_sr != sampling_rate:
        wav = librosa.resample(wav, orig_sr=source_sr, target_sr=sampling_rate)

    # Apply the preprocessing: normalize volume (only ever amplifying) and shorten long silences
    wav = normalize_volume(wav, audio_norm_target_dBFS, increase_only=True)
    # NOTE(review): trim_long_silences is not defined in the visible part of this file;
    # confirm it is available at runtime.
    wav = trim_long_silences(wav)

    return wav
|
|
|
|
|
|
def wav_to_mel_spectrogram(wav):
    """
    Computes the mel spectrogram that the encoder consumes from a preprocessed audio waveform.
    Note: this is not a log-mel spectrogram.

    :param wav: the preprocessed waveform.
    :return: the mel spectrogram produced by librosa, cast to float32 and transposed.
    """
    # Window length and step are divided by 1000 before being scaled by the sampling rate
    # (presumably they are expressed in milliseconds -- confirm against the hyperparameters).
    window_samples = int(sampling_rate * mel_window_length / 1000)
    step_samples = int(sampling_rate * mel_window_step / 1000)

    mel = librosa.feature.melspectrogram(
        y=wav,
        sr=sampling_rate,
        n_fft=window_samples,
        hop_length=step_samples,
        n_mels=mel_n_channels,
    )
    mel = mel.astype(np.float32)
    return mel.T
|
|
|
|
|
|
def normalize_volume(wav, target_dBFS, increase_only=False, decrease_only=False):
    """
    Scales a waveform so that its mean power matches a target level in dBFS.

    :param wav: the waveform as a numpy array of floats.
    :param target_dBFS: the desired level, in dB relative to full scale.
    :param increase_only: if True, the waveform is returned unchanged when reaching the target
    would require attenuating it.
    :param decrease_only: if True, the waveform is returned unchanged when reaching the target
    would require amplifying it.
    :return: the rescaled waveform, or the input itself when no scaling is applied.
    :raises ValueError: if both increase_only and decrease_only are set.
    """
    if increase_only and decrease_only:
        raise ValueError("Both increase only and decrease only are set")

    # An empty waveform has no level to measure; return it as is.
    if np.size(wav) == 0:
        return wav

    mean_power = np.mean(wav**2)
    # A silent (all-zero) waveform has zero power: log10(0) is -inf, which would yield an
    # infinite gain and turn every sample into NaN. There is nothing to normalize, so return
    # the input untouched. The same applies when the power is not a finite number.
    if not np.isfinite(mean_power) or mean_power <= 0:
        return wav

    dBFS_change = target_dBFS - 10 * np.log10(mean_power)
    if (dBFS_change < 0 and increase_only) or (dBFS_change > 0 and decrease_only):
        return wav
    # Convert the dB difference into a linear amplitude gain.
    return wav * (10 ** (dBFS_change / 20))
|