From 371772c355124556531c089770d6a8b110daf160 Mon Sep 17 00:00:00 2001 From: Edresson Casanova Date: Fri, 9 Sep 2022 05:43:14 -0300 Subject: [PATCH] Replace pyworld by pyin (#1946) * Replace pyworld by pyin * Fix unit tests --- TTS/config/shared_configs.py | 4 +- TTS/utils/audio/numpy_transforms.py | 51 +++++++++++++++++++----- TTS/utils/audio/processor.py | 25 +++++++----- requirements.txt | 1 - tests/aux_tests/test_audio_processor.py | 2 +- tests/aux_tests/test_numpy_transforms.py | 3 +- 6 files changed, 61 insertions(+), 25 deletions(-) diff --git a/TTS/config/shared_configs.py b/TTS/config/shared_configs.py index 3ea49796..994c4579 100644 --- a/TTS/config/shared_configs.py +++ b/TTS/config/shared_configs.py @@ -62,7 +62,7 @@ class BaseAudioConfig(Coqpit): Maximum frequency of the F0 frames. Defaults to ```640```. pitch_fmin (float, optional): - Minimum frequency of the F0 frames. Defaults to ```0```. + Minimum frequency of the F0 frames. Defaults to ```1```. trim_db (int): Silence threshold used for silence trimming. Defaults to 45. @@ -144,7 +144,7 @@ class BaseAudioConfig(Coqpit): do_amp_to_db_mel: bool = True # f0 params pitch_fmax: float = 640.0 - pitch_fmin: float = 0.0 + pitch_fmin: float = 1.0 # normalization params signal_norm: bool = True min_level_db: int = -100 diff --git a/TTS/utils/audio/numpy_transforms.py b/TTS/utils/audio/numpy_transforms.py index f6f03855..952b2243 100644 --- a/TTS/utils/audio/numpy_transforms.py +++ b/TTS/utils/audio/numpy_transforms.py @@ -2,9 +2,9 @@ from typing import Tuple import librosa import numpy as np -import pyworld as pw import scipy import soundfile as sf +from librosa import pyin # For using kwargs # pylint: disable=unused-argument @@ -242,12 +242,28 @@ def compute_stft_paddings( def compute_f0( - *, x: np.ndarray = None, pitch_fmax: float = None, hop_length: int = None, sample_rate: int = None, **kwargs + *, + x: np.ndarray = None, + pitch_fmax: float = None, + pitch_fmin: float = None, + hop_length: int = None, + win_length: int = None, + sample_rate: int = None, + stft_pad_mode: str = "reflect", + center: bool = True, + **kwargs, ) -> np.ndarray: """Compute pitch (f0) of a waveform using the same parameters used for computing melspectrogram. Args: x (np.ndarray): Waveform. Shape :math:`[T_wav,]` + pitch_fmax (float): Pitch max value. + pitch_fmin (float): Pitch min value. + hop_length (int): Number of frames between STFT columns. + win_length (int): STFT window length. + sample_rate (int): Audio sampling rate. + stft_pad_mode (str): Padding mode for STFT. + center (bool): Centered padding. Returns: np.ndarray: Pitch. Shape :math:`[T_pitch,]`. :math:`T_pitch == T_wav / hop_length` @@ -255,20 +271,35 @@ def compute_f0( Examples: >>> WAV_FILE = filename = librosa.util.example_audio_file() >>> from TTS.config import BaseAudioConfig - >>> from TTS.utils.audio.processor import AudioProcessor >>> conf = BaseAudioConfig(pitch_fmax=8000) + >>> from TTS.utils.audio import AudioProcessor + >>> conf = BaseAudioConfig(pitch_fmax=640, pitch_fmin=1) >>> ap = AudioProcessor(**conf) - >>> wav = ap.load_wav(WAV_FILE, sr=22050)[:5 * 22050] + >>> wav = ap.load_wav(WAV_FILE, sr=ap.sample_rate)[:5 * ap.sample_rate] >>> pitch = ap.compute_f0(wav) """ assert pitch_fmax is not None, " [!] Set `pitch_fmax` before caling `compute_f0`." + assert pitch_fmin is not None, " [!] Set `pitch_fmin` before caling `compute_f0`." - f0, t = pw.dio( - x.astype(np.double), - fs=sample_rate, - f0_ceil=pitch_fmax, - frame_period=1000 * hop_length / sample_rate, + f0, voiced_mask, _ = pyin( + y=x.astype(np.double), + fmin=pitch_fmin, + fmax=pitch_fmax, + sr=sample_rate, + frame_length=win_length, + win_length=win_length // 2, + hop_length=hop_length, + pad_mode=stft_pad_mode, + center=center, + n_thresholds=100, + beta_parameters=(2, 18), + boltzmann_parameter=2, + resolution=0.1, + max_transition_rate=35.92, + switch_prob=0.01, + no_trough_prob=0.01, ) - f0 = pw.stonemask(x.astype(np.double), f0, t, sample_rate) + f0[~voiced_mask] = 0.0 + return f0 diff --git a/TTS/utils/audio/processor.py b/TTS/utils/audio/processor.py index 5a63b444..9d16474a 100644 --- a/TTS/utils/audio/processor.py +++ b/TTS/utils/audio/processor.py @@ -2,12 +2,12 @@ from typing import Dict, Tuple import librosa import numpy as np -import pyworld as pw import scipy.io.wavfile import scipy.signal import soundfile as sf from TTS.tts.utils.helpers import StandardScaler +from TTS.utils.audio.numpy_transforms import compute_f0 # pylint: disable=too-many-public-methods @@ -573,23 +573,28 @@ class AudioProcessor(object): >>> WAV_FILE = filename = librosa.util.example_audio_file() >>> from TTS.config import BaseAudioConfig >>> from TTS.utils.audio import AudioProcessor - >>> conf = BaseAudioConfig(pitch_fmax=8000) + >>> conf = BaseAudioConfig(pitch_fmax=640, pitch_fmin=1) >>> ap = AudioProcessor(**conf) - >>> wav = ap.load_wav(WAV_FILE, sr=22050)[:5 * 22050] + >>> wav = ap.load_wav(WAV_FILE, sr=ap.sample_rate)[:5 * ap.sample_rate] >>> pitch = ap.compute_f0(wav) """ assert self.pitch_fmax is not None, " [!] Set `pitch_fmax` before caling `compute_f0`." + assert self.pitch_fmin is not None, " [!] Set `pitch_fmin` before caling `compute_f0`." # align F0 length to the spectrogram length if len(x) % self.hop_length == 0: - x = np.pad(x, (0, self.hop_length // 2), mode="reflect") + x = np.pad(x, (0, self.hop_length // 2), mode=self.stft_pad_mode) - f0, t = pw.dio( - x.astype(np.double), - fs=self.sample_rate, - f0_ceil=self.pitch_fmax, - frame_period=1000 * self.hop_length / self.sample_rate, + f0 = compute_f0( + x=x, + pitch_fmax=self.pitch_fmax, + pitch_fmin=self.pitch_fmin, + hop_length=self.hop_length, + win_length=self.win_length, + sample_rate=self.sample_rate, + stft_pad_mode=self.stft_pad_mode, + center=True, ) - f0 = pw.stonemask(x.astype(np.double), f0, t, self.sample_rate) + return f0 ### Audio Processing ### diff --git a/requirements.txt b/requirements.txt index ad6404be..bb9af119 100644 --- a/requirements.txt +++ b/requirements.txt @@ -23,7 +23,6 @@ umap-learn==0.5.1 pandas # deps for training matplotlib -pyworld==0.2.10 # > 0.2.10 is not p3.10.x compatible # coqui stack trainer # config management diff --git a/tests/aux_tests/test_audio_processor.py b/tests/aux_tests/test_audio_processor.py index d01aeffa..5b1fa9d3 100644 --- a/tests/aux_tests/test_audio_processor.py +++ b/tests/aux_tests/test_audio_processor.py @@ -10,7 +10,7 @@ OUT_PATH = os.path.join(get_tests_output_path(), "audio_tests") WAV_FILE = os.path.join(get_tests_input_path(), "example_1.wav") os.makedirs(OUT_PATH, exist_ok=True) -conf = BaseAudioConfig(mel_fmax=8000) +conf = BaseAudioConfig(mel_fmax=8000, pitch_fmax=640, pitch_fmin=1) # pylint: disable=protected-access diff --git a/tests/aux_tests/test_numpy_transforms.py b/tests/aux_tests/test_numpy_transforms.py index 0c1836b9..00597a0f 100644 --- a/tests/aux_tests/test_numpy_transforms.py +++ b/tests/aux_tests/test_numpy_transforms.py @@ -31,7 +31,8 @@ class TestNumpyTransforms(unittest.TestCase): mel_fmin: int = 0 hop_length: int = 256 win_length: int = 1024 - pitch_fmax: int = 450 + pitch_fmax: int = 640 + pitch_fmin: int = 1 trim_db: int = -1 min_silence_sec: float = 0.01 gain: float = 1.0