From 371772c355124556531c089770d6a8b110daf160 Mon Sep 17 00:00:00 2001
From: Edresson Casanova <edresson1@gmail.com>
Date: Fri, 9 Sep 2022 05:43:14 -0300
Subject: [PATCH] Replace pyworld by pyin (#1946)

* Replace pyworld by pyin

* Fix unit tests
---
 TTS/config/shared_configs.py             |  4 +-
 TTS/utils/audio/numpy_transforms.py      | 51 +++++++++++++++++++-----
 TTS/utils/audio/processor.py             | 25 +++++++-----
 requirements.txt                         |  1 -
 tests/aux_tests/test_audio_processor.py  |  2 +-
 tests/aux_tests/test_numpy_transforms.py |  3 +-
 6 files changed, 61 insertions(+), 25 deletions(-)

diff --git a/TTS/config/shared_configs.py b/TTS/config/shared_configs.py
index 3ea49796..994c4579 100644
--- a/TTS/config/shared_configs.py
+++ b/TTS/config/shared_configs.py
@@ -62,7 +62,7 @@ class BaseAudioConfig(Coqpit):
             Maximum frequency of the F0 frames. Defaults to ```640```.
 
         pitch_fmin (float, optional):
-            Minimum frequency of the F0 frames. Defaults to ```0```.
+            Minimum frequency of the F0 frames. Defaults to ```1```.
 
         trim_db (int):
             Silence threshold used for silence trimming. Defaults to 45.
@@ -144,7 +144,7 @@ class BaseAudioConfig(Coqpit):
     do_amp_to_db_mel: bool = True
     # f0 params
     pitch_fmax: float = 640.0
-    pitch_fmin: float = 0.0
+    pitch_fmin: float = 1.0
     # normalization params
     signal_norm: bool = True
     min_level_db: int = -100
diff --git a/TTS/utils/audio/numpy_transforms.py b/TTS/utils/audio/numpy_transforms.py
index f6f03855..952b2243 100644
--- a/TTS/utils/audio/numpy_transforms.py
+++ b/TTS/utils/audio/numpy_transforms.py
@@ -2,9 +2,9 @@ from typing import Tuple
 
 import librosa
 import numpy as np
-import pyworld as pw
 import scipy
 import soundfile as sf
+from librosa import pyin
 
 # For using kwargs
 # pylint: disable=unused-argument
@@ -242,12 +242,28 @@ def compute_stft_paddings(
 
 
 def compute_f0(
-    *, x: np.ndarray = None, pitch_fmax: float = None, hop_length: int = None, sample_rate: int = None, **kwargs
+    *,
+    x: np.ndarray = None,
+    pitch_fmax: float = None,
+    pitch_fmin: float = None,
+    hop_length: int = None,
+    win_length: int = None,
+    sample_rate: int = None,
+    stft_pad_mode: str = "reflect",
+    center: bool = True,
+    **kwargs,
 ) -> np.ndarray:
     """Compute pitch (f0) of a waveform using the same parameters used for computing melspectrogram.
 
     Args:
         x (np.ndarray): Waveform. Shape :math:`[T_wav,]`
+        pitch_fmax (float): Maximum F0 frequency to search for, in Hz.
+        pitch_fmin (float): Minimum F0 frequency to search for, in Hz.
+        hop_length (int): Number of audio samples between adjacent STFT columns.
+        win_length (int): STFT window length.
+        sample_rate (int): Audio sampling rate.
+        stft_pad_mode (str): Padding mode for STFT.
+        center (bool): If True, pad the signal so that each frame is centered on its timestamp.
 
     Returns:
         np.ndarray: Pitch. Shape :math:`[T_pitch,]`. :math:`T_pitch == T_wav / hop_length`
@@ -255,20 +271,35 @@ def compute_f0(
     Examples:
         >>> WAV_FILE = filename = librosa.util.example_audio_file()
         >>> from TTS.config import BaseAudioConfig
-        >>> from TTS.utils.audio.processor import AudioProcessor        >>> conf = BaseAudioConfig(pitch_fmax=8000)
+        >>> from TTS.utils.audio import AudioProcessor
+        >>> conf = BaseAudioConfig(pitch_fmax=640, pitch_fmin=1)
         >>> ap = AudioProcessor(**conf)
-        >>> wav = ap.load_wav(WAV_FILE, sr=22050)[:5 * 22050]
+        >>> wav = ap.load_wav(WAV_FILE, sr=ap.sample_rate)[:5 * ap.sample_rate]
         >>> pitch = ap.compute_f0(wav)
     """
     assert pitch_fmax is not None, " [!] Set `pitch_fmax` before caling `compute_f0`."
+    assert pitch_fmin is not None, " [!] Set `pitch_fmin` before caling `compute_f0`."
 
-    f0, t = pw.dio(
-        x.astype(np.double),
-        fs=sample_rate,
-        f0_ceil=pitch_fmax,
-        frame_period=1000 * hop_length / sample_rate,
+    f0, voiced_mask, _ = pyin(
+        y=x.astype(np.double),
+        fmin=pitch_fmin,
+        fmax=pitch_fmax,
+        sr=sample_rate,
+        frame_length=win_length,
+        win_length=win_length // 2,
+        hop_length=hop_length,
+        pad_mode=stft_pad_mode,
+        center=center,
+        n_thresholds=100,
+        beta_parameters=(2, 18),
+        boltzmann_parameter=2,
+        resolution=0.1,
+        max_transition_rate=35.92,
+        switch_prob=0.01,
+        no_trough_prob=0.01,
     )
-    f0 = pw.stonemask(x.astype(np.double), f0, t, sample_rate)
+    f0[~voiced_mask] = 0.0
+
     return f0
 
 
diff --git a/TTS/utils/audio/processor.py b/TTS/utils/audio/processor.py
index 5a63b444..9d16474a 100644
--- a/TTS/utils/audio/processor.py
+++ b/TTS/utils/audio/processor.py
@@ -2,12 +2,12 @@ from typing import Dict, Tuple
 
 import librosa
 import numpy as np
-import pyworld as pw
 import scipy.io.wavfile
 import scipy.signal
 import soundfile as sf
 
 from TTS.tts.utils.helpers import StandardScaler
+from TTS.utils.audio.numpy_transforms import compute_f0
 
 # pylint: disable=too-many-public-methods
 
@@ -573,23 +573,28 @@ class AudioProcessor(object):
             >>> WAV_FILE = filename = librosa.util.example_audio_file()
             >>> from TTS.config import BaseAudioConfig
             >>> from TTS.utils.audio import AudioProcessor
-            >>> conf = BaseAudioConfig(pitch_fmax=8000)
+            >>> conf = BaseAudioConfig(pitch_fmax=640, pitch_fmin=1)
             >>> ap = AudioProcessor(**conf)
-            >>> wav = ap.load_wav(WAV_FILE, sr=22050)[:5 * 22050]
+            >>> wav = ap.load_wav(WAV_FILE, sr=ap.sample_rate)[:5 * ap.sample_rate]
             >>> pitch = ap.compute_f0(wav)
         """
         assert self.pitch_fmax is not None, " [!] Set `pitch_fmax` before caling `compute_f0`."
+        assert self.pitch_fmin is not None, " [!] Set `pitch_fmin` before caling `compute_f0`."
         # align F0 length to the spectrogram length
         if len(x) % self.hop_length == 0:
-            x = np.pad(x, (0, self.hop_length // 2), mode="reflect")
+            x = np.pad(x, (0, self.hop_length // 2), mode=self.stft_pad_mode)
 
-        f0, t = pw.dio(
-            x.astype(np.double),
-            fs=self.sample_rate,
-            f0_ceil=self.pitch_fmax,
-            frame_period=1000 * self.hop_length / self.sample_rate,
+        f0 = compute_f0(
+            x=x,
+            pitch_fmax=self.pitch_fmax,
+            pitch_fmin=self.pitch_fmin,
+            hop_length=self.hop_length,
+            win_length=self.win_length,
+            sample_rate=self.sample_rate,
+            stft_pad_mode=self.stft_pad_mode,
+            center=True,
         )
-        f0 = pw.stonemask(x.astype(np.double), f0, t, self.sample_rate)
+
         return f0
 
     ### Audio Processing ###
diff --git a/requirements.txt b/requirements.txt
index ad6404be..bb9af119 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -23,7 +23,6 @@ umap-learn==0.5.1
 pandas
 # deps for training
 matplotlib
-pyworld==0.2.10 # > 0.2.10 is not p3.10.x compatible
 # coqui stack
 trainer
 # config management
diff --git a/tests/aux_tests/test_audio_processor.py b/tests/aux_tests/test_audio_processor.py
index d01aeffa..5b1fa9d3 100644
--- a/tests/aux_tests/test_audio_processor.py
+++ b/tests/aux_tests/test_audio_processor.py
@@ -10,7 +10,7 @@ OUT_PATH = os.path.join(get_tests_output_path(), "audio_tests")
 WAV_FILE = os.path.join(get_tests_input_path(), "example_1.wav")
 
 os.makedirs(OUT_PATH, exist_ok=True)
-conf = BaseAudioConfig(mel_fmax=8000)
+conf = BaseAudioConfig(mel_fmax=8000, pitch_fmax=640, pitch_fmin=1)
 
 
 # pylint: disable=protected-access
diff --git a/tests/aux_tests/test_numpy_transforms.py b/tests/aux_tests/test_numpy_transforms.py
index 0c1836b9..00597a0f 100644
--- a/tests/aux_tests/test_numpy_transforms.py
+++ b/tests/aux_tests/test_numpy_transforms.py
@@ -31,7 +31,8 @@ class TestNumpyTransforms(unittest.TestCase):
             mel_fmin: int = 0
             hop_length: int = 256
             win_length: int = 1024
-            pitch_fmax: int = 450
+            pitch_fmax: int = 640
+            pitch_fmin: int = 1
             trim_db: int = -1
             min_silence_sec: float = 0.01
             gain: float = 1.0