Use pyworld for pitch

2021-09-06 14:29:22 +00:00 · 2021-09-06 14:29:22 +00:00 · 2c4bbbf9b9
parent c1513ec4cd
commit 2c4bbbf9b9
6 changed files with 19 additions and 163 deletions
--- a/TTS/utils/audio.py
+++ b/TTS/utils/audio.py
@ -2,6 +2,7 @@ from typing import Dict, Tuple
 import librosa
 import numpy as np
 import pyworld as pw
 import scipy.io.wavfile
 import scipy.signal
 import soundfile as sf
@ -9,7 +10,6 @@ import torch
 from torch import nn
 from TTS.tts.utils.data import StandardScaler
 from TTS.utils.yin import compute_yin
 class TorchSTFT(nn.Module):  # pylint: disable=abstract-method
@ -640,59 +640,28 @@ class AudioProcessor(object):
            >>> wav = ap.load_wav(WAV_FILE, sr=22050)[:5 * 22050]
            >>> pitch = ap.compute_f0(wav)
        """
-        # f0, t = pw.dio(
+        f0, t = pw.dio(
-        #     x.astype(np.double),
+            x.astype(np.double),
-        #     fs=self.sample_rate,
+            fs=self.sample_rate,
-        #     f0_ceil=self.mel_fmax,
+            f0_ceil=self.mel_fmax,
-        #     frame_period=1000 * self.hop_length / self.sample_rate,
+            frame_period=1000 * self.hop_length / self.sample_rate,
-        # )
+        )
-        # f0 = pw.stonemask(x.astype(np.double), f0, t, self.sample_rate)
+        f0 = pw.stonemask(x.astype(np.double), f0, t, self.sample_rate)
        # f0, _, _, _ = compute_yin(
        #     x,
        #     self.sample_rate,
        #     self.win_length,
        #     self.hop_length,
        #     65 if self.mel_fmin == 0 else self.mel_fmin,
        #     self.mel_fmax,
        # )
        # # import pyworld as pw
        # # f0, _ = pw.dio(x.astype(np.float64), self.sample_rate,
        # #                   frame_period=self.hop_length / self.sample_rate * 1000)
        # pad = int((self.win_length / self.hop_length) / 2)
        # f0 = [0.0] * pad + f0 + [0.0] * pad
        # f0 = np.pad(f0, (pad, pad), mode="constant", constant_values=0)
        # f0 = np.array(f0, dtype=np.float32)
-        f0, _, _ = librosa.pyin(
+        # f01, _, _ = librosa.pyin(
            x,
            fmin=65 if self.mel_fmin == 0 else self.mel_fmin,
            fmax=self.mel_fmax,
            frame_length=self.win_length,
            sr=self.sample_rate,
            fill_na=0.0,
        )
        # f02 = librosa.yin(
        #     x,
        #     fmin=65 if self.mel_fmin == 0 else self.mel_fmin,
        #     fmax=self.mel_fmax,
        #     frame_length=self.win_length,
-        #     sr=self.sample_rate
+        #     sr=self.sample_rate,
        #     fill_na=0.0,
        # )
        # spec = self.melspectrogram(x)
        # from matplotlib import pyplot as plt
        # plt.figure()
        # plt.plot(f0, linewidth=2.5, color='red')
        # plt.plot(f01, linewidth=2.5, linestyle='-.')
        # plt.plot(f02, linewidth=2.5)
        # plt.xlabel('time')
        # plt.ylabel('F0')
        # plt.savefig('save_img.png')
        # # plt.figure()
        # plt.imshow(spec, aspect="auto", origin="lower")
        # plt.savefig('save_img2.png')
        return f0
    ### Audio Processing ###
--- a/TTS/utils/yin.py
+++ b/TTS/utils/yin.py
@ -1,118 +0,0 @@
 # adapted from https://github.com/patriceguyot/Yin
 import numpy as np
 def differenceFunction(x, N, tau_max):
    """
    Compute difference function of data x. This corresponds to equation (6) in [1]
    This solution is implemented directly with Numpy fft.

    For each lag ``tau`` the result is ``sum_j (x[j] - x[j + tau]) ** 2`` over the
    samples where both terms exist. The sum is expanded into two energy terms
    (read from a cumulative sum of squares) minus twice the autocorrelation,
    and the autocorrelation is obtained through a zero-padded FFT.

    :param x: audio data
    :param N: length of data (unused; the length is taken from ``x`` itself)
    :param tau_max: integration window size, clamped to the length of ``x``
    :return: difference function for lags ``0 .. tau_max - 1``
    :rtype: numpy.ndarray
    """
    x = np.array(x, np.float64)
    w = x.size
    tau_max = min(tau_max, w)
    # x_cumsum[k] is the energy of the first k samples.
    x_cumsum = np.concatenate((np.array([0.0]), (x * x).cumsum()))
    # Padding to at least w + tau_max keeps the circular autocorrelation free
    # of wrap-around for every lag below tau_max.
    size = w + tau_max
    p2 = (size // 32).bit_length()
    nice_numbers = (16, 18, 20, 24, 25, 27, 30, 32)
    size_pad = min(k * 2 ** p2 for k in nice_numbers if k * 2 ** p2 >= size)
    fc = np.fft.rfft(x, size_pad)
    # The output length must be given explicitly: without it irfft assumes an
    # even length, which is wrong when size_pad is odd (25 or 27 for short
    # frames) and corrupts the autocorrelation.
    conv = np.fft.irfft(fc * fc.conjugate(), size_pad)[:tau_max]
    return x_cumsum[w : w - tau_max : -1] + x_cumsum[w] - x_cumsum[:tau_max] - 2 * conv
 def cumulativeMeanNormalizedDifferenceFunction(df, N):
    """
    Compute the cumulative mean normalized difference function (CMND).
    This corresponds to equation (8) in [1].

    Every value at lag ``tau >= 1`` is divided by the running mean of the
    difference function over lags ``1 .. tau``; lag 0 is fixed to 1.

    :param df: Difference function
    :param N: length of data (number of lags in ``df``)
    :return: cumulative mean normalized difference function
    :rtype: numpy.ndarray
    """
    tail = df[1:]
    lags = np.arange(1, N)
    running_sum = np.cumsum(tail).astype(float)
    # tail * lag / running_sum == tail / (running mean of tail up to that lag)
    normalized = tail * lags / running_sum
    return np.concatenate(([1.0], normalized))
 def getPitch(cmdf, tau_min, tau_max, harmo_th=0.1):
    """
    Pick the fundamental period of a frame from its CMND function.

    The first lag in ``[tau_min, tau_max)`` whose value is below the threshold
    is taken as a starting point, then the search walks forward while the
    values keep strictly decreasing, so the local minimum is returned.

    :param cmdf: Cumulative Mean Normalized Difference function
    :param tau_min: minimum period for speech
    :param tau_max: maximum period for speech
    :param harmo_th: harmonicity threshold below which a lag counts as a pitch candidate
    :return: fundamental period if a value lies under the threshold, 0 otherwise
    :rtype: int
    """
    for candidate in range(tau_min, tau_max):
        if cmdf[candidate] < harmo_th:
            best = candidate
            # Slide down to the bottom of the dip that crossed the threshold.
            for following in range(candidate + 1, tau_max):
                if not cmdf[following] < cmdf[best]:
                    break
                best = following
            return best
    return 0  # if unvoiced
 def compute_yin(sig, sr, w_len=512, w_step=256, f0_min=100, f0_max=500, harmo_thresh=0.1):
    """
    Compute the Yin Algorithm. Return fundamental frequency and harmonic rate.

    :param sig: Audio signal (list of float)
    :param sr: sampling rate (int)
    :param w_len: size of the analysis window (samples)
    :param w_step: size of the lag between two consecutive windows (samples)
    :param f0_min: Minimum fundamental frequency that can be detected (hertz)
    :param f0_max: Maximum fundamental frequency that can be detected (hertz)
    :param harmo_thresh: Threshold of detection. The algorithm returns the first minimum of the CMND function below this threshold.
    :returns:
        * pitches: list of fundamental frequencies (0.0 where no pitch was found),
        * harmonic_rates: list of harmonic rate values for each fundamental frequency value (= confidence value)
        * argmins: frequency at the global minimum of the Cumulative Mean Normalized Difference function
          (0.0 when that minimum is not above the shortest allowed lag)
        * times: list of time of each estimation
    :rtype: tuple
    """
    tau_min = int(sr / f0_max)
    # A frame of w_len samples only yields w_len lags. Without this clamp the
    # difference function (clamped internally) and the CMND function (sized by
    # tau_max) disagree on length and the computation fails when
    # sr / f0_min exceeds w_len.
    tau_max = min(int(sr / f0_min), w_len)

    frame_starts = range(0, len(sig) - w_len, w_step)  # first sample of each analysis window
    times = [start / float(sr) for start in frame_starts]

    n_frames = len(frame_starts)
    pitches = [0.0] * n_frames
    harmonic_rates = [0.0] * n_frames
    argmins = [0.0] * n_frames

    for i, start in enumerate(frame_starts):
        frame = sig[start : start + w_len]

        # Compute YIN
        df = differenceFunction(frame, w_len, tau_max)
        cmdf = cumulativeMeanNormalizedDifferenceFunction(df, tau_max)
        p = getPitch(cmdf, tau_min, tau_max, harmo_thresh)

        # Get results
        global_min_lag = np.argmin(cmdf)
        if global_min_lag > tau_min:
            argmins[i] = float(sr / global_min_lag)
        if p != 0:  # A pitch was found
            pitches[i] = float(sr / p)
            harmonic_rates[i] = cmdf[p]
        else:  # No pitch, but we compute a value of the harmonic rate
            harmonic_rates[i] = min(cmdf)

    return pitches, harmonic_rates, argmins, times
--- a/TTS/vocoder/models/init.py
+++ b/TTS/vocoder/models/init.py
@ -11,7 +11,6 @@ def to_camel(text):
 def setup_model(config: Coqpit):
    """Load models directly from configuration."""
    print(" > Vocoder Model: {}".format(config.model))
    if "discriminator_model" in config and "generator_model" in config:
        MyModel = importlib.import_module("TTS.vocoder.models.gan")
        MyModel = getattr(MyModel, "GAN")
@ -28,6 +27,7 @@ def setup_model(config: Coqpit):
                MyModel = getattr(MyModel, to_camel(config.model))
            except ModuleNotFoundError as e:
                raise ValueError(f"Model {config.model} not exist!") from e
    print(" > Vocoder Model: {}".format(config.model))
    model = MyModel(config)
    return model
--- a/recipes/ljspeech/fast_pitch/train_fast_pitch.py
+++ b/recipes/ljspeech/fast_pitch/train_fast_pitch.py
@ -43,6 +43,7 @@ config = FastPitchConfig(
    epochs=1000,
    text_cleaner="english_cleaners",
    use_phonemes=True,
    use_espeak_phonemes=False,
    phoneme_language="en-us",
    phoneme_cache_path=os.path.join(output_path, "phoneme_cache"),
    print_step=50,
--- a/requirements.txt
+++ b/requirements.txt
@ -25,3 +25,4 @@ unidic-lite==1.0.8
 # gruut+supported langs
 gruut[cs,de,es,fr,it,nl,pt,ru,sv]~=1.2.0
 fsspec>=2021.04.0
 pyworld
--- a/tests/init.py
+++ b/tests/init.py
@ -7,7 +7,10 @@ from TTS.utils.generic_utils import get_cuda
 def get_device_id():
    use_cuda, _ = get_cuda()
    if use_cuda:
-        GPU_ID = "0"
+        if 'CUDA_VISIBLE_DEVICES' in os.environ and os.environ['CUDA_VISIBLE_DEVICES'] != "":
            GPU_ID = os.environ['CUDA_VISIBLE_DEVICES'].split(',')[0]
        else:
            GPU_ID = "0"
    else:
        GPU_ID = ""
    return GPU_ID