mirror of https://github.com/coqui-ai/TTS.git
119 lines
4.0 KiB
Python
119 lines
4.0 KiB
Python
# adapted from https://github.com/patriceguyot/Yin
|
|
|
|
import numpy as np
|
|
|
|
|
|
def differenceFunction(x, N, tau_max):
    """
    Compute the difference function of data x. This corresponds to equation (6) in [1]
    (presumably the YIN paper by de Cheveigne and Kawahara, 2002).

    The cross term is an autocorrelation evaluated with NumPy's real FFT; the two
    energy terms are read from a cumulative sum of the squared samples.

    :param x: audio data
    :param N: length of data (unused; the length is taken from ``x`` itself)
    :param tau_max: integration window size; clamped to the number of samples in ``x``
    :return: difference function for the lags ``0 .. tau_max - 1``
    :rtype: numpy.ndarray
    """
    x = np.array(x, np.float64)
    w = x.size
    tau_max = min(tau_max, w)

    # x_cumsum[k] is the energy of the first k samples.
    x_cumsum = np.concatenate((np.array([0.0]), (x * x).cumsum()))

    # Zero-pad to at least w + tau_max so the circular correlation computed via
    # the FFT equals the linear one for every lag we keep. The padded length is
    # the smallest "nice" multiple of a power of two that is large enough.
    size = w + tau_max
    p2 = (size // 32).bit_length()
    nice_numbers = (16, 18, 20, 24, 25, 27, 30, 32)
    size_pad = min(k * 2 ** p2 for k in nice_numbers if k * 2 ** p2 >= size)

    fc = np.fft.rfft(x, size_pad)
    # The output length must be passed explicitly: without it irfft assumes an
    # even length 2 * (len(fc) - 1), which is wrong when size_pad is odd
    # (25 or 27) and silently corrupts the autocorrelation.
    conv = np.fft.irfft(fc * fc.conjugate(), size_pad)[:tau_max]

    return x_cumsum[w : w - tau_max : -1] + x_cumsum[w] - x_cumsum[:tau_max] - 2 * conv
|
|
|
|
|
|
def cumulativeMeanNormalizedDifferenceFunction(df, N):
    """
    Compute cumulative mean normalized difference function (CMND).

    This corresponds to equation (8) in [1]

    Each value ``df[tau]`` (``tau >= 1``) is multiplied by ``tau`` and divided by
    the running sum ``df[1] + ... + df[tau]``; the value at lag 0 is set to 1.
    Where the running sum is zero (e.g. a completely silent frame) the ratio is
    undefined, so 1 is returned for that lag instead of NaN/inf.

    :param df: Difference function
    :param N: length of data (number of lags in ``df``)
    :return: cumulative mean normalized difference function
    :rtype: numpy.ndarray
    """
    tail = np.asarray(df, dtype=np.float64)[1:]
    numerator = tail * np.arange(1, N)
    running_sum = np.cumsum(tail)

    # Silence the 0/0 and x/0 warnings; those entries are overwritten below.
    with np.errstate(divide="ignore", invalid="ignore"):
        cmndf = numerator / running_sum
    cmndf = np.where(running_sum == 0, 1.0, cmndf)

    return np.insert(cmndf, 0, 1)
|
|
|
|
|
|
def getPitch(cmdf, tau_min, tau_max, harmo_th=0.1):
    """
    Return fundamental period of a frame based on CMND function.

    The lags ``tau_min .. tau_max - 1`` are scanned in increasing order. At the
    first lag whose CMND value is below ``harmo_th`` the search follows the
    curve while it keeps strictly decreasing and returns the lag where the
    decrease stops.

    :param cmdf: Cumulative Mean Normalized Difference function
    :param tau_min: minimum period for speech
    :param tau_max: maximum period for speech (exclusive upper bound of the scan)
    :param harmo_th: harmonicity threshold to determine if it is necessary to compute pitch frequency
    :return: fundamental period (a lag index) if a value lies under the threshold, 0 otherwise
    :rtype: int
    """
    lag = tau_min
    while lag < tau_max:
        if not cmdf[lag] < harmo_th:
            # Still above the threshold: keep scanning.
            lag += 1
            continue

        # Below the threshold: slide down to the bottom of this dip.
        following = lag + 1
        while following < tau_max and cmdf[following] < cmdf[lag]:
            lag = following
            following += 1
        return lag

    return 0  # if unvoiced
|
|
|
|
|
|
def compute_yin(sig, sr, w_len=512, w_step=256, f0_min=100, f0_max=500, harmo_thresh=0.1):
    """
    Compute the Yin Algorithm. Return fundamental frequency and harmonic rate.

    :param sig: Audio signal (list of float)
    :param sr: sampling rate (int)
    :param w_len: size of the analysis window (samples)
    :param w_step: size of the lag between two consecutive windows (samples)
    :param f0_min: Minimum fundamental frequency that can be detected (hertz).
        The corresponding maximum lag is capped at ``w_len``, so frequencies
        below ``sr / w_len`` cannot be detected.
    :param f0_max: Maximum fundamental frequency that can be detected (hertz)
    :param harmo_thresh: Threshold of detection. The algorithm returns the first minimum of the CMND function below this threshold.

    :returns:

        * pitches: list of fundamental frequencies (0.0 where no pitch was found),
        * harmonic_rates: list of harmonic rate values for each fundamental frequency value (= confidence value)
        * argmins: frequency corresponding to the global minimum of the Cumulative Mean Normalized
          Difference Function, when that minimum lies above the smallest allowed lag (0.0 otherwise)
        * times: list of time of each estimation
    :rtype: tuple
    """
    tau_min = int(sr / f0_max)
    # The difference function never holds more than w_len lags; a larger
    # tau_max would make the CMND normalization fail on mismatched shapes.
    tau_max = min(int(sr / f0_min), w_len)

    timeScale = range(0, len(sig) - w_len, w_step)  # start sample of each analysis window
    times = [t / float(sr) for t in timeScale]

    n_frames = len(timeScale)
    pitches = [0.0] * n_frames
    harmonic_rates = [0.0] * n_frames
    argmins = [0.0] * n_frames

    for i, t in enumerate(timeScale):
        frame = sig[t : t + w_len]

        # Compute YIN
        df = differenceFunction(frame, w_len, tau_max)
        cmdf = cumulativeMeanNormalizedDifferenceFunction(df, tau_max)
        p = getPitch(cmdf, tau_min, tau_max, harmo_thresh)

        # Get results
        best_lag = np.argmin(cmdf)
        if best_lag > tau_min:
            argmins[i] = float(sr / best_lag)
        if p != 0:  # A pitch was found
            pitches[i] = float(sr / p)
            harmonic_rates[i] = cmdf[p]
        else:  # No pitch, but we compute a value of the harmonic rate
            harmonic_rates[i] = min(cmdf)

    return pitches, harmonic_rates, argmins, times
|