Use pyworld for pitch

This commit is contained in:
Eren Gölge 2021-09-06 14:29:22 +00:00
parent c1513ec4cd
commit 2c4bbbf9b9
6 changed files with 19 additions and 163 deletions

View File

@ -2,6 +2,7 @@ from typing import Dict, Tuple
import librosa import librosa
import numpy as np import numpy as np
import pyworld as pw
import scipy.io.wavfile import scipy.io.wavfile
import scipy.signal import scipy.signal
import soundfile as sf import soundfile as sf
@ -9,7 +10,6 @@ import torch
from torch import nn from torch import nn
from TTS.tts.utils.data import StandardScaler from TTS.tts.utils.data import StandardScaler
from TTS.utils.yin import compute_yin
class TorchSTFT(nn.Module): # pylint: disable=abstract-method class TorchSTFT(nn.Module): # pylint: disable=abstract-method
@ -640,59 +640,28 @@ class AudioProcessor(object):
>>> wav = ap.load_wav(WAV_FILE, sr=22050)[:5 * 22050] >>> wav = ap.load_wav(WAV_FILE, sr=22050)[:5 * 22050]
>>> pitch = ap.compute_f0(wav) >>> pitch = ap.compute_f0(wav)
""" """
# f0, t = pw.dio( f0, t = pw.dio(
# x.astype(np.double), x.astype(np.double),
# fs=self.sample_rate, fs=self.sample_rate,
# f0_ceil=self.mel_fmax, f0_ceil=self.mel_fmax,
# frame_period=1000 * self.hop_length / self.sample_rate, frame_period=1000 * self.hop_length / self.sample_rate,
# ) )
# f0 = pw.stonemask(x.astype(np.double), f0, t, self.sample_rate) f0 = pw.stonemask(x.astype(np.double), f0, t, self.sample_rate)
# f0, _, _, _ = compute_yin(
# x,
# self.sample_rate,
# self.win_length,
# self.hop_length,
# 65 if self.mel_fmin == 0 else self.mel_fmin,
# self.mel_fmax,
# )
# # import pyworld as pw
# # f0, _ = pw.dio(x.astype(np.float64), self.sample_rate,
# # frame_period=self.hop_length / self.sample_rate * 1000)
# pad = int((self.win_length / self.hop_length) / 2) # pad = int((self.win_length / self.hop_length) / 2)
# f0 = [0.0] * pad + f0 + [0.0] * pad # f0 = [0.0] * pad + f0 + [0.0] * pad
# f0 = np.pad(f0, (pad, pad), mode="constant", constant_values=0)
# f0 = np.array(f0, dtype=np.float32) # f0 = np.array(f0, dtype=np.float32)
f0, _, _ = librosa.pyin( # f01, _, _ = librosa.pyin(
x,
fmin=65 if self.mel_fmin == 0 else self.mel_fmin,
fmax=self.mel_fmax,
frame_length=self.win_length,
sr=self.sample_rate,
fill_na=0.0,
)
# f02 = librosa.yin(
# x, # x,
# fmin=65 if self.mel_fmin == 0 else self.mel_fmin, # fmin=65 if self.mel_fmin == 0 else self.mel_fmin,
# fmax=self.mel_fmax, # fmax=self.mel_fmax,
# frame_length=self.win_length, # frame_length=self.win_length,
# sr=self.sample_rate # sr=self.sample_rate,
# fill_na=0.0,
# ) # )
# spec = self.melspectrogram(x) # spec = self.melspectrogram(x)
# from matplotlib import pyplot as plt
# plt.figure()
# plt.plot(f0, linewidth=2.5, color='red')
# plt.plot(f01, linewidth=2.5, linestyle='-.')
# plt.plot(f02, linewidth=2.5)
# plt.xlabel('time')
# plt.ylabel('F0')
# plt.savefig('save_img.png')
# # plt.figure()
# plt.imshow(spec, aspect="auto", origin="lower")
# plt.savefig('save_img2.png')
return f0 return f0
### Audio Processing ### ### Audio Processing ###

View File

@ -1,118 +0,0 @@
# adapted from https://github.com/patriceguyot/Yin
import numpy as np
def differenceFunction(x, N, tau_max):
    """
    Compute difference function of data x. This corresponds to equation (6) in [1].

    The sum of squared differences is expanded into two energy terms (taken
    from a cumulative sum of squares) minus twice the autocorrelation, and the
    autocorrelation is obtained with Numpy's real FFT.

    :param x: audio data
    :param N: length of data (not used; the length is read from ``x``, the
        parameter is kept so existing callers keep working)
    :param tau_max: integration window size; clamped to the length of ``x``
    :return: difference function, one value per lag in ``[0, tau_max)``
    :rtype: numpy.ndarray
    """
    x = np.array(x, np.float64)
    w = x.size
    tau_max = min(tau_max, w)
    # x_cumsum[k] is the energy of the first k samples.
    x_cumsum = np.concatenate((np.array([0.0]), (x * x).cumsum()))
    # Zero-pad to at least w + tau_max so the circular correlation computed by
    # the FFT does not wrap around for any of the lags kept below.
    size = w + tau_max
    p2 = (size // 32).bit_length()
    nice_numbers = (16, 18, 20, 24, 25, 27, 30, 32)
    size_pad = min(k * 2 ** p2 for k in nice_numbers if k * 2 ** p2 >= size)
    fc = np.fft.rfft(x, size_pad)
    # n must be given explicitly: for short inputs size_pad can be odd (25 or
    # 27), and irfft otherwise assumes an even output length, which yields a
    # wrong autocorrelation.
    conv = np.fft.irfft(fc * fc.conjugate(), n=size_pad)[:tau_max]
    return x_cumsum[w : w - tau_max : -1] + x_cumsum[w] - x_cumsum[:tau_max] - 2 * conv
def cumulativeMeanNormalizedDifferenceFunction(df, N):
    """
    Compute cumulative mean normalized difference function (CMND).

    This corresponds to equation (8) in [1]. Every value of the difference
    function at lag ``tau >= 1`` is multiplied by ``tau`` and divided by the
    running sum of the difference function over lags ``1..tau``; the value at
    lag 0 is set to 1.

    :param df: Difference function
    :param N: length of data
    :return: cumulative mean normalized difference function
    :rtype: numpy.ndarray
    """
    tail = df[1:]
    lags = np.arange(1, N)
    running_sum = np.cumsum(tail).astype(float)
    normalized = tail * lags / running_sum
    # Lag 0 is defined as 1 and goes in front of the normalized values.
    return np.concatenate(([1.0], normalized))
def getPitch(cmdf, tau_min, tau_max, harmo_th=0.1):
    """
    Return fundamental period of a frame based on CMND function.

    The lags in ``[tau_min, tau_max)`` are scanned in increasing order. At the
    first lag whose CMND value is below the threshold, the search follows the
    strictly decreasing values to the local minimum and returns that lag.

    :param cmdf: Cumulative Mean Normalized Difference function
    :param tau_min: minimum period for speech
    :param tau_max: maximum period for speech
    :param harmo_th: harmonicity threshold to determine if it is necessary to compute pitch frequency
    :return: fundamental period if there are values under the threshold, 0 otherwise
    :rtype: int
    """
    for start in range(tau_min, tau_max):
        if cmdf[start] < harmo_th:
            best = start
            # Slide down to the bottom of the dip that crossed the threshold.
            for following in range(start + 1, tau_max):
                if not cmdf[following] < cmdf[best]:
                    break
                best = following
            return best
    return 0  # if unvoiced
def compute_yin(sig, sr, w_len=512, w_step=256, f0_min=100, f0_max=500, harmo_thresh=0.1):
    """
    Compute the Yin Algorithm. Return fundamental frequency and harmonic rate.

    :param sig: Audio signal (list of float)
    :param sr: sampling rate (int)
    :param w_len: size of the analysis window (samples)
    :param w_step: size of the lag between two consecutive windows (samples)
    :param f0_min: Minimum fundamental frequency that can be detected (hertz)
    :param f0_max: Maximum fundamental frequency that can be detected (hertz)
    :param harmo_thresh: Threshold of detection. The algorithm returns the first minimum of the CMND function below this threshold.
    :returns:
        * pitches: list of fundamental frequencies,
        * harmonic_rates: list of harmonic rate values for each fundamental frequency value (= confidence value)
        * argmins: minimums of the Cumulative Mean Normalized DifferenceFunction
        * times: list of time of each estimation
    :rtype: tuple
    """
    tau_min = int(sr / f0_max)
    # The largest usable lag is bounded by the analysis window: the difference
    # function is clamped to the frame length, so a larger tau_max would make
    # the CMND normalisation fail on mismatched array shapes.
    tau_max = min(int(sr / f0_min), w_len)

    frame_starts = range(0, len(sig) - w_len, w_step)  # start sample of each analysis window
    times = [start / float(sr) for start in frame_starts]

    n_frames = len(frame_starts)
    pitches = [0.0] * n_frames
    harmonic_rates = [0.0] * n_frames
    argmins = [0.0] * n_frames

    for i, start in enumerate(frame_starts):
        frame = sig[start : start + w_len]

        # Compute YIN
        df = differenceFunction(frame, w_len, tau_max)
        cmdf = cumulativeMeanNormalizedDifferenceFunction(df, tau_max)
        p = getPitch(cmdf, tau_min, tau_max, harmo_thresh)

        # Get results
        lowest = np.argmin(cmdf)
        if lowest > tau_min:
            argmins[i] = float(sr / lowest)
        if p != 0:  # A pitch was found
            pitches[i] = float(sr / p)
            harmonic_rates[i] = cmdf[p]
        else:  # No pitch, but we compute a value of the harmonic rate
            harmonic_rates[i] = min(cmdf)

    return pitches, harmonic_rates, argmins, times

View File

@ -11,7 +11,6 @@ def to_camel(text):
def setup_model(config: Coqpit): def setup_model(config: Coqpit):
"""Load models directly from configuration.""" """Load models directly from configuration."""
print(" > Vocoder Model: {}".format(config.model))
if "discriminator_model" in config and "generator_model" in config: if "discriminator_model" in config and "generator_model" in config:
MyModel = importlib.import_module("TTS.vocoder.models.gan") MyModel = importlib.import_module("TTS.vocoder.models.gan")
MyModel = getattr(MyModel, "GAN") MyModel = getattr(MyModel, "GAN")
@ -28,6 +27,7 @@ def setup_model(config: Coqpit):
MyModel = getattr(MyModel, to_camel(config.model)) MyModel = getattr(MyModel, to_camel(config.model))
except ModuleNotFoundError as e: except ModuleNotFoundError as e:
raise ValueError(f"Model {config.model} not exist!") from e raise ValueError(f"Model {config.model} not exist!") from e
print(" > Vocoder Model: {}".format(config.model))
model = MyModel(config) model = MyModel(config)
return model return model

View File

@ -43,6 +43,7 @@ config = FastPitchConfig(
epochs=1000, epochs=1000,
text_cleaner="english_cleaners", text_cleaner="english_cleaners",
use_phonemes=True, use_phonemes=True,
use_espeak_phonemes=False,
phoneme_language="en-us", phoneme_language="en-us",
phoneme_cache_path=os.path.join(output_path, "phoneme_cache"), phoneme_cache_path=os.path.join(output_path, "phoneme_cache"),
print_step=50, print_step=50,

View File

@ -25,3 +25,4 @@ unidic-lite==1.0.8
# gruut+supported langs # gruut+supported langs
gruut[cs,de,es,fr,it,nl,pt,ru,sv]~=1.2.0 gruut[cs,de,es,fr,it,nl,pt,ru,sv]~=1.2.0
fsspec>=2021.04.0 fsspec>=2021.04.0
pyworld

View File

@ -7,7 +7,10 @@ from TTS.utils.generic_utils import get_cuda
def get_device_id(): def get_device_id():
use_cuda, _ = get_cuda() use_cuda, _ = get_cuda()
if use_cuda: if use_cuda:
GPU_ID = "0" if 'CUDA_VISIBLE_DEVICES' in os.environ and os.environ['CUDA_VISIBLE_DEVICES'] != "":
GPU_ID = os.environ['CUDA_VISIBLE_DEVICES'].split(',')[0]
else:
GPU_ID = "0"
else: else:
GPU_ID = "" GPU_ID = ""
return GPU_ID return GPU_ID