mirror of https://github.com/coqui-ai/TTS.git
Use pyworld for pitch
This commit is contained in:
parent
c1513ec4cd
commit
2c4bbbf9b9
|
@ -2,6 +2,7 @@ from typing import Dict, Tuple
|
||||||
|
|
||||||
import librosa
|
import librosa
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
import pyworld as pw
|
||||||
import scipy.io.wavfile
|
import scipy.io.wavfile
|
||||||
import scipy.signal
|
import scipy.signal
|
||||||
import soundfile as sf
|
import soundfile as sf
|
||||||
|
@ -9,7 +10,6 @@ import torch
|
||||||
from torch import nn
|
from torch import nn
|
||||||
|
|
||||||
from TTS.tts.utils.data import StandardScaler
|
from TTS.tts.utils.data import StandardScaler
|
||||||
from TTS.utils.yin import compute_yin
|
|
||||||
|
|
||||||
|
|
||||||
class TorchSTFT(nn.Module): # pylint: disable=abstract-method
|
class TorchSTFT(nn.Module): # pylint: disable=abstract-method
|
||||||
|
@ -640,59 +640,28 @@ class AudioProcessor(object):
|
||||||
>>> wav = ap.load_wav(WAV_FILE, sr=22050)[:5 * 22050]
|
>>> wav = ap.load_wav(WAV_FILE, sr=22050)[:5 * 22050]
|
||||||
>>> pitch = ap.compute_f0(wav)
|
>>> pitch = ap.compute_f0(wav)
|
||||||
"""
|
"""
|
||||||
# f0, t = pw.dio(
|
f0, t = pw.dio(
|
||||||
# x.astype(np.double),
|
x.astype(np.double),
|
||||||
# fs=self.sample_rate,
|
fs=self.sample_rate,
|
||||||
# f0_ceil=self.mel_fmax,
|
f0_ceil=self.mel_fmax,
|
||||||
# frame_period=1000 * self.hop_length / self.sample_rate,
|
frame_period=1000 * self.hop_length / self.sample_rate,
|
||||||
# )
|
)
|
||||||
# f0 = pw.stonemask(x.astype(np.double), f0, t, self.sample_rate)
|
f0 = pw.stonemask(x.astype(np.double), f0, t, self.sample_rate)
|
||||||
# f0, _, _, _ = compute_yin(
|
|
||||||
# x,
|
|
||||||
# self.sample_rate,
|
|
||||||
# self.win_length,
|
|
||||||
# self.hop_length,
|
|
||||||
# 65 if self.mel_fmin == 0 else self.mel_fmin,
|
|
||||||
# self.mel_fmax,
|
|
||||||
# )
|
|
||||||
# # import pyworld as pw
|
|
||||||
# # f0, _ = pw.dio(x.astype(np.float64), self.sample_rate,
|
|
||||||
# # frame_period=self.hop_length / self.sample_rate * 1000)
|
|
||||||
# pad = int((self.win_length / self.hop_length) / 2)
|
# pad = int((self.win_length / self.hop_length) / 2)
|
||||||
# f0 = [0.0] * pad + f0 + [0.0] * pad
|
# f0 = [0.0] * pad + f0 + [0.0] * pad
|
||||||
|
# f0 = np.pad(f0, (pad, pad), mode="constant", constant_values=0)
|
||||||
# f0 = np.array(f0, dtype=np.float32)
|
# f0 = np.array(f0, dtype=np.float32)
|
||||||
|
|
||||||
f0, _, _ = librosa.pyin(
|
# f01, _, _ = librosa.pyin(
|
||||||
x,
|
|
||||||
fmin=65 if self.mel_fmin == 0 else self.mel_fmin,
|
|
||||||
fmax=self.mel_fmax,
|
|
||||||
frame_length=self.win_length,
|
|
||||||
sr=self.sample_rate,
|
|
||||||
fill_na=0.0,
|
|
||||||
)
|
|
||||||
|
|
||||||
# f02 = librosa.yin(
|
|
||||||
# x,
|
# x,
|
||||||
# fmin=65 if self.mel_fmin == 0 else self.mel_fmin,
|
# fmin=65 if self.mel_fmin == 0 else self.mel_fmin,
|
||||||
# fmax=self.mel_fmax,
|
# fmax=self.mel_fmax,
|
||||||
# frame_length=self.win_length,
|
# frame_length=self.win_length,
|
||||||
# sr=self.sample_rate
|
# sr=self.sample_rate,
|
||||||
|
# fill_na=0.0,
|
||||||
# )
|
# )
|
||||||
|
|
||||||
# spec = self.melspectrogram(x)
|
# spec = self.melspectrogram(x)
|
||||||
|
|
||||||
# from matplotlib import pyplot as plt
|
|
||||||
# plt.figure()
|
|
||||||
# plt.plot(f0, linewidth=2.5, color='red')
|
|
||||||
# plt.plot(f01, linewidth=2.5, linestyle='-.')
|
|
||||||
# plt.plot(f02, linewidth=2.5)
|
|
||||||
# plt.xlabel('time')
|
|
||||||
# plt.ylabel('F0')
|
|
||||||
# plt.savefig('save_img.png')
|
|
||||||
|
|
||||||
# # plt.figure()
|
|
||||||
# plt.imshow(spec, aspect="auto", origin="lower")
|
|
||||||
# plt.savefig('save_img2.png')
|
|
||||||
return f0
|
return f0
|
||||||
|
|
||||||
### Audio Processing ###
|
### Audio Processing ###
|
||||||
|
|
118
TTS/utils/yin.py
118
TTS/utils/yin.py
|
@ -1,118 +0,0 @@
|
||||||
# adapted from https://github.com/patriceguyot/Yin
|
|
||||||
|
|
||||||
import numpy as np
|
|
||||||
|
|
||||||
|
|
||||||
def differenceFunction(x, N, tau_max):
    """
    Compute the difference function of data x. This corresponds to equation (6) in [1].

    For every lag ``tau`` in ``0 .. tau_max - 1`` the result is
    ``sum_j (x[j] - x[j + tau]) ** 2`` over all ``j`` for which both samples
    exist. The cross term is obtained from an FFT-based autocorrelation and
    the squared terms from a running sum of the signal energy.

    :param x: audio data (one analysis frame)
    :param N: length of data. Not used: the length is read from ``x`` itself.
        The parameter is kept so existing callers keep working.
    :param tau_max: integration window size (number of lags); clamped to ``len(x)``
    :return: difference function, one value per lag
    :rtype: numpy.ndarray
    """

    x = np.array(x, np.float64)
    w = x.size
    tau_max = min(tau_max, w)
    # x_cumsum[k] holds the energy of the first k samples.
    x_cumsum = np.concatenate((np.array([0.0]), (x * x).cumsum()))
    # Pad so the circular correlation computed by the FFT does not wrap
    # around for any of the requested lags.
    size = w + tau_max
    p2 = (size // 32).bit_length()
    nice_numbers = (16, 18, 20, 24, 25, 27, 30, 32)
    size_pad = min(k * 2 ** p2 for k in nice_numbers if k * 2 ** p2 >= size)
    fc = np.fft.rfft(x, size_pad)
    # Pass the output length explicitly: without it irfft assumes an even
    # length, which yields wrong values when size_pad is odd (25 or 27).
    conv = np.fft.irfft(fc * fc.conjugate(), size_pad)[:tau_max]
    return x_cumsum[w : w - tau_max : -1] + x_cumsum[w] - x_cumsum[:tau_max] - 2 * conv
|
|
||||||
|
|
||||||
|
|
||||||
def cumulativeMeanNormalizedDifferenceFunction(df, N):
    """
    Compute cumulative mean normalized difference function (CMND).

    This corresponds to equation (8) in [1]. Lag 0 is fixed to 1; every other
    lag ``tau`` is ``df[tau] * tau / sum(df[1 : tau + 1])``.

    :param df: Difference function
    :param N: length of data. Not used: the number of lags is taken from
        ``df`` so that a mismatching ``N`` can no longer cause a shape error.
        The parameter is kept so existing callers keep working.
    :return: cumulative mean normalized difference function, same length as ``df``
        (length 1 when ``df`` is empty)
    :rtype: numpy.ndarray
    """

    df = np.asarray(df, dtype=np.float64)
    tail = df[1:]
    lags = np.arange(1, tail.size + 1)
    running_sum = np.cumsum(tail)
    # Where the running sum is zero (e.g. an all-silent frame) the ratio is
    # undefined; keep the neutral value 1 there instead of producing NaN/inf
    # and a RuntimeWarning.
    cmndf = np.ones_like(tail)
    np.divide(tail * lags, running_sum, out=cmndf, where=running_sum != 0)
    return np.insert(cmndf, 0, 1)
|
|
||||||
|
|
||||||
|
|
||||||
def getPitch(cmdf, tau_min, tau_max, harmo_th=0.1):
    """
    Return fundamental period of a frame based on CMND function.

    Scans lags from ``tau_min`` upwards. At the first lag whose CMND value is
    below ``harmo_th`` it keeps walking forward while the value is still
    decreasing and returns the lag where it stops.

    :param cmdf: Cumulative Mean Normalized Difference function
    :param tau_min: minimum period for speech (in samples / lag index)
    :param tau_max: maximum period for speech (exclusive lag index); clamped to ``len(cmdf)``
    :param harmo_th: harmonicity threshold to determine if it is necessary to compute pitch frequency
    :return: fundamental period (lag index) if there are values under the threshold, 0 otherwise
    :rtype: int
    """
    # Never index past the end of cmdf, even if the caller asks for more lags
    # than were computed.
    tau_max = min(tau_max, len(cmdf))
    tau = tau_min
    while tau < tau_max:
        if cmdf[tau] < harmo_th:
            # Slide down to the bottom of this dip.
            while tau + 1 < tau_max and cmdf[tau + 1] < cmdf[tau]:
                tau += 1
            return tau
        tau += 1

    return 0  # if unvoiced
|
|
||||||
|
|
||||||
|
|
||||||
def compute_yin(sig, sr, w_len=512, w_step=256, f0_min=100, f0_max=500, harmo_thresh=0.1):
    """
    Compute the Yin Algorithm. Return fundamental frequency and harmonic rate.

    :param sig: Audio signal (list of float)
    :param sr: sampling rate (int)
    :param w_len: size of the analysis window (samples)
    :param w_step: size of the lag between two consecutive windows (samples)
    :param f0_min: Minimum fundamental frequency that can be detected (hertz)
    :param f0_max: Maximum fundamental frequency that can be detected (hertz)
    :param harmo_thresh: Threshold of detection. The algorithm returns the first minimum of the CMND function below this threshold.

    :returns:

        * pitches: list of fundamental frequencies (0.0 where no pitch was found),
        * harmonic_rates: list of harmonic rate values for each fundamental frequency value (= confidence value)
        * argmins: ``sr / lag`` of the global minimum of the Cumulative Mean Normalized Difference
          function when that lag is above the minimum lag, 0.0 otherwise
        * times: list of time of each estimation (start of each window, in seconds)
    :rtype: tuple
    """

    tau_min = int(sr / f0_max)
    # A window can only resolve periods up to its own length. Without this
    # clamp a window shorter than sr / f0_min makes the difference function
    # shorter than the requested number of lags and the computation fails.
    tau_max = min(int(sr / f0_min), w_len)

    # NOTE(review): the stop value excludes a window starting exactly at
    # len(sig) - w_len, so a signal no longer than w_len yields no frames.
    # Left as is because callers may depend on the resulting frame count.
    timeScale = range(0, len(sig) - w_len, w_step)  # time values for each analysis window
    times = [t / float(sr) for t in timeScale]

    pitches = [0.0] * len(timeScale)
    harmonic_rates = [0.0] * len(timeScale)
    argmins = [0.0] * len(timeScale)

    for i, start in enumerate(timeScale):
        frame = sig[start : start + w_len]

        # Compute YIN
        df = differenceFunction(frame, w_len, tau_max)
        cmdf = cumulativeMeanNormalizedDifferenceFunction(df, tau_max)
        p = getPitch(cmdf, tau_min, tau_max, harmo_thresh)

        # Get results
        best_lag = np.argmin(cmdf)
        if best_lag > tau_min:
            argmins[i] = float(sr / best_lag)
        if p != 0:  # A pitch was found
            pitches[i] = float(sr / p)
            harmonic_rates[i] = cmdf[p]
        else:  # No pitch, but we compute a value of the harmonic rate
            harmonic_rates[i] = min(cmdf)

    return pitches, harmonic_rates, argmins, times
|
|
|
@ -11,7 +11,6 @@ def to_camel(text):
|
||||||
|
|
||||||
def setup_model(config: Coqpit):
|
def setup_model(config: Coqpit):
|
||||||
"""Load models directly from configuration."""
|
"""Load models directly from configuration."""
|
||||||
print(" > Vocoder Model: {}".format(config.model))
|
|
||||||
if "discriminator_model" in config and "generator_model" in config:
|
if "discriminator_model" in config and "generator_model" in config:
|
||||||
MyModel = importlib.import_module("TTS.vocoder.models.gan")
|
MyModel = importlib.import_module("TTS.vocoder.models.gan")
|
||||||
MyModel = getattr(MyModel, "GAN")
|
MyModel = getattr(MyModel, "GAN")
|
||||||
|
@ -28,6 +27,7 @@ def setup_model(config: Coqpit):
|
||||||
MyModel = getattr(MyModel, to_camel(config.model))
|
MyModel = getattr(MyModel, to_camel(config.model))
|
||||||
except ModuleNotFoundError as e:
|
except ModuleNotFoundError as e:
|
||||||
raise ValueError(f"Model {config.model} not exist!") from e
|
raise ValueError(f"Model {config.model} not exist!") from e
|
||||||
|
print(" > Vocoder Model: {}".format(config.model))
|
||||||
model = MyModel(config)
|
model = MyModel(config)
|
||||||
return model
|
return model
|
||||||
|
|
||||||
|
|
|
@ -43,6 +43,7 @@ config = FastPitchConfig(
|
||||||
epochs=1000,
|
epochs=1000,
|
||||||
text_cleaner="english_cleaners",
|
text_cleaner="english_cleaners",
|
||||||
use_phonemes=True,
|
use_phonemes=True,
|
||||||
|
use_espeak_phonemes=False,
|
||||||
phoneme_language="en-us",
|
phoneme_language="en-us",
|
||||||
phoneme_cache_path=os.path.join(output_path, "phoneme_cache"),
|
phoneme_cache_path=os.path.join(output_path, "phoneme_cache"),
|
||||||
print_step=50,
|
print_step=50,
|
||||||
|
|
|
@ -25,3 +25,4 @@ unidic-lite==1.0.8
|
||||||
# gruut+supported langs
|
# gruut+supported langs
|
||||||
gruut[cs,de,es,fr,it,nl,pt,ru,sv]~=1.2.0
|
gruut[cs,de,es,fr,it,nl,pt,ru,sv]~=1.2.0
|
||||||
fsspec>=2021.04.0
|
fsspec>=2021.04.0
|
||||||
|
pyworld
|
|
@ -7,7 +7,10 @@ from TTS.utils.generic_utils import get_cuda
|
||||||
def get_device_id():
    """Return the GPU id to use as a string, or an empty string without CUDA.

    When the first value returned by ``get_cuda()`` is truthy, the id is the
    first comma-separated entry of the ``CUDA_VISIBLE_DEVICES`` environment
    variable if that variable is set and non-empty, and ``"0"`` otherwise.
    """
    use_cuda, _ = get_cuda()
    if not use_cuda:
        return ""
    visible_devices = os.environ.get("CUDA_VISIBLE_DEVICES", "")
    if visible_devices != "":
        # Honour the caller's device restriction by picking its first entry.
        return visible_devices.split(",")[0]
    return "0"
|
||||||
|
|
Loading…
Reference in New Issue