mirror of https://github.com/coqui-ai/TTS.git
Compute F0 using librosa
This commit is contained in:
parent 165e5814af
commit fba257104d
@@ -22,6 +22,7 @@ class TTSDataset(Dataset):
         compute_linear_spec: bool,
         ap: AudioProcessor,
         meta_data: List[List],
+        compute_f0: bool = False,
         characters: Dict = None,
         custom_symbols: List = None,
         add_blank: bool = False,
@@ -54,6 +55,8 @@ class TTSDataset(Dataset):
             meta_data (list): List of dataset instances.

+            compute_f0 (bool): compute f0 if True. Defaults to False.
+
             characters (dict): `dict` of custom text characters used for converting texts to sequences.

             custom_symbols (list): List of custom symbols used for converting texts to sequences. Models using its own
@@ -103,6 +106,7 @@ class TTSDataset(Dataset):
         self.cleaners = text_cleaner
         self.compute_linear_spec = compute_linear_spec
         self.return_wav = return_wav
+        self.compute_f0 = compute_f0
         self.min_seq_len = min_seq_len
         self.max_seq_len = max_seq_len
         self.ap = ap
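The three hunks above are plain flag plumbing: a new keyword argument, its docstring entry, and the stored attribute that the collate path below consults. A toy standalone sketch of the pattern (an illustrative class, not the real TTSDataset signature):

# Illustrative only: mirrors how `compute_f0` is threaded from the
# constructor to the batching path, paying the f0 cost only when asked.
from typing import List
import numpy as np

class ToyDataset:
    def __init__(self, compute_f0: bool = False):
        self.compute_f0 = compute_f0

    def collate(self, wavs: List[np.ndarray]):
        pitch = None
        if self.compute_f0:
            # placeholder for self.ap.compute_f0(w); the real call is in the audio.py hunk below
            pitch = [np.zeros(1 + len(w) // 256, dtype="float32") for w in wavs]
        return pitch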
@@ -458,6 +462,16 @@ class TTSDataset(Dataset):
                 wav_padded[i, :, : w.shape[0]] = torch.from_numpy(w)
             wav_padded.transpose_(1, 2)

+            # compute f0
+            # TODO: compare perf in collate_fn vs in load_data
+            pitch = None
+            if self.compute_f0:
+                pitch = [self.ap.compute_f0(w).astype("float32") for w in wav]
+                pitch = prepare_tensor(pitch, self.outputs_per_step)
+                pitch = pitch.transpose(0, 2, 1)
+                assert mel.shape[1] == pitch.shape[1]
+                pitch = torch.FloatTensor(pitch).contiguous()
+
             # collate attention alignments
             if batch[0]["attn"] is not None:
                 attns = [batch[idx]["attn"].T for idx in ids_sorted_decreasing]
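The added block computes one f0 track per waveform and shapes the result like the spectrogram features. A minimal sketch of what the `prepare_tensor` step plausibly does here, judging from how it is called (the padding helper below is an assumption, not the library's actual implementation):

import numpy as np

def pad_pitch_batch(pitches, outputs_per_step=1):
    # Assumed behavior of prepare_tensor: pad each per-utterance f0 track to
    # the batch maximum, rounded up to a multiple of outputs_per_step so the
    # frames divide evenly into decoder steps.
    max_len = max(p.shape[0] for p in pitches)
    remainder = max_len % outputs_per_step
    pad_len = max_len + ((outputs_per_step - remainder) if remainder else 0)
    batch = np.zeros((len(pitches), 1, pad_len), dtype=np.float32)
    for i, p in enumerate(pitches):
        batch[i, 0, : p.shape[0]] = p
    return batch

pitches = [np.random.rand(173).astype("float32"), np.random.rand(140).astype("float32")]
batch = pad_pitch_batch(pitches, outputs_per_step=2)
print(batch.shape)                     # (2, 1, 174)
print(batch.transpose(0, 2, 1).shape)  # (2, 174, 1) -- the layout after `pitch.transpose(0, 2, 1)`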
@@ -623,9 +623,24 @@ class AudioProcessor(object):
             return 0, pad
         return pad // 2, pad // 2 + pad % 2

-    ### Compute F0 ###
-    # TODO: pw causes some dep issues
-    # def compute_f0(self, x):
+    def compute_f0(self, x: np.ndarray) -> np.ndarray:
+        """Compute pitch (f0) of a waveform using the same parameters used for computing melspectrogram.
+
+        Args:
+            x (np.ndarray): Waveform.
+
+        Returns:
+            np.ndarray: Pitch.
+
+        Examples:
+            >>> WAV_FILE = filename = librosa.util.example_audio_file()
+            >>> from TTS.config import BaseAudioConfig
+            >>> from TTS.utils.audio import AudioProcessor
+            >>> conf = BaseAudioConfig(mel_fmax=8000)
+            >>> ap = AudioProcessor(**conf)
+            >>> wav = ap.load_wav(WAV_FILE, sr=22050)[:5 * 22050]
+            >>> pitch = ap.compute_f0(wav)
+        """
         # f0, t = pw.dio(
         #     x.astype(np.double),
         #     fs=self.sample_rate,
@@ -633,7 +648,16 @@ class AudioProcessor(object):
         #     frame_period=1000 * self.hop_length / self.sample_rate,
         # )
         # f0 = pw.stonemask(x.astype(np.double), f0, t, self.sample_rate)
-        # return f0
+        # f0 = compute_yin(, self.sample_rate, self.hop_length, self.fft_size)
+        f0, _, _ = librosa.pyin(
+            x.astype(np.double),
+            fmin=65 if self.mel_fmin == 0 else self.mel_fmin,
+            fmax=self.mel_fmax,
+            frame_length=self.win_length,
+            sr=self.sample_rate,
+            fill_na=0.0,
+        )
+        return f0

     ### Audio Processing ###
     def find_endpoint(self, wav: np.ndarray, threshold_db=-40, min_silence_sec=0.8) -> int:
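The replacement swaps the commented-out pyworld path for librosa.pyin, reusing the mel analysis settings so pitch frames line up with spectrogram frames. A standalone sketch of the same call, assuming an illustrative 22050 Hz sample rate and 1024/256 window/hop sizes (these values are assumptions, not read from the diff):

import librosa
import numpy as np

sr, win_length, hop_length = 22050, 1024, 256  # illustrative analysis settings
y = librosa.tone(220.0, sr=sr, duration=1.0)   # synthetic 220 Hz test tone

f0, voiced_flag, voiced_prob = librosa.pyin(
    y.astype(np.double),
    fmin=65,                # mirrors `65 if self.mel_fmin == 0 else self.mel_fmin`
    fmax=8000,              # mirrors `self.mel_fmax`
    sr=sr,
    frame_length=win_length,
    hop_length=hop_length,  # pinned here; pyin defaults to frame_length // 4
    fill_na=0.0,            # unvoiced frames become 0.0 instead of NaN
)
print(f0.shape, np.median(f0[voiced_flag]))  # one value per analysis frame, ~220 Hz

Note that the diff's own call does not pass hop_length, so librosa.pyin falls back to its default of frame_length // 4; per-frame alignment with the melspectrogram therefore relies on hop_length equaling win_length // 4, which holds for common 1024/256 configurations.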
@@ -181,3 +181,10 @@ class TestAudio(unittest.TestCase):
         mel_norm = ap.melspectrogram(wav)
         mel_denorm = ap.denormalize(mel_norm)
         assert abs(mel_reference - mel_denorm).max() < 1e-4
+
+    def test_compute_f0(self):
+        ap = AudioProcessor(**conf)
+        wav = ap.load_wav(WAV_FILE)
+        pitch = ap.compute_f0(wav)
+        mel = ap.melspectrogram(wav)
+        assert pitch.shape[0] == mel.shape[1]
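The new test pins the f0 frame count to the mel frame count. That equality follows from centered framing: with center=True (librosa's default), both the STFT behind melspectrogram and pyin emit 1 + len(wav) // hop_length frames. A quick standalone check, under the same illustrative 22050/1024/256 settings assumed above:

import librosa

sr, n_fft, win_length, hop_length = 22050, 1024, 1024, 256  # illustrative values
y = librosa.tone(220.0, sr=sr, duration=1.0)

mel = librosa.feature.melspectrogram(
    y=y, sr=sr, n_fft=n_fft, win_length=win_length, hop_length=hop_length
)
f0, _, _ = librosa.pyin(
    y, fmin=65, fmax=8000, sr=sr, frame_length=win_length, hop_length=hop_length
)
# Both centered analyses produce 1 + n_samples // hop_length frames.
assert f0.shape[0] == mel.shape[1] == 1 + len(y) // hop_length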