mirror of https://github.com/coqui-ai/TTS.git
Compute F0 using librosa
This commit is contained in:
parent 165e5814af
commit fba257104d
@@ -22,6 +22,7 @@ class TTSDataset(Dataset):
         compute_linear_spec: bool,
         ap: AudioProcessor,
         meta_data: List[List],
+        compute_f0: bool = False,
         characters: Dict = None,
         custom_symbols: List = None,
         add_blank: bool = False,
@@ -54,6 +55,8 @@ class TTSDataset(Dataset):
             meta_data (list): List of dataset instances.

+            compute_f0 (bool): compute f0 if True. Defaults to False.
+
             characters (dict): `dict` of custom text characters used for converting texts to sequences.

             custom_symbols (list): List of custom symbols used for converting texts to sequences. Models using its own
@@ -103,6 +106,7 @@ class TTSDataset(Dataset):
         self.cleaners = text_cleaner
         self.compute_linear_spec = compute_linear_spec
         self.return_wav = return_wav
+        self.compute_f0 = compute_f0
         self.min_seq_len = min_seq_len
         self.max_seq_len = max_seq_len
         self.ap = ap
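The three hunks above are plain flag plumbing: a new keyword argument, its docstring entry, and the stored attribute that the collate path below consults. A toy standalone sketch of the pattern (an illustrative class, not the real TTSDataset signature):

# Illustrative only: mirrors how `compute_f0` is threaded from the
# constructor to the batching path, paying the f0 cost only when asked.
from typing import List
import numpy as np

class ToyDataset:
    def __init__(self, compute_f0: bool = False):
        self.compute_f0 = compute_f0

    def collate(self, wavs: List[np.ndarray]):
        pitch = None
        if self.compute_f0:
            # placeholder for self.ap.compute_f0(w); the real call is in the audio.py hunk below
            pitch = [np.zeros(1 + len(w) // 256, dtype="float32") for w in wavs]
        return pitch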
@@ -458,6 +462,16 @@ class TTSDataset(Dataset):
                 wav_padded[i, :, : w.shape[0]] = torch.from_numpy(w)
             wav_padded.transpose_(1, 2)

+            # compute f0
+            # TODO: compare perf in collate_fn vs in load_data
+            pitch = None
+            if self.compute_f0:
+                pitch = [self.ap.compute_f0(w).astype("float32") for w in wav]
+                pitch = prepare_tensor(pitch, self.outputs_per_step)
+                pitch = pitch.transpose(0, 2, 1)
+                assert mel.shape[1] == pitch.shape[1]
+                pitch = torch.FloatTensor(pitch).contiguous()
+
             # collate attention alignments
             if batch[0]["attn"] is not None:
                 attns = [batch[idx]["attn"].T for idx in ids_sorted_decreasing]
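The added block computes one f0 track per waveform and shapes the result like the spectrogram features. A minimal sketch of what the `prepare_tensor` step plausibly does here, judging from how it is called (the padding helper below is an assumption, not the library's actual implementation):

import numpy as np

def pad_pitch_batch(pitches, outputs_per_step=1):
    # Assumed behavior of prepare_tensor: pad each per-utterance f0 track to
    # the batch maximum, rounded up to a multiple of outputs_per_step so the
    # frames divide evenly into decoder steps.
    max_len = max(p.shape[0] for p in pitches)
    remainder = max_len % outputs_per_step
    pad_len = max_len + ((outputs_per_step - remainder) if remainder else 0)
    batch = np.zeros((len(pitches), 1, pad_len), dtype=np.float32)
    for i, p in enumerate(pitches):
        batch[i, 0, : p.shape[0]] = p
    return batch

pitches = [np.random.rand(173).astype("float32"), np.random.rand(140).astype("float32")]
batch = pad_pitch_batch(pitches, outputs_per_step=2)
print(batch.shape)                     # (2, 1, 174)
print(batch.transpose(0, 2, 1).shape)  # (2, 174, 1) -- the layout after `pitch.transpose(0, 2, 1)`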
@@ -623,9 +623,24 @@ class AudioProcessor(object):
             return 0, pad
         return pad // 2, pad // 2 + pad % 2

-    ### Compute F0 ###
-    # TODO: pw causes some dep issues
-    # def compute_f0(self, x):
+    def compute_f0(self, x: np.ndarray) -> np.ndarray:
+        """Compute pitch (f0) of a waveform using the same parameters used for computing melspectrogram.
+
+        Args:
+            x (np.ndarray): Waveform.
+
+        Returns:
+            np.ndarray: Pitch.
+
+        Examples:
+            >>> WAV_FILE = filename = librosa.util.example_audio_file()
+            >>> from TTS.config import BaseAudioConfig
+            >>> from TTS.utils.audio import AudioProcessor
+            >>> conf = BaseAudioConfig(mel_fmax=8000)
+            >>> ap = AudioProcessor(**conf)
+            >>> wav = ap.load_wav(WAV_FILE, sr=22050)[:5 * 22050]
+            >>> pitch = ap.compute_f0(wav)
+        """
         # f0, t = pw.dio(
         #     x.astype(np.double),
         #     fs=self.sample_rate,
@@ -633,7 +648,16 @@ class AudioProcessor(object):
         #     frame_period=1000 * self.hop_length / self.sample_rate,
         # )
         # f0 = pw.stonemask(x.astype(np.double), f0, t, self.sample_rate)
-        # return f0
+        # f0 = compute_yin(, self.sample_rate, self.hop_length, self.fft_size)
+        f0, _, _ = librosa.pyin(
+            x.astype(np.double),
+            fmin=65 if self.mel_fmin == 0 else self.mel_fmin,
+            fmax=self.mel_fmax,
+            frame_length=self.win_length,
+            sr=self.sample_rate,
+            fill_na=0.0,
+        )
+        return f0

     ### Audio Processing ###
     def find_endpoint(self, wav: np.ndarray, threshold_db=-40, min_silence_sec=0.8) -> int:
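The replacement swaps the commented-out pyworld path for librosa.pyin, reusing the mel analysis settings so pitch frames line up with spectrogram frames. A standalone sketch of the same call, assuming an illustrative 22050 Hz sample rate and 1024/256 window/hop sizes (these values are assumptions, not read from the diff):

import librosa
import numpy as np

sr, win_length, hop_length = 22050, 1024, 256  # illustrative analysis settings
y = librosa.tone(220.0, sr=sr, duration=1.0)   # synthetic 220 Hz test tone

f0, voiced_flag, voiced_prob = librosa.pyin(
    y.astype(np.double),
    fmin=65,                # mirrors `65 if self.mel_fmin == 0 else self.mel_fmin`
    fmax=8000,              # mirrors `self.mel_fmax`
    sr=sr,
    frame_length=win_length,
    hop_length=hop_length,  # pinned here; pyin defaults to frame_length // 4
    fill_na=0.0,            # unvoiced frames become 0.0 instead of NaN
)
print(f0.shape, np.median(f0[voiced_flag]))  # one value per analysis frame, ~220 Hz

Note that the diff's own call does not pass hop_length, so librosa.pyin falls back to its default of frame_length // 4; per-frame alignment with the melspectrogram therefore relies on hop_length equaling win_length // 4, which holds for common 1024/256 configurations.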
@@ -181,3 +181,10 @@ class TestAudio(unittest.TestCase):
         mel_norm = ap.melspectrogram(wav)
         mel_denorm = ap.denormalize(mel_norm)
         assert abs(mel_reference - mel_denorm).max() < 1e-4
+
+    def test_compute_f0(self):
+        ap = AudioProcessor(**conf)
+        wav = ap.load_wav(WAV_FILE)
+        pitch = ap.compute_f0(wav)
+        mel = ap.melspectrogram(wav)
+        assert pitch.shape[0] == mel.shape[1]
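The new test pins the f0 frame count to the mel frame count. That equality follows from centered framing: with center=True (librosa's default), both the STFT behind melspectrogram and pyin emit 1 + len(wav) // hop_length frames. A quick standalone check, under the same illustrative 22050/1024/256 settings assumed above:

import librosa

sr, n_fft, win_length, hop_length = 22050, 1024, 1024, 256  # illustrative values
y = librosa.tone(220.0, sr=sr, duration=1.0)

mel = librosa.feature.melspectrogram(
    y=y, sr=sr, n_fft=n_fft, win_length=win_length, hop_length=hop_length
)
f0, _, _ = librosa.pyin(
    y, fmin=65, fmax=8000, sr=sr, frame_length=win_length, hop_length=hop_length
)
# Both centered analyses produce 1 + n_samples // hop_length frames.
assert f0.shape[0] == mel.shape[1] == 1 + len(y) // hop_length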