Use torchaudio for ResNet speaker encoder

This commit is contained in:
Eren Gölge 2021-12-13 16:23:57 +00:00
parent 84bbe02839
commit 4b06e3e232
1 changed file with 19 additions and 21 deletions

View File

@@ -5,12 +5,10 @@ from torch import nn
from TTS.utils.audio import TorchSTFT from TTS.utils.audio import TorchSTFT
from TTS.utils.io import load_fsspec from TTS.utils.io import load_fsspec
# import torchaudio import torchaudio
class PreEmphasis(nn.Module):
class PreEmphasis(torch.nn.Module):
def __init__(self, coefficient=0.97): def __init__(self, coefficient=0.97):
super().__init__() super().__init__()
self.coefficient = coefficient self.coefficient = coefficient
@@ -114,29 +112,29 @@ class ResNetSpeakerEncoder(nn.Module):
if self.use_torch_spec: if self.use_torch_spec:
self.torch_spec = torch.nn.Sequential( self.torch_spec = torch.nn.Sequential(
PreEmphasis(audio_config["preemphasis"]), PreEmphasis(audio_config["preemphasis"]),
TorchSTFT( # TorchSTFT(
n_fft=audio_config["fft_size"], # n_fft=audio_config["fft_size"],
hop_length=audio_config["hop_length"], # hop_length=audio_config["hop_length"],
win_length=audio_config["win_length"], # win_length=audio_config["win_length"],
sample_rate=audio_config["sample_rate"], # sample_rate=audio_config["sample_rate"],
window="hamming_window", # window="hamming_window",
mel_fmin=0.0, # mel_fmin=0.0,
mel_fmax=None, # mel_fmax=None,
use_htk=True, # use_htk=True,
do_amp_to_db=False, # do_amp_to_db=False,
n_mels=audio_config["num_mels"], # n_mels=audio_config["num_mels"],
power=2.0, # power=2.0,
use_mel=True, # use_mel=True,
mel_norm=None, # mel_norm=None,
), # )
"""torchaudio.transforms.MelSpectrogram( torchaudio.transforms.MelSpectrogram(
sample_rate=audio_config["sample_rate"], sample_rate=audio_config["sample_rate"],
n_fft=audio_config["fft_size"], n_fft=audio_config["fft_size"],
win_length=audio_config["win_length"], win_length=audio_config["win_length"],
hop_length=audio_config["hop_length"], hop_length=audio_config["hop_length"],
window_fn=torch.hamming_window, window_fn=torch.hamming_window,
n_mels=audio_config["num_mels"], n_mels=audio_config["num_mels"],
),""", )
) )
else: else:
self.torch_spec = None self.torch_spec = None