mirror of https://github.com/coqui-ai/TTS.git
Use torchaudio for ResNet speaker encoder
This commit is contained in:
parent
84bbe02839
commit
4b06e3e232
|
@@ -5,12 +5,10 @@ from torch import nn
|
||||||
from TTS.utils.audio import TorchSTFT
|
from TTS.utils.audio import TorchSTFT
|
||||||
from TTS.utils.io import load_fsspec
|
from TTS.utils.io import load_fsspec
|
||||||
|
|
||||||
# import torchaudio
|
import torchaudio
|
||||||
|
|
||||||
|
|
||||||
|
class PreEmphasis(nn.Module):
|
||||||
|
|
||||||
class PreEmphasis(torch.nn.Module):
|
|
||||||
def __init__(self, coefficient=0.97):
|
def __init__(self, coefficient=0.97):
|
||||||
super().__init__()
|
super().__init__()
|
||||||
self.coefficient = coefficient
|
self.coefficient = coefficient
|
||||||
|
@@ -114,29 +112,29 @@ class ResNetSpeakerEncoder(nn.Module):
|
||||||
if self.use_torch_spec:
|
if self.use_torch_spec:
|
||||||
self.torch_spec = torch.nn.Sequential(
|
self.torch_spec = torch.nn.Sequential(
|
||||||
PreEmphasis(audio_config["preemphasis"]),
|
PreEmphasis(audio_config["preemphasis"]),
|
||||||
TorchSTFT(
|
# TorchSTFT(
|
||||||
n_fft=audio_config["fft_size"],
|
# n_fft=audio_config["fft_size"],
|
||||||
hop_length=audio_config["hop_length"],
|
# hop_length=audio_config["hop_length"],
|
||||||
win_length=audio_config["win_length"],
|
# win_length=audio_config["win_length"],
|
||||||
sample_rate=audio_config["sample_rate"],
|
# sample_rate=audio_config["sample_rate"],
|
||||||
window="hamming_window",
|
# window="hamming_window",
|
||||||
mel_fmin=0.0,
|
# mel_fmin=0.0,
|
||||||
mel_fmax=None,
|
# mel_fmax=None,
|
||||||
use_htk=True,
|
# use_htk=True,
|
||||||
do_amp_to_db=False,
|
# do_amp_to_db=False,
|
||||||
n_mels=audio_config["num_mels"],
|
# n_mels=audio_config["num_mels"],
|
||||||
power=2.0,
|
# power=2.0,
|
||||||
use_mel=True,
|
# use_mel=True,
|
||||||
mel_norm=None,
|
# mel_norm=None,
|
||||||
),
|
# )
|
||||||
"""torchaudio.transforms.MelSpectrogram(
|
torchaudio.transforms.MelSpectrogram(
|
||||||
sample_rate=audio_config["sample_rate"],
|
sample_rate=audio_config["sample_rate"],
|
||||||
n_fft=audio_config["fft_size"],
|
n_fft=audio_config["fft_size"],
|
||||||
win_length=audio_config["win_length"],
|
win_length=audio_config["win_length"],
|
||||||
hop_length=audio_config["hop_length"],
|
hop_length=audio_config["hop_length"],
|
||||||
window_fn=torch.hamming_window,
|
window_fn=torch.hamming_window,
|
||||||
n_mels=audio_config["num_mels"],
|
n_mels=audio_config["num_mels"],
|
||||||
),""",
|
)
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
self.torch_spec = None
|
self.torch_spec = None
|
||||||
|
|
Loading…
Reference in New Issue