mirror of https://github.com/coqui-ai/TTS.git
Update import statements
parent 9f8d86b716
commit 5f9d559419
@@ -1,7 +1,7 @@
 import torch
 from torch import nn

-# from TTS.utils.audio import TorchSTFT
+# from TTS.utils.audio.torch_transforms import TorchSTFT
 from TTS.encoder.models.base_encoder import BaseEncoder

@@ -155,10 +155,10 @@ def spec_to_mel(spec, n_fft, num_mels, sample_rate, fmin, fmax):
 def wav_to_mel(y, n_fft, num_mels, sample_rate, hop_length, win_length, fmin, fmax, center=False):
     """
     Args Shapes:
-        - y : :math:`[B, 1, T]`
+        - y : :math:`[B, 1, T_y]`

     Return Shapes:
-        - spec : :math:`[B,C,T]`
+        - spec : :math:`[B,C,T_spec]`
     """
     y = y.squeeze(1)
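For orientation, a minimal call sketch for wav_to_mel; the parameter values are illustrative assumptions, and the importing module is not shown in this diff, so wav_to_mel is assumed to be in scope:

    import torch

    y = torch.randn(4, 1, 22050)  # [B, 1, T_y], e.g. one second at 22.05 kHz
    mel = wav_to_mel(y, n_fft=1024, num_mels=80, sample_rate=22050,
                     hop_length=256, win_length=1024, fmin=0, fmax=None)
    # mel: [B, C, T_spec] with C == num_mels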
@@ -0,0 +1 @@
+from TTS.utils.audio.processor import AudioProcessor
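This new one-line file is the package re-export (assumed to be TTS/utils/audio/__init__.py, since the file name is not shown), so the old import path keeps resolving to the same class as the new canonical one:

    from TTS.utils.audio import AudioProcessor as AP_old            # via the re-export above
    from TTS.utils.audio.processor import AudioProcessor as AP_new  # new canonical path
    assert AP_old is AP_new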
@@ -1,177 +1,9 @@
 from typing import Dict, Tuple

 import librosa
 import numpy as np
 import pyworld as pw
 import scipy.io.wavfile
 import scipy.signal
 import soundfile as sf
-import torch
-from torch import nn

 from TTS.tts.utils.helpers import StandardScaler

-
-class TorchSTFT(nn.Module):  # pylint: disable=abstract-method
-    """Some of the audio processing functions using Torch for faster batch processing.
-
-    TODO: Merge this with audio.py
-
-    Args:
-        n_fft (int):
-            FFT window size for STFT.
-
-        hop_length (int):
-            number of frames between STFT columns.
-
-        win_length (int, optional):
-            STFT window length.
-
-        pad_wav (bool, optional):
-            If True, pad the audio with (n_fft - hop_length) / 2. Defaults to False.
-
-        window (str, optional):
-            The name of a function to create a window tensor that is applied/multiplied to each frame/window. Defaults to "hann_window".
-
-        sample_rate (int, optional):
-            target audio sampling rate. Defaults to None.
-
-        mel_fmin (int, optional):
-            minimum filter frequency for computing melspectrograms. Defaults to None.
-
-        mel_fmax (int, optional):
-            maximum filter frequency for computing melspectrograms. Defaults to None.
-
-        n_mels (int, optional):
-            number of melspectrogram dimensions. Defaults to None.
-
-        use_mel (bool, optional):
-            If True, compute melspectrograms instead of linear spectrograms. Defaults to False.
-
-        do_amp_to_db (bool, optional):
-            enable/disable amplitude to dB conversion of spectrograms. Defaults to False.
-
-        spec_gain (float, optional):
-            gain applied when converting amplitude to dB. Defaults to 1.0.
-
-        power (float, optional):
-            Exponent for the magnitude spectrogram, e.g., 1 for energy, 2 for power, etc. Defaults to None.
-
-        use_htk (bool, optional):
-            Use the HTK formula in the mel filter instead of Slaney.
-
-        mel_norm (None, 'slaney', or number, optional):
-            If 'slaney', divide the triangular mel weights by the width of the mel band
-            (area normalization).
-
-            If numeric, use `librosa.util.normalize` to normalize each filter to unit l_p norm.
-            See `librosa.util.normalize` for a full description of supported norm values
-            (including `+-np.inf`).
-
-            Otherwise, leave all the triangles aiming for a peak value of 1.0. Defaults to "slaney".
-    """
-
-    def __init__(
-        self,
-        n_fft,
-        hop_length,
-        win_length,
-        pad_wav=False,
-        window="hann_window",
-        sample_rate=None,
-        mel_fmin=0,
-        mel_fmax=None,
-        n_mels=80,
-        use_mel=False,
-        do_amp_to_db=False,
-        spec_gain=1.0,
-        power=None,
-        use_htk=False,
-        mel_norm="slaney",
-    ):
-        super().__init__()
-        self.n_fft = n_fft
-        self.hop_length = hop_length
-        self.win_length = win_length
-        self.pad_wav = pad_wav
-        self.sample_rate = sample_rate
-        self.mel_fmin = mel_fmin
-        self.mel_fmax = mel_fmax
-        self.n_mels = n_mels
-        self.use_mel = use_mel
-        self.do_amp_to_db = do_amp_to_db
-        self.spec_gain = spec_gain
-        self.power = power
-        self.use_htk = use_htk
-        self.mel_norm = mel_norm
-        self.window = nn.Parameter(getattr(torch, window)(win_length), requires_grad=False)
-        self.mel_basis = None
-        if use_mel:
-            self._build_mel_basis()
-
-    def __call__(self, x):
-        """Compute spectrogram frames by torch based stft.
-
-        Args:
-            x (Tensor): input waveform
-
-        Returns:
-            Tensor: spectrogram frames.
-
-        Shapes:
-            x: :math:`[B, T]` or :math:`[B, 1, T]`
-        """
-        if x.ndim == 2:
-            x = x.unsqueeze(1)
-        if self.pad_wav:
-            padding = int((self.n_fft - self.hop_length) / 2)
-            x = torch.nn.functional.pad(x, (padding, padding), mode="reflect")
-        # B x D x T x 2
-        o = torch.stft(
-            x.squeeze(1),
-            self.n_fft,
-            self.hop_length,
-            self.win_length,
-            self.window,
-            center=True,
-            pad_mode="reflect",  # compatible with audio.py
-            normalized=False,
-            onesided=True,
-            return_complex=False,
-        )
-        M = o[:, :, :, 0]
-        P = o[:, :, :, 1]
-        S = torch.sqrt(torch.clamp(M**2 + P**2, min=1e-8))
-
-        if self.power is not None:
-            S = S**self.power
-
-        if self.use_mel:
-            S = torch.matmul(self.mel_basis.to(x), S)
-        if self.do_amp_to_db:
-            S = self._amp_to_db(S, spec_gain=self.spec_gain)
-        return S
-
-    def _build_mel_basis(self):
-        mel_basis = librosa.filters.mel(
-            self.sample_rate,
-            self.n_fft,
-            n_mels=self.n_mels,
-            fmin=self.mel_fmin,
-            fmax=self.mel_fmax,
-            htk=self.use_htk,
-            norm=self.mel_norm,
-        )
-        self.mel_basis = torch.from_numpy(mel_basis).float()
-
-    @staticmethod
-    def _amp_to_db(x, spec_gain=1.0):
-        return torch.log(torch.clamp(x, min=1e-5) * spec_gain)
-
-    @staticmethod
-    def _db_to_amp(x, spec_gain=1.0):
-        return torch.exp(x) / spec_gain
-

 # pylint: disable=too-many-public-methods
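TorchSTFT is removed here because it now lives in TTS.utils.audio.torch_transforms, as the import updates elsewhere in this commit confirm. A minimal usage sketch with illustrative parameter values, matching the class shown above:

    import torch
    from TTS.utils.audio.torch_transforms import TorchSTFT

    stft = TorchSTFT(n_fft=1024, hop_length=256, win_length=1024,
                     sample_rate=22050, n_mels=80, use_mel=True, do_amp_to_db=True)
    wav = torch.randn(8, 22050)  # [B, T]; a [B, 1, T] input also works
    mel = stft(wav)              # [B, n_mels, T_frames] log-scale mel frames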
@@ -398,158 +230,6 @@ class AudioProcessor(object):
         return AudioProcessor(verbose=verbose, **config)

-    ### setting up the parameters ###
-    def _build_mel_basis(
-        self,
-    ) -> np.ndarray:
-        """Build melspectrogram basis.
-
-        Returns:
-            np.ndarray: melspectrogram basis.
-        """
-        if self.mel_fmax is not None:
-            assert self.mel_fmax <= self.sample_rate // 2
-        return librosa.filters.mel(
-            self.sample_rate, self.fft_size, n_mels=self.num_mels, fmin=self.mel_fmin, fmax=self.mel_fmax
-        )
-
-    def _stft_parameters(
-        self,
-    ) -> Tuple[int, int]:
-        """Compute the real STFT parameters from the time values.
-
-        Returns:
-            Tuple[int, int]: hop length and window length for STFT.
-        """
-        factor = self.frame_length_ms / self.frame_shift_ms
-        assert (factor).is_integer(), " [!] frame_shift_ms should divide frame_length_ms"
-        hop_length = int(self.frame_shift_ms / 1000.0 * self.sample_rate)
-        win_length = int(hop_length * factor)
-        return hop_length, win_length
-
-    ### normalization ###
-    def normalize(self, S: np.ndarray) -> np.ndarray:
-        """Normalize values into `[0, self.max_norm]` or `[-self.max_norm, self.max_norm]`.
-
-        Args:
-            S (np.ndarray): Spectrogram to normalize.
-
-        Raises:
-            RuntimeError: Mean and variance were computed from incompatible parameters.
-
-        Returns:
-            np.ndarray: Normalized spectrogram.
-        """
-        # pylint: disable=no-else-return
-        S = S.copy()
-        if self.signal_norm:
-            # mean-var scaling
-            if hasattr(self, "mel_scaler"):
-                if S.shape[0] == self.num_mels:
-                    return self.mel_scaler.transform(S.T).T
-                elif S.shape[0] == self.fft_size / 2:
-                    return self.linear_scaler.transform(S.T).T
-                else:
-                    raise RuntimeError(" [!] Mean-Var stats do not match the given feature dimensions.")
-            # range normalization
-            S -= self.ref_level_db  # discard certain range of dB assuming it is air noise
-            S_norm = (S - self.min_level_db) / (-self.min_level_db)
-            if self.symmetric_norm:
-                S_norm = ((2 * self.max_norm) * S_norm) - self.max_norm
-                if self.clip_norm:
-                    S_norm = np.clip(
-                        S_norm, -self.max_norm, self.max_norm  # pylint: disable=invalid-unary-operand-type
-                    )
-                return S_norm
-            else:
-                S_norm = self.max_norm * S_norm
-                if self.clip_norm:
-                    S_norm = np.clip(S_norm, 0, self.max_norm)
-                return S_norm
-        else:
-            return S
-
-    def denormalize(self, S: np.ndarray) -> np.ndarray:
-        """Denormalize spectrogram values.
-
-        Args:
-            S (np.ndarray): Spectrogram to denormalize.
-
-        Raises:
-            RuntimeError: Mean and variance are incompatible.
-
-        Returns:
-            np.ndarray: Denormalized spectrogram.
-        """
-        # pylint: disable=no-else-return
-        S_denorm = S.copy()
-        if self.signal_norm:
-            # mean-var scaling
-            if hasattr(self, "mel_scaler"):
-                if S_denorm.shape[0] == self.num_mels:
-                    return self.mel_scaler.inverse_transform(S_denorm.T).T
-                elif S_denorm.shape[0] == self.fft_size / 2:
-                    return self.linear_scaler.inverse_transform(S_denorm.T).T
-                else:
-                    raise RuntimeError(" [!] Mean-Var stats do not match the given feature dimensions.")
-            if self.symmetric_norm:
-                if self.clip_norm:
-                    S_denorm = np.clip(
-                        S_denorm, -self.max_norm, self.max_norm  # pylint: disable=invalid-unary-operand-type
-                    )
-                S_denorm = ((S_denorm + self.max_norm) * -self.min_level_db / (2 * self.max_norm)) + self.min_level_db
-                return S_denorm + self.ref_level_db
-            else:
-                if self.clip_norm:
-                    S_denorm = np.clip(S_denorm, 0, self.max_norm)
-                S_denorm = (S_denorm * -self.min_level_db / self.max_norm) + self.min_level_db
-                return S_denorm + self.ref_level_db
-        else:
-            return S_denorm
-
-    ### Mean-STD scaling ###
-    def load_stats(self, stats_path: str) -> Tuple[np.array, np.array, np.array, np.array, Dict]:
-        """Load mean and variance statistics from a `npy` file.
-
-        Args:
-            stats_path (str): Path to the `npy` file containing the stats.
-
-        Returns:
-            Tuple[np.array, np.array, np.array, np.array, Dict]: loaded statistics and the config used to
-                compute them.
-        """
-        stats = np.load(stats_path, allow_pickle=True).item()  # pylint: disable=unexpected-keyword-arg
-        mel_mean = stats["mel_mean"]
-        mel_std = stats["mel_std"]
-        linear_mean = stats["linear_mean"]
-        linear_std = stats["linear_std"]
-        stats_config = stats["audio_config"]
-        # check all audio parameters used for computing stats
-        skip_parameters = ["griffin_lim_iters", "stats_path", "do_trim_silence", "ref_level_db", "power"]
-        for key in stats_config.keys():
-            if key in skip_parameters:
-                continue
-            if key not in ["sample_rate", "trim_db"]:
-                assert (
-                    stats_config[key] == self.__dict__[key]
-                ), f" [!] Audio param {key} does not match the value used for computing mean-var stats. {stats_config[key]} vs {self.__dict__[key]}"
-        return mel_mean, mel_std, linear_mean, linear_std, stats_config
-
-    # pylint: disable=attribute-defined-outside-init
-    def setup_scaler(
-        self, mel_mean: np.ndarray, mel_std: np.ndarray, linear_mean: np.ndarray, linear_std: np.ndarray
-    ) -> None:
-        """Initialize scaler objects used in mean-std normalization.
-
-        Args:
-            mel_mean (np.ndarray): Mean for melspectrograms.
-            mel_std (np.ndarray): STD for melspectrograms.
-            linear_mean (np.ndarray): Mean for full-scale spectrograms.
-            linear_std (np.ndarray): STD for full-scale spectrograms.
-        """
-        self.mel_scaler = StandardScaler()
-        self.mel_scaler.set_stats(mel_mean, mel_std)
-        self.linear_scaler = StandardScaler()
-        self.linear_scaler.set_stats(linear_mean, linear_std)
-
     ### DB and AMP conversion ###
     # pylint: disable=no-self-use
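The removed methods implement mean-variance normalization driven by a precomputed stats file. Roughly, they were wired together as sketched below; the stats path and the mel variable are placeholders:

    ap = AudioProcessor(**config)
    mel_mean, mel_std, linear_mean, linear_std, stats_config = ap.load_stats("scale_stats.npy")
    ap.setup_scaler(mel_mean, mel_std, linear_mean, linear_std)
    mel_norm = ap.normalize(mel)         # routed through mel_scaler because mel.shape[0] == num_mels
    mel_back = ap.denormalize(mel_norm)  # inverse transform restores the original values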
@@ -737,8 +417,7 @@ class AudioProcessor(object):
         Examples:
             >>> WAV_FILE = filename = librosa.util.example_audio_file()
             >>> from TTS.config import BaseAudioConfig
-            >>> from TTS.utils.audio import AudioProcessor
-            >>> conf = BaseAudioConfig(pitch_fmax=8000)
+            >>> from TTS.utils.audio.processor import AudioProcessor
+            >>> conf = BaseAudioConfig(pitch_fmax=8000)
             >>> ap = AudioProcessor(**conf)
             >>> wav = ap.load_wav(WAV_FILE, sr=22050)[:5 * 22050]
             >>> pitch = ap.compute_f0(wav)
@@ -913,15 +592,3 @@ class AudioProcessor(object):
 def dequantize(x, bits):
     """Dequantize a waveform from the given number of bits."""
     return 2 * x / (2**bits - 1) - 1
-
-
-def _log(x, base):
-    if base == 10:
-        return np.log10(x)
-    return np.log(x)
-
-
-def _exp(x, base):
-    if base == 10:
-        return np.power(10, x)
-    return np.exp(x)
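dequantize maps bits-bit integer samples in [0, 2**bits - 1] back to floats in [-1, 1]. A quick round-trip check; the quantize helper here is an assumed inverse written from the formula above, not part of this diff, and the import path assumes dequantize stays module-level in processor.py:

    import numpy as np
    from TTS.utils.audio.processor import dequantize  # assumed location

    def quantize(x, bits):
        # assumed inverse of dequantize above
        return np.round((x + 1.0) * (2**bits - 1) / 2)

    x = np.linspace(-1, 1, 5)
    assert np.allclose(dequantize(quantize(x, 10), 10), x, atol=1e-3)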
@@ -11,7 +11,7 @@ from TTS.tts.models import setup_model as setup_tts_model
 # pylint: disable=unused-wildcard-import
 # pylint: disable=wildcard-import
 from TTS.tts.utils.synthesis import synthesis, transfer_voice, trim_silence
-from TTS.utils.audio import AudioProcessor
+from TTS.utils.audio.processor import AudioProcessor
 from TTS.vocoder.models import setup_model as setup_vocoder_model
 from TTS.vocoder.utils.generic_utils import interpolate_vocoder_input

@@ -4,7 +4,7 @@ import torch
 from torch import nn
 from torch.nn import functional as F

-from TTS.utils.audio import TorchSTFT
+from TTS.utils.audio.torch_transforms import TorchSTFT
 from TTS.vocoder.utils.distribution import discretized_mix_logistic_loss, gaussian_loss

 #################################
@@ -3,7 +3,7 @@ import torch.nn.functional as F
 from torch import nn
 from torch.nn.utils import spectral_norm, weight_norm

-from TTS.utils.audio import TorchSTFT
+from TTS.utils.audio.torch_transforms import TorchSTFT
 from TTS.vocoder.models.hifigan_discriminator import MultiPeriodDiscriminator

 LRELU_SLOPE = 0.1
@@ -7,7 +7,7 @@ from TTS.tts.configs.shared_configs import BaseDatasetConfig
 from TTS.tts.datasets import load_tts_samples
 from TTS.tts.models.align_tts import AlignTTS
 from TTS.tts.utils.text.tokenizer import TTSTokenizer
-from TTS.utils.audio import AudioProcessor
+from TTS.utils.audio.processor import AudioProcessor

 output_path = os.path.dirname(os.path.abspath(__file__))
@@ -7,7 +7,7 @@ from TTS.tts.configs.fast_pitch_config import FastPitchConfig
 from TTS.tts.datasets import load_tts_samples
 from TTS.tts.models.forward_tts import ForwardTTS
 from TTS.tts.utils.text.tokenizer import TTSTokenizer
-from TTS.utils.audio import AudioProcessor
+from TTS.utils.audio.processor import AudioProcessor
 from TTS.utils.manage import ModelManager

 output_path = os.path.dirname(os.path.abspath(__file__))
@@ -7,7 +7,7 @@ from TTS.tts.configs.fast_speech_config import FastSpeechConfig
 from TTS.tts.datasets import load_tts_samples
 from TTS.tts.models.forward_tts import ForwardTTS
 from TTS.tts.utils.text.tokenizer import TTSTokenizer
-from TTS.utils.audio import AudioProcessor
+from TTS.utils.audio.processor import AudioProcessor
 from TTS.utils.manage import ModelManager

 output_path = os.path.dirname(os.path.abspath(__file__))
@@ -12,7 +12,7 @@ from TTS.tts.configs.shared_configs import BaseDatasetConfig
 from TTS.tts.datasets import load_tts_samples
 from TTS.tts.models.glow_tts import GlowTTS
 from TTS.tts.utils.text.tokenizer import TTSTokenizer
-from TTS.utils.audio import AudioProcessor
+from TTS.utils.audio.processor import AudioProcessor

 # we use the same path as this script as our training folder.
 output_path = os.path.dirname(os.path.abspath(__file__))
@@ -2,7 +2,7 @@ import os

 from trainer import Trainer, TrainerArgs

-from TTS.utils.audio import AudioProcessor
+from TTS.utils.audio.processor import AudioProcessor
 from TTS.vocoder.configs import HifiganConfig
 from TTS.vocoder.datasets.preprocess import load_wav_data
 from TTS.vocoder.models.gan import GAN
@@ -2,7 +2,7 @@ import os

 from trainer import Trainer, TrainerArgs

-from TTS.utils.audio import AudioProcessor
+from TTS.utils.audio.processor import AudioProcessor
 from TTS.vocoder.configs import MultibandMelganConfig
 from TTS.vocoder.datasets.preprocess import load_wav_data
 from TTS.vocoder.models.gan import GAN
@@ -7,7 +7,7 @@ from TTS.tts.configs.speedy_speech_config import SpeedySpeechConfig
 from TTS.tts.datasets import load_tts_samples
 from TTS.tts.models.forward_tts import ForwardTTS
 from TTS.tts.utils.text.tokenizer import TTSTokenizer
-from TTS.utils.audio import AudioProcessor
+from TTS.utils.audio.processor import AudioProcessor

 output_path = os.path.dirname(os.path.abspath(__file__))
 dataset_config = BaseDatasetConfig(
@@ -8,7 +8,7 @@ from TTS.tts.configs.tacotron2_config import Tacotron2Config
 from TTS.tts.datasets import load_tts_samples
 from TTS.tts.models.tacotron2 import Tacotron2
 from TTS.tts.utils.text.tokenizer import TTSTokenizer
-from TTS.utils.audio import AudioProcessor
+from TTS.utils.audio.processor import AudioProcessor

 # from TTS.tts.datasets.tokenizer import Tokenizer

@@ -8,7 +8,7 @@ from TTS.tts.configs.tacotron2_config import Tacotron2Config
 from TTS.tts.datasets import load_tts_samples
 from TTS.tts.models.tacotron2 import Tacotron2
 from TTS.tts.utils.text.tokenizer import TTSTokenizer
-from TTS.utils.audio import AudioProcessor
+from TTS.utils.audio.processor import AudioProcessor

 # from TTS.tts.datasets.tokenizer import Tokenizer

@@ -2,7 +2,7 @@ import os

 from trainer import Trainer, TrainerArgs

-from TTS.utils.audio import AudioProcessor
+from TTS.utils.audio.processor import AudioProcessor
 from TTS.vocoder.configs import UnivnetConfig
 from TTS.vocoder.datasets.preprocess import load_wav_data
 from TTS.vocoder.models.gan import GAN
@@ -8,7 +8,7 @@ from TTS.tts.configs.vits_config import VitsConfig
 from TTS.tts.datasets import load_tts_samples
 from TTS.tts.models.vits import Vits
 from TTS.tts.utils.text.tokenizer import TTSTokenizer
-from TTS.utils.audio import AudioProcessor
+from TTS.utils.audio.processor import AudioProcessor

 output_path = os.path.dirname(os.path.abspath(__file__))
 dataset_config = BaseDatasetConfig(
@@ -37,7 +37,7 @@ config = VitsConfig(
     batch_size=32,
     eval_batch_size=16,
     batch_group_size=5,
-    num_loader_workers=0,
+    num_loader_workers=8,
     num_eval_loader_workers=4,
     run_eval=True,
     test_delay_epochs=-1,
@@ -2,7 +2,7 @@ import os

 from trainer import Trainer, TrainerArgs

-from TTS.utils.audio import AudioProcessor
+from TTS.utils.audio.processor import AudioProcessor
 from TTS.vocoder.configs import WavegradConfig
 from TTS.vocoder.datasets.preprocess import load_wav_data
 from TTS.vocoder.models.wavegrad import Wavegrad
@@ -2,7 +2,7 @@ import os

 from trainer import Trainer, TrainerArgs

-from TTS.utils.audio import AudioProcessor
+from TTS.utils.audio.processor import AudioProcessor
 from TTS.vocoder.configs import WavernnConfig
 from TTS.vocoder.datasets.preprocess import load_wav_data
 from TTS.vocoder.models.wavernn import Wavernn
@@ -11,7 +11,7 @@ from TTS.tts.models.vits import CharactersConfig, Vits, VitsArgs
 from TTS.tts.utils.languages import LanguageManager
 from TTS.tts.utils.speakers import SpeakerManager
 from TTS.tts.utils.text.tokenizer import TTSTokenizer
-from TTS.utils.audio import AudioProcessor
+from TTS.utils.audio.processor import AudioProcessor

 output_path = os.path.dirname(os.path.abspath(__file__))

@@ -8,7 +8,7 @@ from TTS.tts.datasets import load_tts_samples
 from TTS.tts.models.forward_tts import ForwardTTS
 from TTS.tts.utils.speakers import SpeakerManager
 from TTS.tts.utils.text.tokenizer import TTSTokenizer
-from TTS.utils.audio import AudioProcessor
+from TTS.utils.audio.processor import AudioProcessor

 output_path = os.path.dirname(os.path.abspath(__file__))
 dataset_config = BaseDatasetConfig(name="vctk", meta_file_train="", path=os.path.join(output_path, "../VCTK/"))
@@ -8,7 +8,7 @@ from TTS.tts.datasets import load_tts_samples
 from TTS.tts.models.forward_tts import ForwardTTS
 from TTS.tts.utils.speakers import SpeakerManager
 from TTS.tts.utils.text.tokenizer import TTSTokenizer
-from TTS.utils.audio import AudioProcessor
+from TTS.utils.audio.processor import AudioProcessor

 output_path = os.path.dirname(os.path.abspath(__file__))
 dataset_config = BaseDatasetConfig(name="vctk", meta_file_train="", path=os.path.join(output_path, "../VCTK/"))
@@ -9,7 +9,7 @@ from TTS.tts.datasets import load_tts_samples
 from TTS.tts.models.glow_tts import GlowTTS
 from TTS.tts.utils.speakers import SpeakerManager
 from TTS.tts.utils.text.tokenizer import TTSTokenizer
-from TTS.utils.audio import AudioProcessor
+from TTS.utils.audio.processor import AudioProcessor

 # set experiment paths
 output_path = os.path.dirname(os.path.abspath(__file__))
@@ -8,7 +8,7 @@ from TTS.tts.datasets import load_tts_samples
 from TTS.tts.models.forward_tts import ForwardTTS
 from TTS.tts.utils.speakers import SpeakerManager
 from TTS.tts.utils.text.tokenizer import TTSTokenizer
-from TTS.utils.audio import AudioProcessor
+from TTS.utils.audio.processor import AudioProcessor

 output_path = os.path.dirname(os.path.abspath(__file__))
 dataset_config = BaseDatasetConfig(name="vctk", meta_file_train="", path=os.path.join(output_path, "../VCTK/"))
@@ -9,7 +9,7 @@ from TTS.tts.datasets import load_tts_samples
 from TTS.tts.models.tacotron import Tacotron
 from TTS.tts.utils.speakers import SpeakerManager
 from TTS.tts.utils.text.tokenizer import TTSTokenizer
-from TTS.utils.audio import AudioProcessor
+from TTS.utils.audio.processor import AudioProcessor

 output_path = os.path.dirname(os.path.abspath(__file__))
 dataset_config = BaseDatasetConfig(name="vctk", meta_file_train="", path=os.path.join(output_path, "../VCTK/"))
@@ -9,7 +9,7 @@ from TTS.tts.datasets import load_tts_samples
 from TTS.tts.models.tacotron2 import Tacotron2
 from TTS.tts.utils.speakers import SpeakerManager
 from TTS.tts.utils.text.tokenizer import TTSTokenizer
-from TTS.utils.audio import AudioProcessor
+from TTS.utils.audio.processor import AudioProcessor

 output_path = os.path.dirname(os.path.abspath(__file__))
 dataset_config = BaseDatasetConfig(name="vctk", meta_file_train="", path=os.path.join(output_path, "../VCTK/"))
@@ -9,7 +9,7 @@ from TTS.tts.datasets import load_tts_samples
 from TTS.tts.models.tacotron2 import Tacotron2
 from TTS.tts.utils.speakers import SpeakerManager
 from TTS.tts.utils.text.tokenizer import TTSTokenizer
-from TTS.utils.audio import AudioProcessor
+from TTS.utils.audio.processor import AudioProcessor

 output_path = os.path.dirname(os.path.abspath(__file__))
 dataset_config = BaseDatasetConfig(name="vctk", meta_file_train="", path=os.path.join(output_path, "../VCTK/"))
@@ -9,7 +9,7 @@ from TTS.tts.datasets import load_tts_samples
 from TTS.tts.models.vits import Vits, VitsArgs
 from TTS.tts.utils.speakers import SpeakerManager
 from TTS.tts.utils.text.tokenizer import TTSTokenizer
-from TTS.utils.audio import AudioProcessor
+from TTS.utils.audio.processor import AudioProcessor

 output_path = os.path.dirname(os.path.abspath(__file__))
 dataset_config = BaseDatasetConfig(