From 5f9d559419d4feae9aedc57db422428884e4b5a1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Tue, 19 Apr 2022 09:16:03 +0200 Subject: [PATCH] Update import statements --- TTS/encoder/models/resnet.py | 2 +- TTS/tts/models/vits.py | 4 +- TTS/utils/audio/__init__.py | 1 + TTS/utils/{audio.py => audio/processor.py} | 335 +----------------- TTS/utils/synthesizer.py | 2 +- TTS/vocoder/layers/losses.py | 2 +- TTS/vocoder/models/univnet_discriminator.py | 2 +- recipes/ljspeech/align_tts/train_aligntts.py | 2 +- .../ljspeech/fast_pitch/train_fast_pitch.py | 2 +- .../ljspeech/fast_speech/train_fast_speech.py | 2 +- recipes/ljspeech/glow_tts/train_glowtts.py | 2 +- recipes/ljspeech/hifigan/train_hifigan.py | 2 +- .../train_multiband_melgan.py | 2 +- .../speedy_speech/train_speedy_speech.py | 2 +- .../tacotron2-DCA/train_tacotron_dca.py | 2 +- .../tacotron2-DDC/train_tacotron_ddc.py | 2 +- recipes/ljspeech/univnet/train.py | 2 +- recipes/ljspeech/vits_tts/train_vits.py | 4 +- recipes/ljspeech/wavegrad/train_wavegrad.py | 2 +- recipes/ljspeech/wavernn/train_wavernn.py | 2 +- .../multilingual/vits_tts/train_vits_tts.py | 2 +- recipes/vctk/fast_pitch/train_fast_pitch.py | 2 +- recipes/vctk/fast_speech/train_fast_speech.py | 2 +- recipes/vctk/glow_tts/train_glow_tts.py | 2 +- .../vctk/speedy_speech/train_speedy_speech.py | 2 +- .../vctk/tacotron-DDC/train_tacotron-DDC.py | 2 +- .../vctk/tacotron2-DDC/train_tacotron2-ddc.py | 2 +- recipes/vctk/tacotron2/train_tacotron2.py | 2 +- recipes/vctk/vits/train_vits.py | 2 +- 29 files changed, 31 insertions(+), 363 deletions(-) create mode 100644 TTS/utils/audio/__init__.py rename TTS/utils/{audio.py => audio/processor.py} (64%) diff --git a/TTS/encoder/models/resnet.py b/TTS/encoder/models/resnet.py index 84e9967f..e75ab6c4 100644 --- a/TTS/encoder/models/resnet.py +++ b/TTS/encoder/models/resnet.py @@ -1,7 +1,7 @@ import torch from torch import nn -# from TTS.utils.audio import TorchSTFT +# from TTS.utils.audio.torch_transforms import TorchSTFT from TTS.encoder.models.base_encoder import BaseEncoder diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py index cb83f7ca..a476e870 100644 --- a/TTS/tts/models/vits.py +++ b/TTS/tts/models/vits.py @@ -155,10 +155,10 @@ def spec_to_mel(spec, n_fft, num_mels, sample_rate, fmin, fmax): def wav_to_mel(y, n_fft, num_mels, sample_rate, hop_length, win_length, fmin, fmax, center=False): """ Args Shapes: - - y : :math:`[B, 1, T]` + - y : :math:`[B, 1, T_y]` Return Shapes: - - spec : :math:`[B,C,T]` + - spec : :math:`[B,C,T_spec]` """ y = y.squeeze(1) diff --git a/TTS/utils/audio/__init__.py b/TTS/utils/audio/__init__.py new file mode 100644 index 00000000..f18f2219 --- /dev/null +++ b/TTS/utils/audio/__init__.py @@ -0,0 +1 @@ +from TTS.utils.audio.processor import AudioProcessor diff --git a/TTS/utils/audio.py b/TTS/utils/audio/processor.py similarity index 64% rename from TTS/utils/audio.py rename to TTS/utils/audio/processor.py index fc9d1942..7186a7cb 100644 --- a/TTS/utils/audio.py +++ b/TTS/utils/audio/processor.py @@ -1,177 +1,9 @@ -from typing import Dict, Tuple - import librosa import numpy as np import pyworld as pw import scipy.io.wavfile import scipy.signal import soundfile as sf -import torch -from torch import nn - -from TTS.tts.utils.helpers import StandardScaler - - -class TorchSTFT(nn.Module): # pylint: disable=abstract-method - """Some of the audio processing funtions using Torch for faster batch processing. 
- - TODO: Merge this with audio.py - - Args: - - n_fft (int): - FFT window size for STFT. - - hop_length (int): - number of frames between STFT columns. - - win_length (int, optional): - STFT window length. - - pad_wav (bool, optional): - If True pad the audio with (n_fft - hop_length) / 2). Defaults to False. - - window (str, optional): - The name of a function to create a window tensor that is applied/multiplied to each frame/window. Defaults to "hann_window" - - sample_rate (int, optional): - target audio sampling rate. Defaults to None. - - mel_fmin (int, optional): - minimum filter frequency for computing melspectrograms. Defaults to None. - - mel_fmax (int, optional): - maximum filter frequency for computing melspectrograms. Defaults to None. - - n_mels (int, optional): - number of melspectrogram dimensions. Defaults to None. - - use_mel (bool, optional): - If True compute the melspectrograms otherwise. Defaults to False. - - do_amp_to_db_linear (bool, optional): - enable/disable amplitude to dB conversion of linear spectrograms. Defaults to False. - - spec_gain (float, optional): - gain applied when converting amplitude to DB. Defaults to 1.0. - - power (float, optional): - Exponent for the magnitude spectrogram, e.g., 1 for energy, 2 for power, etc. Defaults to None. - - use_htk (bool, optional): - Use HTK formula in mel filter instead of Slaney. - - mel_norm (None, 'slaney', or number, optional): - If 'slaney', divide the triangular mel weights by the width of the mel band - (area normalization). - - If numeric, use `librosa.util.normalize` to normalize each filter by to unit l_p norm. - See `librosa.util.normalize` for a full description of supported norm values - (including `+-np.inf`). - - Otherwise, leave all the triangles aiming for a peak value of 1.0. Defaults to "slaney". - """ - - def __init__( - self, - n_fft, - hop_length, - win_length, - pad_wav=False, - window="hann_window", - sample_rate=None, - mel_fmin=0, - mel_fmax=None, - n_mels=80, - use_mel=False, - do_amp_to_db=False, - spec_gain=1.0, - power=None, - use_htk=False, - mel_norm="slaney", - ): - super().__init__() - self.n_fft = n_fft - self.hop_length = hop_length - self.win_length = win_length - self.pad_wav = pad_wav - self.sample_rate = sample_rate - self.mel_fmin = mel_fmin - self.mel_fmax = mel_fmax - self.n_mels = n_mels - self.use_mel = use_mel - self.do_amp_to_db = do_amp_to_db - self.spec_gain = spec_gain - self.power = power - self.use_htk = use_htk - self.mel_norm = mel_norm - self.window = nn.Parameter(getattr(torch, window)(win_length), requires_grad=False) - self.mel_basis = None - if use_mel: - self._build_mel_basis() - - def __call__(self, x): - """Compute spectrogram frames by torch based stft. - - Args: - x (Tensor): input waveform - - Returns: - Tensor: spectrogram frames. 
- - Shapes: - x: [B x T] or [:math:`[B, 1, T]`] - """ - if x.ndim == 2: - x = x.unsqueeze(1) - if self.pad_wav: - padding = int((self.n_fft - self.hop_length) / 2) - x = torch.nn.functional.pad(x, (padding, padding), mode="reflect") - # B x D x T x 2 - o = torch.stft( - x.squeeze(1), - self.n_fft, - self.hop_length, - self.win_length, - self.window, - center=True, - pad_mode="reflect", # compatible with audio.py - normalized=False, - onesided=True, - return_complex=False, - ) - M = o[:, :, :, 0] - P = o[:, :, :, 1] - S = torch.sqrt(torch.clamp(M**2 + P**2, min=1e-8)) - - if self.power is not None: - S = S**self.power - - if self.use_mel: - S = torch.matmul(self.mel_basis.to(x), S) - if self.do_amp_to_db: - S = self._amp_to_db(S, spec_gain=self.spec_gain) - return S - - def _build_mel_basis(self): - mel_basis = librosa.filters.mel( - self.sample_rate, - self.n_fft, - n_mels=self.n_mels, - fmin=self.mel_fmin, - fmax=self.mel_fmax, - htk=self.use_htk, - norm=self.mel_norm, - ) - self.mel_basis = torch.from_numpy(mel_basis).float() - - @staticmethod - def _amp_to_db(x, spec_gain=1.0): - return torch.log(torch.clamp(x, min=1e-5) * spec_gain) - - @staticmethod - def _db_to_amp(x, spec_gain=1.0): - return torch.exp(x) / spec_gain # pylint: disable=too-many-public-methods @@ -398,158 +230,6 @@ class AudioProcessor(object): return AudioProcessor(verbose=verbose, **config) ### setting up the parameters ### - def _build_mel_basis( - self, - ) -> np.ndarray: - """Build melspectrogram basis. - - Returns: - np.ndarray: melspectrogram basis. - """ - if self.mel_fmax is not None: - assert self.mel_fmax <= self.sample_rate // 2 - return librosa.filters.mel( - self.sample_rate, self.fft_size, n_mels=self.num_mels, fmin=self.mel_fmin, fmax=self.mel_fmax - ) - - def _stft_parameters( - self, - ) -> Tuple[int, int]: - """Compute the real STFT parameters from the time values. - - Returns: - Tuple[int, int]: hop length and window length for STFT. - """ - factor = self.frame_length_ms / self.frame_shift_ms - assert (factor).is_integer(), " [!] frame_shift_ms should divide frame_length_ms" - hop_length = int(self.frame_shift_ms / 1000.0 * self.sample_rate) - win_length = int(hop_length * factor) - return hop_length, win_length - - ### normalization ### - def normalize(self, S: np.ndarray) -> np.ndarray: - """Normalize values into `[0, self.max_norm]` or `[-self.max_norm, self.max_norm]` - - Args: - S (np.ndarray): Spectrogram to normalize. - - Raises: - RuntimeError: Mean and variance is computed from incompatible parameters. - - Returns: - np.ndarray: Normalized spectrogram. - """ - # pylint: disable=no-else-return - S = S.copy() - if self.signal_norm: - # mean-var scaling - if hasattr(self, "mel_scaler"): - if S.shape[0] == self.num_mels: - return self.mel_scaler.transform(S.T).T - elif S.shape[0] == self.fft_size / 2: - return self.linear_scaler.transform(S.T).T - else: - raise RuntimeError(" [!] 
Mean-Var stats does not match the given feature dimensions.") - # range normalization - S -= self.ref_level_db # discard certain range of DB assuming it is air noise - S_norm = (S - self.min_level_db) / (-self.min_level_db) - if self.symmetric_norm: - S_norm = ((2 * self.max_norm) * S_norm) - self.max_norm - if self.clip_norm: - S_norm = np.clip( - S_norm, -self.max_norm, self.max_norm # pylint: disable=invalid-unary-operand-type - ) - return S_norm - else: - S_norm = self.max_norm * S_norm - if self.clip_norm: - S_norm = np.clip(S_norm, 0, self.max_norm) - return S_norm - else: - return S - - def denormalize(self, S: np.ndarray) -> np.ndarray: - """Denormalize spectrogram values. - - Args: - S (np.ndarray): Spectrogram to denormalize. - - Raises: - RuntimeError: Mean and variance are incompatible. - - Returns: - np.ndarray: Denormalized spectrogram. - """ - # pylint: disable=no-else-return - S_denorm = S.copy() - if self.signal_norm: - # mean-var scaling - if hasattr(self, "mel_scaler"): - if S_denorm.shape[0] == self.num_mels: - return self.mel_scaler.inverse_transform(S_denorm.T).T - elif S_denorm.shape[0] == self.fft_size / 2: - return self.linear_scaler.inverse_transform(S_denorm.T).T - else: - raise RuntimeError(" [!] Mean-Var stats does not match the given feature dimensions.") - if self.symmetric_norm: - if self.clip_norm: - S_denorm = np.clip( - S_denorm, -self.max_norm, self.max_norm # pylint: disable=invalid-unary-operand-type - ) - S_denorm = ((S_denorm + self.max_norm) * -self.min_level_db / (2 * self.max_norm)) + self.min_level_db - return S_denorm + self.ref_level_db - else: - if self.clip_norm: - S_denorm = np.clip(S_denorm, 0, self.max_norm) - S_denorm = (S_denorm * -self.min_level_db / self.max_norm) + self.min_level_db - return S_denorm + self.ref_level_db - else: - return S_denorm - - ### Mean-STD scaling ### - def load_stats(self, stats_path: str) -> Tuple[np.array, np.array, np.array, np.array, Dict]: - """Loading mean and variance statistics from a `npy` file. - - Args: - stats_path (str): Path to the `npy` file containing - - Returns: - Tuple[np.array, np.array, np.array, np.array, Dict]: loaded statistics and the config used to - compute them. - """ - stats = np.load(stats_path, allow_pickle=True).item() # pylint: disable=unexpected-keyword-arg - mel_mean = stats["mel_mean"] - mel_std = stats["mel_std"] - linear_mean = stats["linear_mean"] - linear_std = stats["linear_std"] - stats_config = stats["audio_config"] - # check all audio parameters used for computing stats - skip_parameters = ["griffin_lim_iters", "stats_path", "do_trim_silence", "ref_level_db", "power"] - for key in stats_config.keys(): - if key in skip_parameters: - continue - if key not in ["sample_rate", "trim_db"]: - assert ( - stats_config[key] == self.__dict__[key] - ), f" [!] Audio param {key} does not match the value used for computing mean-var stats. {stats_config[key]} vs {self.__dict__[key]}" - return mel_mean, mel_std, linear_mean, linear_std, stats_config - - # pylint: disable=attribute-defined-outside-init - def setup_scaler( - self, mel_mean: np.ndarray, mel_std: np.ndarray, linear_mean: np.ndarray, linear_std: np.ndarray - ) -> None: - """Initialize scaler objects used in mean-std normalization. - - Args: - mel_mean (np.ndarray): Mean for melspectrograms. - mel_std (np.ndarray): STD for melspectrograms. - linear_mean (np.ndarray): Mean for full scale spectrograms. - linear_std (np.ndarray): STD for full scale spectrograms. 
- """ - self.mel_scaler = StandardScaler() - self.mel_scaler.set_stats(mel_mean, mel_std) - self.linear_scaler = StandardScaler() - self.linear_scaler.set_stats(linear_mean, linear_std) ### DB and AMP conversion ### # pylint: disable=no-self-use @@ -737,8 +417,7 @@ class AudioProcessor(object): Examples: >>> WAV_FILE = filename = librosa.util.example_audio_file() >>> from TTS.config import BaseAudioConfig - >>> from TTS.utils.audio import AudioProcessor - >>> conf = BaseAudioConfig(pitch_fmax=8000) + >>> from TTS.utils.audio.processor import AudioProcessor >>> conf = BaseAudioConfig(pitch_fmax=8000) >>> ap = AudioProcessor(**conf) >>> wav = ap.load_wav(WAV_FILE, sr=22050)[:5 * 22050] >>> pitch = ap.compute_f0(wav) @@ -913,15 +592,3 @@ class AudioProcessor(object): def dequantize(x, bits): """Dequantize a waveform from the given number of bits.""" return 2 * x / (2**bits - 1) - 1 - - -def _log(x, base): - if base == 10: - return np.log10(x) - return np.log(x) - - -def _exp(x, base): - if base == 10: - return np.power(10, x) - return np.exp(x) diff --git a/TTS/utils/synthesizer.py b/TTS/utils/synthesizer.py index 1f33b53e..bce7528c 100644 --- a/TTS/utils/synthesizer.py +++ b/TTS/utils/synthesizer.py @@ -11,7 +11,7 @@ from TTS.tts.models import setup_model as setup_tts_model # pylint: disable=unused-wildcard-import # pylint: disable=wildcard-import from TTS.tts.utils.synthesis import synthesis, transfer_voice, trim_silence -from TTS.utils.audio import AudioProcessor +from TTS.utils.audio.processor import AudioProcessor from TTS.vocoder.models import setup_model as setup_vocoder_model from TTS.vocoder.utils.generic_utils import interpolate_vocoder_input diff --git a/TTS/vocoder/layers/losses.py b/TTS/vocoder/layers/losses.py index 848e292b..befc43cc 100644 --- a/TTS/vocoder/layers/losses.py +++ b/TTS/vocoder/layers/losses.py @@ -4,7 +4,7 @@ import torch from torch import nn from torch.nn import functional as F -from TTS.utils.audio import TorchSTFT +from TTS.utils.audio.torch_transforms import TorchSTFT from TTS.vocoder.utils.distribution import discretized_mix_logistic_loss, gaussian_loss ################################# diff --git a/TTS/vocoder/models/univnet_discriminator.py b/TTS/vocoder/models/univnet_discriminator.py index d6b0e5d5..34e2d1c2 100644 --- a/TTS/vocoder/models/univnet_discriminator.py +++ b/TTS/vocoder/models/univnet_discriminator.py @@ -3,7 +3,7 @@ import torch.nn.functional as F from torch import nn from torch.nn.utils import spectral_norm, weight_norm -from TTS.utils.audio import TorchSTFT +from TTS.utils.audio.torch_transforms import TorchSTFT from TTS.vocoder.models.hifigan_discriminator import MultiPeriodDiscriminator LRELU_SLOPE = 0.1 diff --git a/recipes/ljspeech/align_tts/train_aligntts.py b/recipes/ljspeech/align_tts/train_aligntts.py index 591b1509..da673e54 100644 --- a/recipes/ljspeech/align_tts/train_aligntts.py +++ b/recipes/ljspeech/align_tts/train_aligntts.py @@ -7,7 +7,7 @@ from TTS.tts.configs.shared_configs import BaseDatasetConfig from TTS.tts.datasets import load_tts_samples from TTS.tts.models.align_tts import AlignTTS from TTS.tts.utils.text.tokenizer import TTSTokenizer -from TTS.utils.audio import AudioProcessor +from TTS.utils.audio.processor import AudioProcessor output_path = os.path.dirname(os.path.abspath(__file__)) diff --git a/recipes/ljspeech/fast_pitch/train_fast_pitch.py b/recipes/ljspeech/fast_pitch/train_fast_pitch.py index a84658f3..694552fb 100644 --- a/recipes/ljspeech/fast_pitch/train_fast_pitch.py +++ 
b/recipes/ljspeech/fast_pitch/train_fast_pitch.py @@ -7,7 +7,7 @@ from TTS.tts.configs.fast_pitch_config import FastPitchConfig from TTS.tts.datasets import load_tts_samples from TTS.tts.models.forward_tts import ForwardTTS from TTS.tts.utils.text.tokenizer import TTSTokenizer -from TTS.utils.audio import AudioProcessor +from TTS.utils.audio.processor import AudioProcessor from TTS.utils.manage import ModelManager output_path = os.path.dirname(os.path.abspath(__file__)) diff --git a/recipes/ljspeech/fast_speech/train_fast_speech.py b/recipes/ljspeech/fast_speech/train_fast_speech.py index 0245dd93..2a525c58 100644 --- a/recipes/ljspeech/fast_speech/train_fast_speech.py +++ b/recipes/ljspeech/fast_speech/train_fast_speech.py @@ -7,7 +7,7 @@ from TTS.tts.configs.fast_speech_config import FastSpeechConfig from TTS.tts.datasets import load_tts_samples from TTS.tts.models.forward_tts import ForwardTTS from TTS.tts.utils.text.tokenizer import TTSTokenizer -from TTS.utils.audio import AudioProcessor +from TTS.utils.audio.processor import AudioProcessor from TTS.utils.manage import ModelManager output_path = os.path.dirname(os.path.abspath(__file__)) diff --git a/recipes/ljspeech/glow_tts/train_glowtts.py b/recipes/ljspeech/glow_tts/train_glowtts.py index a0b4ac48..85ca450c 100644 --- a/recipes/ljspeech/glow_tts/train_glowtts.py +++ b/recipes/ljspeech/glow_tts/train_glowtts.py @@ -12,7 +12,7 @@ from TTS.tts.configs.shared_configs import BaseDatasetConfig from TTS.tts.datasets import load_tts_samples from TTS.tts.models.glow_tts import GlowTTS from TTS.tts.utils.text.tokenizer import TTSTokenizer -from TTS.utils.audio import AudioProcessor +from TTS.utils.audio.processor import AudioProcessor # we use the same path as this script as our training folder. output_path = os.path.dirname(os.path.abspath(__file__)) diff --git a/recipes/ljspeech/hifigan/train_hifigan.py b/recipes/ljspeech/hifigan/train_hifigan.py index b4cbae63..c96f721b 100644 --- a/recipes/ljspeech/hifigan/train_hifigan.py +++ b/recipes/ljspeech/hifigan/train_hifigan.py @@ -2,7 +2,7 @@ import os from trainer import Trainer, TrainerArgs -from TTS.utils.audio import AudioProcessor +from TTS.utils.audio.processor import AudioProcessor from TTS.vocoder.configs import HifiganConfig from TTS.vocoder.datasets.preprocess import load_wav_data from TTS.vocoder.models.gan import GAN diff --git a/recipes/ljspeech/multiband_melgan/train_multiband_melgan.py b/recipes/ljspeech/multiband_melgan/train_multiband_melgan.py index 225f5a30..e8f3d066 100644 --- a/recipes/ljspeech/multiband_melgan/train_multiband_melgan.py +++ b/recipes/ljspeech/multiband_melgan/train_multiband_melgan.py @@ -2,7 +2,7 @@ import os from trainer import Trainer, TrainerArgs -from TTS.utils.audio import AudioProcessor +from TTS.utils.audio.processor import AudioProcessor from TTS.vocoder.configs import MultibandMelganConfig from TTS.vocoder.datasets.preprocess import load_wav_data from TTS.vocoder.models.gan import GAN diff --git a/recipes/ljspeech/speedy_speech/train_speedy_speech.py b/recipes/ljspeech/speedy_speech/train_speedy_speech.py index 1ab3db1c..9d51c36a 100644 --- a/recipes/ljspeech/speedy_speech/train_speedy_speech.py +++ b/recipes/ljspeech/speedy_speech/train_speedy_speech.py @@ -7,7 +7,7 @@ from TTS.tts.configs.speedy_speech_config import SpeedySpeechConfig from TTS.tts.datasets import load_tts_samples from TTS.tts.models.forward_tts import ForwardTTS from TTS.tts.utils.text.tokenizer import TTSTokenizer -from TTS.utils.audio import AudioProcessor +from 
TTS.utils.audio.processor import AudioProcessor output_path = os.path.dirname(os.path.abspath(__file__)) dataset_config = BaseDatasetConfig( diff --git a/recipes/ljspeech/tacotron2-DCA/train_tacotron_dca.py b/recipes/ljspeech/tacotron2-DCA/train_tacotron_dca.py index a9f253ea..ddd58bd6 100644 --- a/recipes/ljspeech/tacotron2-DCA/train_tacotron_dca.py +++ b/recipes/ljspeech/tacotron2-DCA/train_tacotron_dca.py @@ -8,7 +8,7 @@ from TTS.tts.configs.tacotron2_config import Tacotron2Config from TTS.tts.datasets import load_tts_samples from TTS.tts.models.tacotron2 import Tacotron2 from TTS.tts.utils.text.tokenizer import TTSTokenizer -from TTS.utils.audio import AudioProcessor +from TTS.utils.audio.processor import AudioProcessor # from TTS.tts.datasets.tokenizer import Tokenizer diff --git a/recipes/ljspeech/tacotron2-DDC/train_tacotron_ddc.py b/recipes/ljspeech/tacotron2-DDC/train_tacotron_ddc.py index 99089db8..fd30943a 100644 --- a/recipes/ljspeech/tacotron2-DDC/train_tacotron_ddc.py +++ b/recipes/ljspeech/tacotron2-DDC/train_tacotron_ddc.py @@ -8,7 +8,7 @@ from TTS.tts.configs.tacotron2_config import Tacotron2Config from TTS.tts.datasets import load_tts_samples from TTS.tts.models.tacotron2 import Tacotron2 from TTS.tts.utils.text.tokenizer import TTSTokenizer -from TTS.utils.audio import AudioProcessor +from TTS.utils.audio.processor import AudioProcessor # from TTS.tts.datasets.tokenizer import Tokenizer diff --git a/recipes/ljspeech/univnet/train.py b/recipes/ljspeech/univnet/train.py index 81d2b889..471333b3 100644 --- a/recipes/ljspeech/univnet/train.py +++ b/recipes/ljspeech/univnet/train.py @@ -2,7 +2,7 @@ import os from trainer import Trainer, TrainerArgs -from TTS.utils.audio import AudioProcessor +from TTS.utils.audio.processor import AudioProcessor from TTS.vocoder.configs import UnivnetConfig from TTS.vocoder.datasets.preprocess import load_wav_data from TTS.vocoder.models.gan import GAN diff --git a/recipes/ljspeech/vits_tts/train_vits.py b/recipes/ljspeech/vits_tts/train_vits.py index c070b3f1..203be22c 100644 --- a/recipes/ljspeech/vits_tts/train_vits.py +++ b/recipes/ljspeech/vits_tts/train_vits.py @@ -8,7 +8,7 @@ from TTS.tts.configs.vits_config import VitsConfig from TTS.tts.datasets import load_tts_samples from TTS.tts.models.vits import Vits from TTS.tts.utils.text.tokenizer import TTSTokenizer -from TTS.utils.audio import AudioProcessor +from TTS.utils.audio.processor import AudioProcessor output_path = os.path.dirname(os.path.abspath(__file__)) dataset_config = BaseDatasetConfig( @@ -37,7 +37,7 @@ config = VitsConfig( batch_size=32, eval_batch_size=16, batch_group_size=5, - num_loader_workers=0, + num_loader_workers=8, num_eval_loader_workers=4, run_eval=True, test_delay_epochs=-1, diff --git a/recipes/ljspeech/wavegrad/train_wavegrad.py b/recipes/ljspeech/wavegrad/train_wavegrad.py index 1abdf45d..be9e0a09 100644 --- a/recipes/ljspeech/wavegrad/train_wavegrad.py +++ b/recipes/ljspeech/wavegrad/train_wavegrad.py @@ -2,7 +2,7 @@ import os from trainer import Trainer, TrainerArgs -from TTS.utils.audio import AudioProcessor +from TTS.utils.audio.processor import AudioProcessor from TTS.vocoder.configs import WavegradConfig from TTS.vocoder.datasets.preprocess import load_wav_data from TTS.vocoder.models.wavegrad import Wavegrad diff --git a/recipes/ljspeech/wavernn/train_wavernn.py b/recipes/ljspeech/wavernn/train_wavernn.py index 640f5092..75be20e2 100644 --- a/recipes/ljspeech/wavernn/train_wavernn.py +++ b/recipes/ljspeech/wavernn/train_wavernn.py @@ -2,7 +2,7 @@ 
import os from trainer import Trainer, TrainerArgs -from TTS.utils.audio import AudioProcessor +from TTS.utils.audio.processor import AudioProcessor from TTS.vocoder.configs import WavernnConfig from TTS.vocoder.datasets.preprocess import load_wav_data from TTS.vocoder.models.wavernn import Wavernn diff --git a/recipes/multilingual/vits_tts/train_vits_tts.py b/recipes/multilingual/vits_tts/train_vits_tts.py index 0e650ade..c4b62166 100644 --- a/recipes/multilingual/vits_tts/train_vits_tts.py +++ b/recipes/multilingual/vits_tts/train_vits_tts.py @@ -11,7 +11,7 @@ from TTS.tts.models.vits import CharactersConfig, Vits, VitsArgs from TTS.tts.utils.languages import LanguageManager from TTS.tts.utils.speakers import SpeakerManager from TTS.tts.utils.text.tokenizer import TTSTokenizer -from TTS.utils.audio import AudioProcessor +from TTS.utils.audio.processor import AudioProcessor output_path = os.path.dirname(os.path.abspath(__file__)) diff --git a/recipes/vctk/fast_pitch/train_fast_pitch.py b/recipes/vctk/fast_pitch/train_fast_pitch.py index c39932da..a1d838f5 100644 --- a/recipes/vctk/fast_pitch/train_fast_pitch.py +++ b/recipes/vctk/fast_pitch/train_fast_pitch.py @@ -8,7 +8,7 @@ from TTS.tts.datasets import load_tts_samples from TTS.tts.models.forward_tts import ForwardTTS from TTS.tts.utils.speakers import SpeakerManager from TTS.tts.utils.text.tokenizer import TTSTokenizer -from TTS.utils.audio import AudioProcessor +from TTS.utils.audio.processor import AudioProcessor output_path = os.path.dirname(os.path.abspath(__file__)) dataset_config = BaseDatasetConfig(name="vctk", meta_file_train="", path=os.path.join(output_path, "../VCTK/")) diff --git a/recipes/vctk/fast_speech/train_fast_speech.py b/recipes/vctk/fast_speech/train_fast_speech.py index a3249de1..3bc83950 100644 --- a/recipes/vctk/fast_speech/train_fast_speech.py +++ b/recipes/vctk/fast_speech/train_fast_speech.py @@ -8,7 +8,7 @@ from TTS.tts.datasets import load_tts_samples from TTS.tts.models.forward_tts import ForwardTTS from TTS.tts.utils.speakers import SpeakerManager from TTS.tts.utils.text.tokenizer import TTSTokenizer -from TTS.utils.audio import AudioProcessor +from TTS.utils.audio.processor import AudioProcessor output_path = os.path.dirname(os.path.abspath(__file__)) dataset_config = BaseDatasetConfig(name="vctk", meta_file_train="", path=os.path.join(output_path, "../VCTK/")) diff --git a/recipes/vctk/glow_tts/train_glow_tts.py b/recipes/vctk/glow_tts/train_glow_tts.py index 23c02efc..f82fca63 100644 --- a/recipes/vctk/glow_tts/train_glow_tts.py +++ b/recipes/vctk/glow_tts/train_glow_tts.py @@ -9,7 +9,7 @@ from TTS.tts.datasets import load_tts_samples from TTS.tts.models.glow_tts import GlowTTS from TTS.tts.utils.speakers import SpeakerManager from TTS.tts.utils.text.tokenizer import TTSTokenizer -from TTS.utils.audio import AudioProcessor +from TTS.utils.audio.processor import AudioProcessor # set experiment paths output_path = os.path.dirname(os.path.abspath(__file__)) diff --git a/recipes/vctk/speedy_speech/train_speedy_speech.py b/recipes/vctk/speedy_speech/train_speedy_speech.py index bcd0105a..b24b2f3b 100644 --- a/recipes/vctk/speedy_speech/train_speedy_speech.py +++ b/recipes/vctk/speedy_speech/train_speedy_speech.py @@ -8,7 +8,7 @@ from TTS.tts.datasets import load_tts_samples from TTS.tts.models.forward_tts import ForwardTTS from TTS.tts.utils.speakers import SpeakerManager from TTS.tts.utils.text.tokenizer import TTSTokenizer -from TTS.utils.audio import AudioProcessor +from TTS.utils.audio.processor import 
AudioProcessor output_path = os.path.dirname(os.path.abspath(__file__)) dataset_config = BaseDatasetConfig(name="vctk", meta_file_train="", path=os.path.join(output_path, "../VCTK/")) diff --git a/recipes/vctk/tacotron-DDC/train_tacotron-DDC.py b/recipes/vctk/tacotron-DDC/train_tacotron-DDC.py index 36e28ed7..efdb150e 100644 --- a/recipes/vctk/tacotron-DDC/train_tacotron-DDC.py +++ b/recipes/vctk/tacotron-DDC/train_tacotron-DDC.py @@ -9,7 +9,7 @@ from TTS.tts.datasets import load_tts_samples from TTS.tts.models.tacotron import Tacotron from TTS.tts.utils.speakers import SpeakerManager from TTS.tts.utils.text.tokenizer import TTSTokenizer -from TTS.utils.audio import AudioProcessor +from TTS.utils.audio.processor import AudioProcessor output_path = os.path.dirname(os.path.abspath(__file__)) dataset_config = BaseDatasetConfig(name="vctk", meta_file_train="", path=os.path.join(output_path, "../VCTK/")) diff --git a/recipes/vctk/tacotron2-DDC/train_tacotron2-ddc.py b/recipes/vctk/tacotron2-DDC/train_tacotron2-ddc.py index d04d91c0..ea175085 100644 --- a/recipes/vctk/tacotron2-DDC/train_tacotron2-ddc.py +++ b/recipes/vctk/tacotron2-DDC/train_tacotron2-ddc.py @@ -9,7 +9,7 @@ from TTS.tts.datasets import load_tts_samples from TTS.tts.models.tacotron2 import Tacotron2 from TTS.tts.utils.speakers import SpeakerManager from TTS.tts.utils.text.tokenizer import TTSTokenizer -from TTS.utils.audio import AudioProcessor +from TTS.utils.audio.processor import AudioProcessor output_path = os.path.dirname(os.path.abspath(__file__)) dataset_config = BaseDatasetConfig(name="vctk", meta_file_train="", path=os.path.join(output_path, "../VCTK/")) diff --git a/recipes/vctk/tacotron2/train_tacotron2.py b/recipes/vctk/tacotron2/train_tacotron2.py index 5a0e157a..76bc25d4 100644 --- a/recipes/vctk/tacotron2/train_tacotron2.py +++ b/recipes/vctk/tacotron2/train_tacotron2.py @@ -9,7 +9,7 @@ from TTS.tts.datasets import load_tts_samples from TTS.tts.models.tacotron2 import Tacotron2 from TTS.tts.utils.speakers import SpeakerManager from TTS.tts.utils.text.tokenizer import TTSTokenizer -from TTS.utils.audio import AudioProcessor +from TTS.utils.audio.processor import AudioProcessor output_path = os.path.dirname(os.path.abspath(__file__)) dataset_config = BaseDatasetConfig(name="vctk", meta_file_train="", path=os.path.join(output_path, "../VCTK/")) diff --git a/recipes/vctk/vits/train_vits.py b/recipes/vctk/vits/train_vits.py index 88fd7de9..9aeb2de7 100644 --- a/recipes/vctk/vits/train_vits.py +++ b/recipes/vctk/vits/train_vits.py @@ -9,7 +9,7 @@ from TTS.tts.datasets import load_tts_samples from TTS.tts.models.vits import Vits, VitsArgs from TTS.tts.utils.speakers import SpeakerManager from TTS.tts.utils.text.tokenizer import TTSTokenizer -from TTS.utils.audio import AudioProcessor +from TTS.utils.audio.processor import AudioProcessor output_path = os.path.dirname(os.path.abspath(__file__)) dataset_config = BaseDatasetConfig(
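
What this refactor means for downstream imports, as a hedged sketch: the new
TTS/utils/audio/__init__.py re-exports AudioProcessor, so older call sites
written as "from TTS.utils.audio import AudioProcessor" keep resolving while
the recipes above migrate to the explicit module path. TorchSTFT is deleted
from processor.py here, and the updated vocoder imports point at
TTS.utils.audio.torch_transforms, a module this patch does not create, so the
last import below assumes it lands in a companion commit.

    # Both paths resolve to the same class once this patch is applied:
    from TTS.utils.audio import AudioProcessor as AP_via_shim      # re-export in the new __init__.py
    from TTS.utils.audio.processor import AudioProcessor as AP_new # canonical post-rename location

    assert AP_via_shim is AP_new  # the shim hands back the same object

    # Expected new home of TorchSTFT per the updated imports in
    # TTS/vocoder/layers/losses.py and univnet_discriminator.py
    # (module assumed to be added outside this patch):
    from TTS.utils.audio.torch_transforms import TorchSTFT

And a minimal usage example mirroring the compute_f0 doctest retained in
processor.py, assuming a standard Coqui TTS checkout with this patch applied:

    import librosa

    from TTS.config import BaseAudioConfig
    from TTS.utils.audio.processor import AudioProcessor

    conf = BaseAudioConfig(pitch_fmax=8000)  # audio settings consumed by the processor
    ap = AudioProcessor(**conf)
    # First five seconds of the librosa example clip at 22.05 kHz:
    wav = ap.load_wav(librosa.util.example_audio_file(), sr=22050)[: 5 * 22050]
    pitch = ap.compute_f0(wav)  # frame-level F0 contour, as in the doctest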