From 00a75dde297ca2499de0d4fc9d065574a95a0398 Mon Sep 17 00:00:00 2001 From: Eren Gölge Date: Sat, 18 Jun 2022 12:21:15 +0200 Subject: [PATCH] Update recipes --- recipes/multilingual/vits_tts/train_vits_tts.py | 14 ++------------ recipes/thorsten_DE/vits_tts/train_vits.py | 13 ++----------- 2 files changed, 4 insertions(+), 23 deletions(-) diff --git a/recipes/multilingual/vits_tts/train_vits_tts.py b/recipes/multilingual/vits_tts/train_vits_tts.py index 0e650ade..e559ea6b 100644 --- a/recipes/multilingual/vits_tts/train_vits_tts.py +++ b/recipes/multilingual/vits_tts/train_vits_tts.py @@ -3,11 +3,10 @@ from glob import glob from trainer import Trainer, TrainerArgs -from TTS.config.shared_configs import BaseAudioConfig from TTS.tts.configs.shared_configs import BaseDatasetConfig from TTS.tts.configs.vits_config import VitsConfig from TTS.tts.datasets import load_tts_samples -from TTS.tts.models.vits import CharactersConfig, Vits, VitsArgs +from TTS.tts.models.vits import CharactersConfig, Vits, VitsArgs, VitsAudioConfig from TTS.tts.utils.languages import LanguageManager from TTS.tts.utils.speakers import SpeakerManager from TTS.tts.utils.text.tokenizer import TTSTokenizer @@ -22,22 +21,13 @@ dataset_config = [ for path in dataset_paths ] -audio_config = BaseAudioConfig( +audio_config = VitsAudioConfig( sample_rate=16000, win_length=1024, hop_length=256, num_mels=80, - preemphasis=0.0, - ref_level_db=20, - log_func="np.log", - do_trim_silence=False, - trim_db=23.0, mel_fmin=0, mel_fmax=None, - spec_gain=1.0, - signal_norm=True, - do_amp_to_db_linear=False, - resample=False, ) vitsArgs = VitsArgs( diff --git a/recipes/thorsten_DE/vits_tts/train_vits.py b/recipes/thorsten_DE/vits_tts/train_vits.py index 86a7dfe6..25c57b64 100644 --- a/recipes/thorsten_DE/vits_tts/train_vits.py +++ b/recipes/thorsten_DE/vits_tts/train_vits.py @@ -2,11 +2,10 @@ import os from trainer import Trainer, TrainerArgs -from TTS.config.shared_configs import BaseAudioConfig from 
TTS.tts.configs.shared_configs import BaseDatasetConfig from TTS.tts.configs.vits_config import VitsConfig from TTS.tts.datasets import load_tts_samples -from TTS.tts.models.vits import Vits +from TTS.tts.models.vits import Vits, VitsAudioConfig from TTS.tts.utils.text.tokenizer import TTSTokenizer from TTS.utils.audio import AudioProcessor from TTS.utils.downloaders import download_thorsten_de @@ -21,21 +20,13 @@ if not os.path.exists(dataset_config.path): print("Downloading dataset") download_thorsten_de(os.path.split(os.path.abspath(dataset_config.path))[0]) -audio_config = BaseAudioConfig( +audio_config = VitsAudioConfig( sample_rate=22050, win_length=1024, hop_length=256, num_mels=80, - preemphasis=0.0, - ref_level_db=20, - log_func="np.log", - do_trim_silence=True, - trim_db=45, mel_fmin=0, mel_fmax=None, - spec_gain=1.0, - signal_norm=False, - do_amp_to_db_linear=False, ) config = VitsConfig(