diff --git a/TTS/tts/datasets/dataset.py b/TTS/tts/datasets/dataset.py
index 843cea58..4f8a6e17 100644
--- a/TTS/tts/datasets/dataset.py
+++ b/TTS/tts/datasets/dataset.py
@@ -273,7 +273,13 @@ class TTSDataset(Dataset):
         item = args[0]
-        func_args = args[1]
+        # Work on a copy: the per-item override below must not leak into the
+        # caller's argument list, which may be reused for other items.
+        func_args = list(args[1])
         text, wav_file, *_ = item
-        func_args[3] = item[3]
+        language = item[3]
+        if language:
+            # The sample specifies its own language (neither None nor ""), so
+            # it overrides the default phoneme_language held in slot 3.
+            func_args[3] = language
         phonemes = TTSDataset._load_or_generate_phoneme_sequence(wav_file, text, *func_args)
         return phonemes
 
diff --git a/recipes/vctk/vits/train_vits.py b/recipes/vctk/vits/train_vits.py
index 19074ce3..a7ec2eae 100644
--- a/recipes/vctk/vits/train_vits.py
+++ b/recipes/vctk/vits/train_vits.py
@@ -5,7 +5,7 @@ from TTS.trainer import Trainer, TrainingArgs
 from TTS.tts.configs.shared_configs import BaseDatasetConfig
 from TTS.tts.configs.vits_config import VitsConfig
 from TTS.tts.datasets import load_tts_samples
-from TTS.tts.models.vits import Vits
+from TTS.tts.models.vits import Vits, VitsArgs
 from TTS.tts.utils.speakers import SpeakerManager
 from TTS.utils.audio import AudioProcessor
 
@@ -31,10 +31,16 @@ audio_config = BaseAudioConfig(
     resample=True,
 )
 
+# use_speaker_embedding is passed through VitsArgs (handed to the config as
+# ``model_args``) instead of directly as a VitsConfig keyword.
+vits_args = VitsArgs(
+    use_speaker_embedding=True,
+)
+
 config = VitsConfig(
+    model_args=vits_args,
     audio=audio_config,
     run_name="vits_vctk",
-    use_speaker_embedding=True,
     batch_size=32,
     eval_batch_size=16,
     batch_group_size=5,