From 942df0fb05ce70cd741d975f9d61bbfcb94e9e54 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Wed, 2 Mar 2022 09:14:32 +0100 Subject: [PATCH] Update vits dataset --- TTS/tts/models/vits.py | 14 ++++++-------- TTS/utils/synthesizer.py | 20 -------------------- 2 files changed, 6 insertions(+), 28 deletions(-) diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py index 04e84c62..036f22f2 100644 --- a/TTS/tts/models/vits.py +++ b/TTS/tts/models/vits.py @@ -196,14 +196,12 @@ class VitsDataset(TTSDataset): def __getitem__(self, idx): item = self.samples[idx] + raw_text = item["text"] - text, wav_file, speaker_name, language_name, _ = _parse_sample(item) - raw_text = text + wav, _ = load_audio(item["audio_file"]) + wav_filename = os.path.basename(item["audio_file"]) - wav, _ = load_audio(wav_file) - wav_filename = os.path.basename(wav_file) - - token_ids = self.get_token_ids(idx, text) + token_ids = self.get_token_ids(idx, item["text"]) # after phonemization the text length may change # this is a shameful 🤭 hack to prevent longer phonemes @@ -218,8 +216,8 @@ class VitsDataset(TTSDataset): "token_len": len(token_ids), "wav": wav, "wav_file": wav_filename, - "speaker_name": speaker_name, - "language_name": language_name, + "speaker_name": item["speaker_name"], + "language_name": item["language"], } @property diff --git a/TTS/utils/synthesizer.py b/TTS/utils/synthesizer.py index 6821e975..d1abc907 100644 --- a/TTS/utils/synthesizer.py +++ b/TTS/utils/synthesizer.py @@ -126,26 +126,6 @@ class Synthesizer(object): self.encoder_checkpoint = self.tts_config.model_args.speaker_encoder_model_path self.encoder_config = self.tts_config.model_args.speaker_encoder_config_path - def _is_use_speaker_embedding(self): - """Check if the speaker embedding is used in the model""" - # we handle here the case that some models use model_args some don't - use_speaker_embedding = False - if hasattr(self.tts_config, "model_args"): - use_speaker_embedding = self.tts_config["model_args"].get("use_speaker_embedding", False) - use_speaker_embedding = use_speaker_embedding or self.tts_config.get("use_speaker_embedding", False) - return use_speaker_embedding - - def _is_use_d_vector_file(self): - """Check if the d-vector file is used in the model""" - # we handle here the case that some models use model_args some don't - use_d_vector_file = False - if hasattr(self.tts_config, "model_args"): - config = self.tts_config.model_args - use_d_vector_file = config.get("use_d_vector_file", False) - config = self.tts_config - use_d_vector_file = use_d_vector_file or config.get("use_d_vector_file", False) - return use_d_vector_file - def _load_vocoder(self, model_file: str, model_config: str, use_cuda: bool) -> None: """Load the vocoder model.