From 251e1c289dd67d919cbee35dfcfd20ab2433dc8e Mon Sep 17 00:00:00 2001
From: Edresson Casanova
Date: Mon, 13 Jun 2022 13:47:31 +0000
Subject: [PATCH] Add support for inference using a specific reference file
 instead of the averaged embeddings

---
 TTS/utils/synthesizer.py                      | 34 +++++++++++++------
 ...est_vits_speaker_emb_with_emotion_train.py |  8 +++++
 2 files changed, 31 insertions(+), 11 deletions(-)

diff --git a/TTS/utils/synthesizer.py b/TTS/utils/synthesizer.py
index e9a79220..716f9fb7 100644
--- a/TTS/utils/synthesizer.py
+++ b/TTS/utils/synthesizer.py
@@ -252,17 +252,24 @@ class Synthesizer(object):
             if speaker_name and isinstance(speaker_name, str):
                 if self.tts_config.use_d_vector_file:
                     # get the average speaker embedding from the saved d_vectors.
-                    speaker_embedding = self.tts_model.speaker_manager.get_mean_embedding(
-                        speaker_name, num_samples=None, randomize=False
-                    )
+                    if speaker_name in self.tts_model.speaker_manager.ids:
+                        speaker_embedding = self.tts_model.speaker_manager.get_mean_embedding(
+                            speaker_name, num_samples=None, randomize=False
+                        )
+                    else:
+                        speaker_embedding = self.tts_model.speaker_manager.embeddings[speaker_name]["embedding"]
+
                     speaker_embedding = np.array(speaker_embedding)[None, :]  # [1 x embedding_dim]
                     if style_speaker_name is not None:
-                        style_speaker_embedding = self.tts_model.speaker_manager.get_mean_embedding(
-                            style_speaker_name, num_samples=None, randomize=False
-                        )
-                        style_speaker_embedding = np.array(style_speaker_embedding)[None, :]  # [1 x embedding_dim]
+                        if style_speaker_name in self.tts_model.speaker_manager.ids:
+                            style_speaker_embedding = self.tts_model.speaker_manager.get_mean_embedding(
+                                style_speaker_name, num_samples=None, randomize=False
+                            )
+                        else:
+                            style_speaker_embedding = self.tts_model.speaker_manager.embeddings[style_speaker_name]["embedding"]
+                        style_speaker_embedding = np.array(style_speaker_embedding)[None, :]  # [1 x embedding_dim]
                 else:
                     # get speaker idx from the speaker name
                     speaker_id = self.tts_model.speaker_manager.ids[speaker_name]
@@ -322,11 +329,16 @@ class Synthesizer(object):
                 getattr(self.tts_config, "model_args", None)
                 and getattr(self.tts_config.model_args, "use_external_emotions_embeddings", False)
             ):
-                # get the average speaker embedding from the saved embeddings.
-                emotion_embedding = self.tts_model.emotion_manager.get_mean_embedding(
-                    emotion_name, num_samples=None, randomize=False
-                )
+                if emotion_name in self.tts_model.emotion_manager.ids:
+                    # get the average emotion embedding from the saved embeddings.
+                    emotion_embedding = self.tts_model.emotion_manager.get_mean_embedding(
+                        emotion_name, num_samples=None, randomize=False
+                    )
+                else:
+                    emotion_embedding = self.tts_model.emotion_manager.embeddings[emotion_name]["embedding"]
+
                 emotion_embedding = np.array(emotion_embedding)[None, :]  # [1 x embedding_dim]
+
             else:
                 # get speaker idx from the speaker name
                 emotion_id = self.tts_model.emotion_manager.ids[emotion_name]
diff --git a/tests/tts_tests/test_vits_speaker_emb_with_emotion_train.py b/tests/tts_tests/test_vits_speaker_emb_with_emotion_train.py
index bef67ee5..5f2bd4a9 100644
--- a/tests/tts_tests/test_vits_speaker_emb_with_emotion_train.py
+++ b/tests/tts_tests/test_vits_speaker_emb_with_emotion_train.py
@@ -27,6 +27,7 @@ config = VitsConfig(
     print_eval=True,
     test_sentences=[
         ["Be a voice, not an echo.", "ljspeech-1", None, None, "ljspeech-1"],
+        ["Be a voice, not an echo.", "LJ001-0001.wav", None, None, "LJ001-0002.wav"],
     ],
 )
 # set audio config
@@ -83,6 +84,13 @@ continue_emotion_path = os.path.join(continue_path, "emotions.json")
 
 inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --speaker_idx {speaker_id} --emotion_idx {emotion_id} --speakers_file_path {continue_speakers_path} --emotions_file_path {continue_emotion_path} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}"
 run_cli(inference_command)
+# inference with a specific reference embedding instead of the average
+speaker_id = "LJ001-0001.wav"
+emotion_id = "LJ001-0002.wav"
+inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --speaker_idx {speaker_id} --emotion_idx {emotion_id} --speakers_file_path {continue_speakers_path} --emotions_file_path {continue_emotion_path} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}"
+run_cli(inference_command)
+
+
 # restore the model and continue training for one more epoch
 command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} "
 run_cli(command_train)
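
Note for reviewers: the two synthesizer.py hunks apply the same lookup rule three times
(speaker, style speaker, emotion). Below is a minimal standalone sketch of that rule;
resolve_embedding is a hypothetical helper, not part of the patch. It assumes the manager
layout the patch itself relies on: `ids` holds the known speaker/emotion names, and
`embeddings` maps per-utterance reference keys (e.g. "LJ001-0001.wav") to
{"name": ..., "embedding": [...]} entries, as in the saved d-vector files.

    import numpy as np

    def resolve_embedding(manager, name):
        """Return a [1 x embedding_dim] embedding for `name`.

        `name` can be a speaker/emotion name (yields the averaged embedding)
        or a reference-file key (yields that single file's embedding).
        """
        if name in manager.ids:
            # known name -> average of all embeddings saved for it
            embedding = manager.get_mean_embedding(name, num_samples=None, randomize=False)
        else:
            # unknown name -> treat it as a specific reference-file key
            embedding = manager.embeddings[name]["embedding"]
        return np.array(embedding)[None, :]

With this change, the tts CLI flags --speaker_idx and --emotion_idx also accept a
reference-file key (e.g. LJ001-0001.wav) in place of a name, as exercised by the new
inference commands in the test above.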